{ "best_global_step": 14500, "best_metric": 1.3074105978012085, "best_model_checkpoint": "/scratch/ssinha78/main-project/d1/autoregressive/SFT/llama3-s1/checkpoint-14500", "epoch": 7.680084745762712, "eval_steps": 250, "global_step": 29000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005296610169491525, "grad_norm": 2.0074269771575928, "learning_rate": 9.999867584745763e-06, "loss": 2.5778, "mean_token_accuracy": 0.5148812420666218, "num_tokens": 2016.0, "step": 2 }, { "epoch": 0.001059322033898305, "grad_norm": 2.803886890411377, "learning_rate": 9.999602754237288e-06, "loss": 2.2489, "mean_token_accuracy": 0.5560990571975708, "num_tokens": 3513.0, "step": 4 }, { "epoch": 0.0015889830508474577, "grad_norm": 2.387035369873047, "learning_rate": 9.999337923728815e-06, "loss": 2.4036, "mean_token_accuracy": 0.5258920826017857, "num_tokens": 5075.0, "step": 6 }, { "epoch": 0.00211864406779661, "grad_norm": 1.5948877334594727, "learning_rate": 9.99907309322034e-06, "loss": 2.0485, "mean_token_accuracy": 0.5751121900975704, "num_tokens": 6881.0, "step": 8 }, { "epoch": 0.0026483050847457626, "grad_norm": 1.9232244491577148, "learning_rate": 9.998808262711865e-06, "loss": 2.4208, "mean_token_accuracy": 0.532907884567976, "num_tokens": 8521.0, "step": 10 }, { "epoch": 0.0031779661016949155, "grad_norm": 1.6791532039642334, "learning_rate": 9.99854343220339e-06, "loss": 2.2036, "mean_token_accuracy": 0.5470703430473804, "num_tokens": 10463.0, "step": 12 }, { "epoch": 0.003707627118644068, "grad_norm": 1.746849536895752, "learning_rate": 9.998278601694916e-06, "loss": 2.4146, "mean_token_accuracy": 0.5274148397147655, "num_tokens": 12272.0, "step": 14 }, { "epoch": 0.00423728813559322, "grad_norm": 1.6440602540969849, "learning_rate": 9.998013771186441e-06, "loss": 2.5654, "mean_token_accuracy": 0.4922427833080292, "num_tokens": 13675.0, "step": 16 }, { "epoch": 0.004766949152542373, "grad_norm": 1.9841537475585938, "learning_rate": 9.997748940677968e-06, "loss": 2.1978, "mean_token_accuracy": 0.5591752119362354, "num_tokens": 15084.0, "step": 18 }, { "epoch": 0.005296610169491525, "grad_norm": 1.6695791482925415, "learning_rate": 9.997484110169493e-06, "loss": 1.9251, "mean_token_accuracy": 0.5773647427558899, "num_tokens": 16778.0, "step": 20 }, { "epoch": 0.005826271186440678, "grad_norm": 1.6566264629364014, "learning_rate": 9.997219279661018e-06, "loss": 2.3774, "mean_token_accuracy": 0.5127240605652332, "num_tokens": 18322.0, "step": 22 }, { "epoch": 0.006355932203389831, "grad_norm": 1.439497470855713, "learning_rate": 9.996954449152543e-06, "loss": 1.7437, "mean_token_accuracy": 0.6422243230044842, "num_tokens": 19843.0, "step": 24 }, { "epoch": 0.006885593220338983, "grad_norm": 1.6708091497421265, "learning_rate": 9.99668961864407e-06, "loss": 1.9904, "mean_token_accuracy": 0.5787142068147659, "num_tokens": 21484.0, "step": 26 }, { "epoch": 0.007415254237288136, "grad_norm": 2.3018457889556885, "learning_rate": 9.996424788135594e-06, "loss": 1.8614, "mean_token_accuracy": 0.5867900773882866, "num_tokens": 22961.0, "step": 28 }, { "epoch": 0.007944915254237288, "grad_norm": 1.7954425811767578, "learning_rate": 9.996159957627119e-06, "loss": 1.9551, "mean_token_accuracy": 0.5606272891163826, "num_tokens": 24764.0, "step": 30 }, { "epoch": 0.00847457627118644, "grad_norm": 2.116990566253662, "learning_rate": 9.995895127118644e-06, "loss": 1.9272, "mean_token_accuracy": 0.595637708902359, "num_tokens": 26115.0, "step": 32 }, { "epoch": 0.009004237288135594, "grad_norm": 1.6797536611557007, "learning_rate": 9.99563029661017e-06, "loss": 1.9309, "mean_token_accuracy": 0.5715734362602234, "num_tokens": 27927.0, "step": 34 }, { "epoch": 0.009533898305084746, "grad_norm": 2.1797525882720947, "learning_rate": 9.995365466101696e-06, "loss": 1.745, "mean_token_accuracy": 0.6208231039345264, "num_tokens": 29335.0, "step": 36 }, { "epoch": 0.010063559322033898, "grad_norm": 1.928411602973938, "learning_rate": 9.99510063559322e-06, "loss": 1.5686, "mean_token_accuracy": 0.6453973352909088, "num_tokens": 31019.0, "step": 38 }, { "epoch": 0.01059322033898305, "grad_norm": 2.1614277362823486, "learning_rate": 9.994835805084745e-06, "loss": 2.3535, "mean_token_accuracy": 0.5284794270992279, "num_tokens": 32630.0, "step": 40 }, { "epoch": 0.011122881355932203, "grad_norm": 1.8109742403030396, "learning_rate": 9.994570974576272e-06, "loss": 1.8076, "mean_token_accuracy": 0.60367601364851, "num_tokens": 34269.0, "step": 42 }, { "epoch": 0.011652542372881356, "grad_norm": 2.281477928161621, "learning_rate": 9.994306144067799e-06, "loss": 1.7657, "mean_token_accuracy": 0.637152936309576, "num_tokens": 35814.0, "step": 44 }, { "epoch": 0.012182203389830509, "grad_norm": 1.9303622245788574, "learning_rate": 9.994041313559324e-06, "loss": 1.5669, "mean_token_accuracy": 0.6492938734591007, "num_tokens": 37319.0, "step": 46 }, { "epoch": 0.012711864406779662, "grad_norm": 2.6924281120300293, "learning_rate": 9.993776483050849e-06, "loss": 1.4818, "mean_token_accuracy": 0.6591616421937943, "num_tokens": 38923.0, "step": 48 }, { "epoch": 0.013241525423728813, "grad_norm": 1.9540587663650513, "learning_rate": 9.993511652542373e-06, "loss": 1.501, "mean_token_accuracy": 0.6831130683422089, "num_tokens": 40349.0, "step": 50 }, { "epoch": 0.013771186440677966, "grad_norm": 1.9299689531326294, "learning_rate": 9.9932468220339e-06, "loss": 1.5303, "mean_token_accuracy": 0.660371907055378, "num_tokens": 41924.0, "step": 52 }, { "epoch": 0.014300847457627119, "grad_norm": 1.3961211442947388, "learning_rate": 9.992981991525425e-06, "loss": 1.8573, "mean_token_accuracy": 0.6194165535271168, "num_tokens": 43515.0, "step": 54 }, { "epoch": 0.014830508474576272, "grad_norm": 1.3209558725357056, "learning_rate": 9.99271716101695e-06, "loss": 1.3236, "mean_token_accuracy": 0.69769736379385, "num_tokens": 45140.0, "step": 56 }, { "epoch": 0.015360169491525424, "grad_norm": 1.4309337139129639, "learning_rate": 9.992452330508475e-06, "loss": 1.3891, "mean_token_accuracy": 0.6617655456066132, "num_tokens": 46802.0, "step": 58 }, { "epoch": 0.015889830508474576, "grad_norm": 1.7232557535171509, "learning_rate": 9.992187500000001e-06, "loss": 1.7789, "mean_token_accuracy": 0.6100467666983604, "num_tokens": 48620.0, "step": 60 }, { "epoch": 0.01641949152542373, "grad_norm": 1.6640840768814087, "learning_rate": 9.991922669491526e-06, "loss": 1.5614, "mean_token_accuracy": 0.647768460214138, "num_tokens": 50484.0, "step": 62 }, { "epoch": 0.01694915254237288, "grad_norm": 1.6505564451217651, "learning_rate": 9.991657838983051e-06, "loss": 1.8731, "mean_token_accuracy": 0.603739257901907, "num_tokens": 52290.0, "step": 64 }, { "epoch": 0.017478813559322032, "grad_norm": 1.5151312351226807, "learning_rate": 9.991393008474576e-06, "loss": 1.4127, "mean_token_accuracy": 0.6685617566108704, "num_tokens": 54043.0, "step": 66 }, { "epoch": 0.018008474576271187, "grad_norm": 1.716017723083496, "learning_rate": 9.991128177966103e-06, "loss": 1.9608, "mean_token_accuracy": 0.5847997218370438, "num_tokens": 55798.0, "step": 68 }, { "epoch": 0.018538135593220338, "grad_norm": 1.5899641513824463, "learning_rate": 9.990863347457628e-06, "loss": 1.8321, "mean_token_accuracy": 0.6245149597525597, "num_tokens": 57546.0, "step": 70 }, { "epoch": 0.019067796610169493, "grad_norm": 1.4649622440338135, "learning_rate": 9.990598516949154e-06, "loss": 1.4426, "mean_token_accuracy": 0.6849891804158688, "num_tokens": 59206.0, "step": 72 }, { "epoch": 0.019597457627118644, "grad_norm": 1.9354323148727417, "learning_rate": 9.99033368644068e-06, "loss": 0.9989, "mean_token_accuracy": 0.7538513541221619, "num_tokens": 60843.0, "step": 74 }, { "epoch": 0.020127118644067795, "grad_norm": 2.0622782707214355, "learning_rate": 9.990068855932204e-06, "loss": 1.5861, "mean_token_accuracy": 0.658365286886692, "num_tokens": 62418.0, "step": 76 }, { "epoch": 0.02065677966101695, "grad_norm": 1.6357537508010864, "learning_rate": 9.98980402542373e-06, "loss": 1.8538, "mean_token_accuracy": 0.6181722730398178, "num_tokens": 64178.0, "step": 78 }, { "epoch": 0.0211864406779661, "grad_norm": 1.5546207427978516, "learning_rate": 9.989539194915256e-06, "loss": 1.3998, "mean_token_accuracy": 0.7120168618857861, "num_tokens": 65867.0, "step": 80 }, { "epoch": 0.021716101694915255, "grad_norm": 1.9792648553848267, "learning_rate": 9.98927436440678e-06, "loss": 1.3798, "mean_token_accuracy": 0.7170016840100288, "num_tokens": 67161.0, "step": 82 }, { "epoch": 0.022245762711864406, "grad_norm": 1.89841628074646, "learning_rate": 9.989009533898306e-06, "loss": 1.7647, "mean_token_accuracy": 0.6139678806066513, "num_tokens": 68891.0, "step": 84 }, { "epoch": 0.022775423728813558, "grad_norm": 2.0618557929992676, "learning_rate": 9.98874470338983e-06, "loss": 1.641, "mean_token_accuracy": 0.6519770957529545, "num_tokens": 70477.0, "step": 86 }, { "epoch": 0.023305084745762712, "grad_norm": 1.4825453758239746, "learning_rate": 9.988479872881357e-06, "loss": 1.3822, "mean_token_accuracy": 0.7096373066306114, "num_tokens": 72222.0, "step": 88 }, { "epoch": 0.023834745762711863, "grad_norm": 1.5068010091781616, "learning_rate": 9.988215042372882e-06, "loss": 1.2466, "mean_token_accuracy": 0.6801020726561546, "num_tokens": 74928.0, "step": 90 }, { "epoch": 0.024364406779661018, "grad_norm": 1.70755934715271, "learning_rate": 9.987950211864407e-06, "loss": 1.2527, "mean_token_accuracy": 0.7198696807026863, "num_tokens": 76345.0, "step": 92 }, { "epoch": 0.02489406779661017, "grad_norm": 1.6905447244644165, "learning_rate": 9.987685381355932e-06, "loss": 1.2615, "mean_token_accuracy": 0.6893582083284855, "num_tokens": 78003.0, "step": 94 }, { "epoch": 0.025423728813559324, "grad_norm": 2.3287241458892822, "learning_rate": 9.987420550847459e-06, "loss": 1.5748, "mean_token_accuracy": 0.6659586876630783, "num_tokens": 79446.0, "step": 96 }, { "epoch": 0.025953389830508475, "grad_norm": 1.612897515296936, "learning_rate": 9.987155720338984e-06, "loss": 1.655, "mean_token_accuracy": 0.6401602812111378, "num_tokens": 81260.0, "step": 98 }, { "epoch": 0.026483050847457626, "grad_norm": 1.579565167427063, "learning_rate": 9.98689088983051e-06, "loss": 1.355, "mean_token_accuracy": 0.691826194524765, "num_tokens": 82827.0, "step": 100 }, { "epoch": 0.02701271186440678, "grad_norm": 1.6937757730484009, "learning_rate": 9.986626059322035e-06, "loss": 1.631, "mean_token_accuracy": 0.6271424032747746, "num_tokens": 84936.0, "step": 102 }, { "epoch": 0.02754237288135593, "grad_norm": 1.932204246520996, "learning_rate": 9.98636122881356e-06, "loss": 1.3769, "mean_token_accuracy": 0.6845236644148827, "num_tokens": 86319.0, "step": 104 }, { "epoch": 0.028072033898305086, "grad_norm": 1.9173791408538818, "learning_rate": 9.986096398305085e-06, "loss": 1.664, "mean_token_accuracy": 0.6311646103858948, "num_tokens": 88148.0, "step": 106 }, { "epoch": 0.028601694915254237, "grad_norm": 1.6837165355682373, "learning_rate": 9.985831567796612e-06, "loss": 1.8747, "mean_token_accuracy": 0.5986073464155197, "num_tokens": 89905.0, "step": 108 }, { "epoch": 0.02913135593220339, "grad_norm": 1.6575011014938354, "learning_rate": 9.985566737288137e-06, "loss": 1.38, "mean_token_accuracy": 0.6999502666294575, "num_tokens": 91647.0, "step": 110 }, { "epoch": 0.029661016949152543, "grad_norm": 2.1627087593078613, "learning_rate": 9.985301906779661e-06, "loss": 1.7283, "mean_token_accuracy": 0.6500340811908245, "num_tokens": 93221.0, "step": 112 }, { "epoch": 0.030190677966101694, "grad_norm": 1.752878189086914, "learning_rate": 9.985037076271186e-06, "loss": 1.3568, "mean_token_accuracy": 0.6967433169484138, "num_tokens": 94897.0, "step": 114 }, { "epoch": 0.03072033898305085, "grad_norm": 2.0348403453826904, "learning_rate": 9.984772245762713e-06, "loss": 1.6569, "mean_token_accuracy": 0.6239815279841423, "num_tokens": 96463.0, "step": 116 }, { "epoch": 0.03125, "grad_norm": 1.975300908088684, "learning_rate": 9.984507415254238e-06, "loss": 1.6356, "mean_token_accuracy": 0.6796421371400356, "num_tokens": 98026.0, "step": 118 }, { "epoch": 0.03177966101694915, "grad_norm": 1.804592490196228, "learning_rate": 9.984242584745763e-06, "loss": 1.5938, "mean_token_accuracy": 0.6545914337038994, "num_tokens": 99488.0, "step": 120 }, { "epoch": 0.0323093220338983, "grad_norm": 1.516128420829773, "learning_rate": 9.983977754237288e-06, "loss": 1.2764, "mean_token_accuracy": 0.7058827057480812, "num_tokens": 100944.0, "step": 122 }, { "epoch": 0.03283898305084746, "grad_norm": 1.5259764194488525, "learning_rate": 9.983712923728814e-06, "loss": 1.4423, "mean_token_accuracy": 0.6756614297628403, "num_tokens": 102377.0, "step": 124 }, { "epoch": 0.03336864406779661, "grad_norm": 2.0997393131256104, "learning_rate": 9.98344809322034e-06, "loss": 1.8599, "mean_token_accuracy": 0.6083253063261509, "num_tokens": 103867.0, "step": 126 }, { "epoch": 0.03389830508474576, "grad_norm": 1.8615548610687256, "learning_rate": 9.983183262711866e-06, "loss": 1.3159, "mean_token_accuracy": 0.6905862540006638, "num_tokens": 105652.0, "step": 128 }, { "epoch": 0.034427966101694914, "grad_norm": 2.02128005027771, "learning_rate": 9.982918432203391e-06, "loss": 1.26, "mean_token_accuracy": 0.7188637405633926, "num_tokens": 107159.0, "step": 130 }, { "epoch": 0.034957627118644065, "grad_norm": 1.3415277004241943, "learning_rate": 9.982653601694916e-06, "loss": 1.4257, "mean_token_accuracy": 0.662492610514164, "num_tokens": 108968.0, "step": 132 }, { "epoch": 0.03548728813559322, "grad_norm": 1.9440228939056396, "learning_rate": 9.982388771186442e-06, "loss": 1.2315, "mean_token_accuracy": 0.7342100068926811, "num_tokens": 110459.0, "step": 134 }, { "epoch": 0.036016949152542374, "grad_norm": 2.4781737327575684, "learning_rate": 9.982123940677967e-06, "loss": 1.52, "mean_token_accuracy": 0.6620160304009914, "num_tokens": 112245.0, "step": 136 }, { "epoch": 0.036546610169491525, "grad_norm": 1.6463896036148071, "learning_rate": 9.981859110169492e-06, "loss": 1.2165, "mean_token_accuracy": 0.7013404369354248, "num_tokens": 114212.0, "step": 138 }, { "epoch": 0.037076271186440676, "grad_norm": 1.6842187643051147, "learning_rate": 9.981594279661017e-06, "loss": 1.8832, "mean_token_accuracy": 0.6112418808043003, "num_tokens": 115783.0, "step": 140 }, { "epoch": 0.03760593220338983, "grad_norm": 1.7255690097808838, "learning_rate": 9.981329449152544e-06, "loss": 1.2671, "mean_token_accuracy": 0.6964607238769531, "num_tokens": 117263.0, "step": 142 }, { "epoch": 0.038135593220338986, "grad_norm": 2.1103198528289795, "learning_rate": 9.981064618644069e-06, "loss": 1.713, "mean_token_accuracy": 0.6251596957445145, "num_tokens": 118578.0, "step": 144 }, { "epoch": 0.03866525423728814, "grad_norm": 1.5897526741027832, "learning_rate": 9.980799788135594e-06, "loss": 1.5948, "mean_token_accuracy": 0.662494495511055, "num_tokens": 119899.0, "step": 146 }, { "epoch": 0.03919491525423729, "grad_norm": 1.8615723848342896, "learning_rate": 9.980534957627119e-06, "loss": 1.2334, "mean_token_accuracy": 0.6989872083067894, "num_tokens": 121520.0, "step": 148 }, { "epoch": 0.03972457627118644, "grad_norm": 1.968605637550354, "learning_rate": 9.980270127118645e-06, "loss": 1.4623, "mean_token_accuracy": 0.6868574768304825, "num_tokens": 123062.0, "step": 150 }, { "epoch": 0.04025423728813559, "grad_norm": 1.6275725364685059, "learning_rate": 9.98000529661017e-06, "loss": 1.2453, "mean_token_accuracy": 0.7159281857311726, "num_tokens": 124515.0, "step": 152 }, { "epoch": 0.04078389830508475, "grad_norm": 1.2306444644927979, "learning_rate": 9.979740466101697e-06, "loss": 1.0456, "mean_token_accuracy": 0.7413577064871788, "num_tokens": 126402.0, "step": 154 }, { "epoch": 0.0413135593220339, "grad_norm": 1.3400633335113525, "learning_rate": 9.979475635593222e-06, "loss": 0.8524, "mean_token_accuracy": 0.7919603139162064, "num_tokens": 127577.0, "step": 156 }, { "epoch": 0.04184322033898305, "grad_norm": 2.635664463043213, "learning_rate": 9.979210805084747e-06, "loss": 1.9276, "mean_token_accuracy": 0.5765147171914577, "num_tokens": 128879.0, "step": 158 }, { "epoch": 0.0423728813559322, "grad_norm": 1.6580435037612915, "learning_rate": 9.978945974576272e-06, "loss": 1.2737, "mean_token_accuracy": 0.7061606273055077, "num_tokens": 130454.0, "step": 160 }, { "epoch": 0.04290254237288135, "grad_norm": 1.6183724403381348, "learning_rate": 9.978681144067798e-06, "loss": 1.7136, "mean_token_accuracy": 0.6379189640283585, "num_tokens": 132242.0, "step": 162 }, { "epoch": 0.04343220338983051, "grad_norm": 1.7253040075302124, "learning_rate": 9.978416313559323e-06, "loss": 1.6008, "mean_token_accuracy": 0.6675308421254158, "num_tokens": 133903.0, "step": 164 }, { "epoch": 0.04396186440677966, "grad_norm": 1.5342806577682495, "learning_rate": 9.978151483050848e-06, "loss": 1.312, "mean_token_accuracy": 0.6900443956255913, "num_tokens": 135514.0, "step": 166 }, { "epoch": 0.04449152542372881, "grad_norm": 1.7323129177093506, "learning_rate": 9.977886652542373e-06, "loss": 1.3628, "mean_token_accuracy": 0.6876569613814354, "num_tokens": 136964.0, "step": 168 }, { "epoch": 0.045021186440677964, "grad_norm": 1.418041706085205, "learning_rate": 9.9776218220339e-06, "loss": 1.5755, "mean_token_accuracy": 0.6570128872990608, "num_tokens": 138708.0, "step": 170 }, { "epoch": 0.045550847457627115, "grad_norm": 2.084711790084839, "learning_rate": 9.977356991525425e-06, "loss": 1.437, "mean_token_accuracy": 0.6878075376152992, "num_tokens": 140267.0, "step": 172 }, { "epoch": 0.04608050847457627, "grad_norm": 1.7671602964401245, "learning_rate": 9.97709216101695e-06, "loss": 1.4622, "mean_token_accuracy": 0.6884222105145454, "num_tokens": 141739.0, "step": 174 }, { "epoch": 0.046610169491525424, "grad_norm": 1.6979678869247437, "learning_rate": 9.976827330508474e-06, "loss": 1.0786, "mean_token_accuracy": 0.7428569421172142, "num_tokens": 143320.0, "step": 176 }, { "epoch": 0.047139830508474576, "grad_norm": 1.7950983047485352, "learning_rate": 9.976562500000001e-06, "loss": 0.9603, "mean_token_accuracy": 0.7623608931899071, "num_tokens": 144626.0, "step": 178 }, { "epoch": 0.04766949152542373, "grad_norm": 2.234283447265625, "learning_rate": 9.976297669491526e-06, "loss": 1.2141, "mean_token_accuracy": 0.7091927900910378, "num_tokens": 146361.0, "step": 180 }, { "epoch": 0.048199152542372885, "grad_norm": 1.643517255783081, "learning_rate": 9.976032838983053e-06, "loss": 1.5675, "mean_token_accuracy": 0.6401951536536217, "num_tokens": 147893.0, "step": 182 }, { "epoch": 0.048728813559322036, "grad_norm": 1.6162828207015991, "learning_rate": 9.975768008474576e-06, "loss": 1.3296, "mean_token_accuracy": 0.7008878514170647, "num_tokens": 149080.0, "step": 184 }, { "epoch": 0.04925847457627119, "grad_norm": 2.740443229675293, "learning_rate": 9.975503177966103e-06, "loss": 1.7908, "mean_token_accuracy": 0.6103722229599953, "num_tokens": 150412.0, "step": 186 }, { "epoch": 0.04978813559322034, "grad_norm": 2.3142967224121094, "learning_rate": 9.975238347457627e-06, "loss": 1.2255, "mean_token_accuracy": 0.7128494828939438, "num_tokens": 152074.0, "step": 188 }, { "epoch": 0.05031779661016949, "grad_norm": 1.7109137773513794, "learning_rate": 9.974973516949154e-06, "loss": 1.5924, "mean_token_accuracy": 0.6365768387913704, "num_tokens": 153325.0, "step": 190 }, { "epoch": 0.05084745762711865, "grad_norm": 2.224001407623291, "learning_rate": 9.974708686440679e-06, "loss": 1.4079, "mean_token_accuracy": 0.7031778767704964, "num_tokens": 154855.0, "step": 192 }, { "epoch": 0.0513771186440678, "grad_norm": 1.8044685125350952, "learning_rate": 9.974443855932204e-06, "loss": 1.3073, "mean_token_accuracy": 0.6957237757742405, "num_tokens": 156639.0, "step": 194 }, { "epoch": 0.05190677966101695, "grad_norm": 1.5959546566009521, "learning_rate": 9.974179025423729e-06, "loss": 1.4653, "mean_token_accuracy": 0.6671195030212402, "num_tokens": 158203.0, "step": 196 }, { "epoch": 0.0524364406779661, "grad_norm": 1.55582594871521, "learning_rate": 9.973914194915255e-06, "loss": 1.2251, "mean_token_accuracy": 0.714422769844532, "num_tokens": 160050.0, "step": 198 }, { "epoch": 0.05296610169491525, "grad_norm": 1.8497540950775146, "learning_rate": 9.97364936440678e-06, "loss": 1.822, "mean_token_accuracy": 0.6098014302551746, "num_tokens": 161910.0, "step": 200 }, { "epoch": 0.05349576271186441, "grad_norm": 2.0975632667541504, "learning_rate": 9.973384533898305e-06, "loss": 2.0466, "mean_token_accuracy": 0.5700209848582745, "num_tokens": 163559.0, "step": 202 }, { "epoch": 0.05402542372881356, "grad_norm": 1.5004866123199463, "learning_rate": 9.97311970338983e-06, "loss": 1.4883, "mean_token_accuracy": 0.6708476692438126, "num_tokens": 165002.0, "step": 204 }, { "epoch": 0.05455508474576271, "grad_norm": 1.6115100383758545, "learning_rate": 9.972854872881357e-06, "loss": 1.2763, "mean_token_accuracy": 0.7041986659169197, "num_tokens": 166532.0, "step": 206 }, { "epoch": 0.05508474576271186, "grad_norm": 1.9508548974990845, "learning_rate": 9.972590042372882e-06, "loss": 1.6957, "mean_token_accuracy": 0.6377431564033031, "num_tokens": 167923.0, "step": 208 }, { "epoch": 0.055614406779661014, "grad_norm": 2.0338315963745117, "learning_rate": 9.972325211864408e-06, "loss": 1.6715, "mean_token_accuracy": 0.6593330129981041, "num_tokens": 169544.0, "step": 210 }, { "epoch": 0.05614406779661017, "grad_norm": 2.0567121505737305, "learning_rate": 9.972060381355933e-06, "loss": 1.2859, "mean_token_accuracy": 0.6924242675304413, "num_tokens": 170988.0, "step": 212 }, { "epoch": 0.056673728813559324, "grad_norm": 1.7616095542907715, "learning_rate": 9.971795550847458e-06, "loss": 1.4765, "mean_token_accuracy": 0.6772053837776184, "num_tokens": 172743.0, "step": 214 }, { "epoch": 0.057203389830508475, "grad_norm": 1.7302945852279663, "learning_rate": 9.971530720338985e-06, "loss": 1.1318, "mean_token_accuracy": 0.7219294235110283, "num_tokens": 174272.0, "step": 216 }, { "epoch": 0.057733050847457626, "grad_norm": 1.822998285293579, "learning_rate": 9.97126588983051e-06, "loss": 1.7061, "mean_token_accuracy": 0.6483809761703014, "num_tokens": 175621.0, "step": 218 }, { "epoch": 0.05826271186440678, "grad_norm": 1.49419105052948, "learning_rate": 9.971001059322035e-06, "loss": 1.4924, "mean_token_accuracy": 0.6688279062509537, "num_tokens": 177150.0, "step": 220 }, { "epoch": 0.058792372881355935, "grad_norm": 1.8823741674423218, "learning_rate": 9.97073622881356e-06, "loss": 1.4018, "mean_token_accuracy": 0.6944353133440018, "num_tokens": 178799.0, "step": 222 }, { "epoch": 0.059322033898305086, "grad_norm": 2.0783584117889404, "learning_rate": 9.970471398305086e-06, "loss": 1.2966, "mean_token_accuracy": 0.6987222284078598, "num_tokens": 180501.0, "step": 224 }, { "epoch": 0.05985169491525424, "grad_norm": 1.7885565757751465, "learning_rate": 9.970206567796611e-06, "loss": 2.0561, "mean_token_accuracy": 0.571915328502655, "num_tokens": 182226.0, "step": 226 }, { "epoch": 0.06038135593220339, "grad_norm": 1.4179558753967285, "learning_rate": 9.969941737288136e-06, "loss": 1.3309, "mean_token_accuracy": 0.6892136484384537, "num_tokens": 183970.0, "step": 228 }, { "epoch": 0.06091101694915254, "grad_norm": 1.745453119277954, "learning_rate": 9.969676906779661e-06, "loss": 0.8392, "mean_token_accuracy": 0.7973359599709511, "num_tokens": 185493.0, "step": 230 }, { "epoch": 0.0614406779661017, "grad_norm": 1.8389214277267456, "learning_rate": 9.969412076271188e-06, "loss": 1.4708, "mean_token_accuracy": 0.6636429950594902, "num_tokens": 187290.0, "step": 232 }, { "epoch": 0.06197033898305085, "grad_norm": 1.5276554822921753, "learning_rate": 9.969147245762713e-06, "loss": 1.5403, "mean_token_accuracy": 0.6697374954819679, "num_tokens": 189038.0, "step": 234 }, { "epoch": 0.0625, "grad_norm": 1.8439043760299683, "learning_rate": 9.96888241525424e-06, "loss": 1.4726, "mean_token_accuracy": 0.6785898357629776, "num_tokens": 190515.0, "step": 236 }, { "epoch": 0.06302966101694915, "grad_norm": 1.9146534204483032, "learning_rate": 9.968617584745763e-06, "loss": 1.6812, "mean_token_accuracy": 0.6800332441926003, "num_tokens": 192034.0, "step": 238 }, { "epoch": 0.0635593220338983, "grad_norm": 1.4977613687515259, "learning_rate": 9.968352754237289e-06, "loss": 1.261, "mean_token_accuracy": 0.7138035148382187, "num_tokens": 193519.0, "step": 240 }, { "epoch": 0.06408898305084745, "grad_norm": 1.6710529327392578, "learning_rate": 9.968087923728814e-06, "loss": 1.2875, "mean_token_accuracy": 0.6961909905076027, "num_tokens": 195246.0, "step": 242 }, { "epoch": 0.0646186440677966, "grad_norm": 1.6820735931396484, "learning_rate": 9.96782309322034e-06, "loss": 1.4073, "mean_token_accuracy": 0.664181686937809, "num_tokens": 196762.0, "step": 244 }, { "epoch": 0.06514830508474577, "grad_norm": 1.6623212099075317, "learning_rate": 9.967558262711866e-06, "loss": 1.4374, "mean_token_accuracy": 0.6835715174674988, "num_tokens": 198219.0, "step": 246 }, { "epoch": 0.06567796610169492, "grad_norm": 1.8704837560653687, "learning_rate": 9.96729343220339e-06, "loss": 1.2135, "mean_token_accuracy": 0.7195655964314938, "num_tokens": 199770.0, "step": 248 }, { "epoch": 0.06620762711864407, "grad_norm": 1.727866530418396, "learning_rate": 9.967028601694915e-06, "loss": 1.4653, "step": 250 }, { "epoch": 0.06620762711864407, "eval_loss": 1.4172513484954834, "eval_mean_token_accuracy": 0.6816647860717464, "eval_num_tokens": 201083.0, "eval_runtime": 48.3378, "eval_samples_per_second": 6.372, "eval_steps_per_second": 6.372, "step": 250 }, { "epoch": 0.06673728813559322, "grad_norm": 1.5990207195281982, "learning_rate": 9.966763771186442e-06, "loss": 1.2811, "mean_token_accuracy": 0.6839507017284632, "num_tokens": 202871.0, "step": 252 }, { "epoch": 0.06726694915254237, "grad_norm": 1.745080828666687, "learning_rate": 9.966498940677967e-06, "loss": 1.6561, "mean_token_accuracy": 0.6370409727096558, "num_tokens": 204340.0, "step": 254 }, { "epoch": 0.06779661016949153, "grad_norm": 1.6199792623519897, "learning_rate": 9.966234110169492e-06, "loss": 1.5619, "mean_token_accuracy": 0.6660095751285553, "num_tokens": 206156.0, "step": 256 }, { "epoch": 0.06832627118644068, "grad_norm": 1.5917309522628784, "learning_rate": 9.965969279661017e-06, "loss": 1.5102, "mean_token_accuracy": 0.6727786511182785, "num_tokens": 207998.0, "step": 258 }, { "epoch": 0.06885593220338983, "grad_norm": 1.5581461191177368, "learning_rate": 9.965704449152544e-06, "loss": 1.5228, "mean_token_accuracy": 0.6590321734547615, "num_tokens": 209838.0, "step": 260 }, { "epoch": 0.06938559322033898, "grad_norm": 2.346834421157837, "learning_rate": 9.965439618644068e-06, "loss": 1.765, "mean_token_accuracy": 0.6231243349611759, "num_tokens": 211417.0, "step": 262 }, { "epoch": 0.06991525423728813, "grad_norm": 1.7151641845703125, "learning_rate": 9.965174788135595e-06, "loss": 1.4685, "mean_token_accuracy": 0.6573059856891632, "num_tokens": 213116.0, "step": 264 }, { "epoch": 0.0704449152542373, "grad_norm": 1.7705423831939697, "learning_rate": 9.964909957627118e-06, "loss": 1.6798, "mean_token_accuracy": 0.6684731282293797, "num_tokens": 215398.0, "step": 266 }, { "epoch": 0.07097457627118645, "grad_norm": 1.564461350440979, "learning_rate": 9.964645127118645e-06, "loss": 1.512, "mean_token_accuracy": 0.6533032581210136, "num_tokens": 216839.0, "step": 268 }, { "epoch": 0.0715042372881356, "grad_norm": 1.6634494066238403, "learning_rate": 9.96438029661017e-06, "loss": 1.9012, "mean_token_accuracy": 0.6104568429291248, "num_tokens": 218426.0, "step": 270 }, { "epoch": 0.07203389830508475, "grad_norm": 1.6394497156143188, "learning_rate": 9.964115466101696e-06, "loss": 1.0365, "mean_token_accuracy": 0.7485365234315395, "num_tokens": 220074.0, "step": 272 }, { "epoch": 0.0725635593220339, "grad_norm": 1.9613643884658813, "learning_rate": 9.963850635593221e-06, "loss": 1.0817, "mean_token_accuracy": 0.7139298096299171, "num_tokens": 221484.0, "step": 274 }, { "epoch": 0.07309322033898305, "grad_norm": 1.6178405284881592, "learning_rate": 9.963585805084746e-06, "loss": 1.7617, "mean_token_accuracy": 0.6229805871844292, "num_tokens": 223125.0, "step": 276 }, { "epoch": 0.0736228813559322, "grad_norm": 1.7911776304244995, "learning_rate": 9.963320974576271e-06, "loss": 1.5955, "mean_token_accuracy": 0.6430438458919525, "num_tokens": 224720.0, "step": 278 }, { "epoch": 0.07415254237288135, "grad_norm": 1.8381720781326294, "learning_rate": 9.963056144067798e-06, "loss": 1.4593, "mean_token_accuracy": 0.710250910371542, "num_tokens": 226455.0, "step": 280 }, { "epoch": 0.0746822033898305, "grad_norm": 2.26836895942688, "learning_rate": 9.962791313559323e-06, "loss": 1.546, "mean_token_accuracy": 0.6757340282201767, "num_tokens": 228236.0, "step": 282 }, { "epoch": 0.07521186440677965, "grad_norm": 1.733903169631958, "learning_rate": 9.962526483050848e-06, "loss": 1.6503, "mean_token_accuracy": 0.6666452698409557, "num_tokens": 229800.0, "step": 284 }, { "epoch": 0.07574152542372882, "grad_norm": 2.31662654876709, "learning_rate": 9.962261652542373e-06, "loss": 1.5316, "mean_token_accuracy": 0.6632180958986282, "num_tokens": 231208.0, "step": 286 }, { "epoch": 0.07627118644067797, "grad_norm": 1.6854466199874878, "learning_rate": 9.9619968220339e-06, "loss": 1.9069, "mean_token_accuracy": 0.6111326739192009, "num_tokens": 232806.0, "step": 288 }, { "epoch": 0.07680084745762712, "grad_norm": 1.4609169960021973, "learning_rate": 9.961731991525424e-06, "loss": 1.0583, "mean_token_accuracy": 0.7293980419635773, "num_tokens": 234357.0, "step": 290 }, { "epoch": 0.07733050847457627, "grad_norm": 1.6687240600585938, "learning_rate": 9.96146716101695e-06, "loss": 1.0465, "mean_token_accuracy": 0.7411384582519531, "num_tokens": 235887.0, "step": 292 }, { "epoch": 0.07786016949152542, "grad_norm": 1.4112576246261597, "learning_rate": 9.961202330508474e-06, "loss": 1.2078, "mean_token_accuracy": 0.7125238850712776, "num_tokens": 237645.0, "step": 294 }, { "epoch": 0.07838983050847458, "grad_norm": 2.0098953247070312, "learning_rate": 9.9609375e-06, "loss": 1.8732, "mean_token_accuracy": 0.5860188789665699, "num_tokens": 239269.0, "step": 296 }, { "epoch": 0.07891949152542373, "grad_norm": 1.7466524839401245, "learning_rate": 9.960672669491527e-06, "loss": 1.6504, "mean_token_accuracy": 0.6433536186814308, "num_tokens": 240757.0, "step": 298 }, { "epoch": 0.07944915254237288, "grad_norm": 1.7333427667617798, "learning_rate": 9.960407838983052e-06, "loss": 1.9177, "mean_token_accuracy": 0.5651349201798439, "num_tokens": 242585.0, "step": 300 }, { "epoch": 0.07997881355932203, "grad_norm": 1.709916591644287, "learning_rate": 9.960143008474577e-06, "loss": 1.1649, "mean_token_accuracy": 0.7331667393445969, "num_tokens": 243876.0, "step": 302 }, { "epoch": 0.08050847457627118, "grad_norm": 1.8282674551010132, "learning_rate": 9.959878177966102e-06, "loss": 1.2728, "mean_token_accuracy": 0.7109148055315018, "num_tokens": 245202.0, "step": 304 }, { "epoch": 0.08103813559322035, "grad_norm": 2.8617560863494873, "learning_rate": 9.959613347457629e-06, "loss": 1.0424, "mean_token_accuracy": 0.7504686415195465, "num_tokens": 246579.0, "step": 306 }, { "epoch": 0.0815677966101695, "grad_norm": 2.0111913681030273, "learning_rate": 9.959348516949154e-06, "loss": 1.5797, "mean_token_accuracy": 0.6779318116605282, "num_tokens": 247936.0, "step": 308 }, { "epoch": 0.08209745762711865, "grad_norm": 1.7723294496536255, "learning_rate": 9.959083686440679e-06, "loss": 1.7424, "mean_token_accuracy": 0.6139780580997467, "num_tokens": 249595.0, "step": 310 }, { "epoch": 0.0826271186440678, "grad_norm": 1.8593417406082153, "learning_rate": 9.958818855932204e-06, "loss": 1.2218, "mean_token_accuracy": 0.7228121235966682, "num_tokens": 250932.0, "step": 312 }, { "epoch": 0.08315677966101695, "grad_norm": 1.833020806312561, "learning_rate": 9.95855402542373e-06, "loss": 1.6159, "mean_token_accuracy": 0.672015868127346, "num_tokens": 253252.0, "step": 314 }, { "epoch": 0.0836864406779661, "grad_norm": 2.3483023643493652, "learning_rate": 9.958289194915255e-06, "loss": 1.0776, "mean_token_accuracy": 0.7260991856455803, "num_tokens": 255050.0, "step": 316 }, { "epoch": 0.08421610169491525, "grad_norm": 2.114051103591919, "learning_rate": 9.958024364406782e-06, "loss": 1.6736, "mean_token_accuracy": 0.6415327824652195, "num_tokens": 256547.0, "step": 318 }, { "epoch": 0.0847457627118644, "grad_norm": 1.1944133043289185, "learning_rate": 9.957759533898305e-06, "loss": 1.1326, "mean_token_accuracy": 0.7290316596627235, "num_tokens": 258224.0, "step": 320 }, { "epoch": 0.08527542372881355, "grad_norm": 1.362478494644165, "learning_rate": 9.957494703389832e-06, "loss": 1.2927, "mean_token_accuracy": 0.7171972468495369, "num_tokens": 259706.0, "step": 322 }, { "epoch": 0.0858050847457627, "grad_norm": 1.4699339866638184, "learning_rate": 9.957229872881357e-06, "loss": 1.0394, "mean_token_accuracy": 0.7793666273355484, "num_tokens": 261662.0, "step": 324 }, { "epoch": 0.08633474576271187, "grad_norm": 2.1047210693359375, "learning_rate": 9.956965042372883e-06, "loss": 1.8787, "mean_token_accuracy": 0.6193991005420685, "num_tokens": 263307.0, "step": 326 }, { "epoch": 0.08686440677966102, "grad_norm": 2.3721725940704346, "learning_rate": 9.956700211864408e-06, "loss": 1.5215, "mean_token_accuracy": 0.671922717243433, "num_tokens": 264606.0, "step": 328 }, { "epoch": 0.08739406779661017, "grad_norm": 1.7759004831314087, "learning_rate": 9.956435381355933e-06, "loss": 1.5447, "mean_token_accuracy": 0.6884756833314896, "num_tokens": 266038.0, "step": 330 }, { "epoch": 0.08792372881355932, "grad_norm": 2.0987448692321777, "learning_rate": 9.956170550847458e-06, "loss": 1.7101, "mean_token_accuracy": 0.6091567873954773, "num_tokens": 267758.0, "step": 332 }, { "epoch": 0.08845338983050847, "grad_norm": 1.841191053390503, "learning_rate": 9.955905720338985e-06, "loss": 1.5275, "mean_token_accuracy": 0.6734009273350239, "num_tokens": 269311.0, "step": 334 }, { "epoch": 0.08898305084745763, "grad_norm": 1.7459673881530762, "learning_rate": 9.95564088983051e-06, "loss": 1.0377, "mean_token_accuracy": 0.7418401539325714, "num_tokens": 271184.0, "step": 336 }, { "epoch": 0.08951271186440678, "grad_norm": 1.763412594795227, "learning_rate": 9.955376059322034e-06, "loss": 1.2988, "mean_token_accuracy": 0.6978670731186867, "num_tokens": 273058.0, "step": 338 }, { "epoch": 0.09004237288135593, "grad_norm": 1.5794516801834106, "learning_rate": 9.95511122881356e-06, "loss": 1.3627, "mean_token_accuracy": 0.6770955324172974, "num_tokens": 274507.0, "step": 340 }, { "epoch": 0.09057203389830508, "grad_norm": 1.9028760194778442, "learning_rate": 9.954846398305086e-06, "loss": 1.4792, "mean_token_accuracy": 0.6572467386722565, "num_tokens": 276056.0, "step": 342 }, { "epoch": 0.09110169491525423, "grad_norm": 1.9547511339187622, "learning_rate": 9.954581567796611e-06, "loss": 1.2321, "mean_token_accuracy": 0.7310040555894375, "num_tokens": 277668.0, "step": 344 }, { "epoch": 0.0916313559322034, "grad_norm": 1.9128260612487793, "learning_rate": 9.954316737288136e-06, "loss": 1.3527, "mean_token_accuracy": 0.7057406157255173, "num_tokens": 279259.0, "step": 346 }, { "epoch": 0.09216101694915255, "grad_norm": 1.5533405542373657, "learning_rate": 9.95405190677966e-06, "loss": 1.3994, "mean_token_accuracy": 0.674719512462616, "num_tokens": 281042.0, "step": 348 }, { "epoch": 0.0926906779661017, "grad_norm": 2.3660829067230225, "learning_rate": 9.953787076271187e-06, "loss": 1.5082, "mean_token_accuracy": 0.6611040458083153, "num_tokens": 282471.0, "step": 350 }, { "epoch": 0.09322033898305085, "grad_norm": 1.7340567111968994, "learning_rate": 9.953522245762712e-06, "loss": 1.6754, "mean_token_accuracy": 0.6478119790554047, "num_tokens": 284027.0, "step": 352 }, { "epoch": 0.09375, "grad_norm": 1.8736917972564697, "learning_rate": 9.953257415254239e-06, "loss": 1.7538, "mean_token_accuracy": 0.6088890805840492, "num_tokens": 286598.0, "step": 354 }, { "epoch": 0.09427966101694915, "grad_norm": 1.9580436944961548, "learning_rate": 9.952992584745764e-06, "loss": 1.9541, "mean_token_accuracy": 0.6110469400882721, "num_tokens": 288213.0, "step": 356 }, { "epoch": 0.0948093220338983, "grad_norm": 2.0023841857910156, "learning_rate": 9.952727754237289e-06, "loss": 1.2838, "mean_token_accuracy": 0.6968412399291992, "num_tokens": 289552.0, "step": 358 }, { "epoch": 0.09533898305084745, "grad_norm": 1.6580374240875244, "learning_rate": 9.952462923728814e-06, "loss": 1.3696, "mean_token_accuracy": 0.7051873132586479, "num_tokens": 291129.0, "step": 360 }, { "epoch": 0.0958686440677966, "grad_norm": 1.763455867767334, "learning_rate": 9.95219809322034e-06, "loss": 1.7653, "mean_token_accuracy": 0.6204437538981438, "num_tokens": 292958.0, "step": 362 }, { "epoch": 0.09639830508474577, "grad_norm": 1.7698560953140259, "learning_rate": 9.951933262711865e-06, "loss": 1.4722, "mean_token_accuracy": 0.6613679379224777, "num_tokens": 294634.0, "step": 364 }, { "epoch": 0.09692796610169492, "grad_norm": 1.8266053199768066, "learning_rate": 9.95166843220339e-06, "loss": 1.5242, "mean_token_accuracy": 0.688556544482708, "num_tokens": 296296.0, "step": 366 }, { "epoch": 0.09745762711864407, "grad_norm": 1.8256853818893433, "learning_rate": 9.951403601694915e-06, "loss": 1.8325, "mean_token_accuracy": 0.611061330884695, "num_tokens": 298000.0, "step": 368 }, { "epoch": 0.09798728813559322, "grad_norm": 1.4484354257583618, "learning_rate": 9.951138771186442e-06, "loss": 1.5207, "mean_token_accuracy": 0.6687660068273544, "num_tokens": 299811.0, "step": 370 }, { "epoch": 0.09851694915254237, "grad_norm": 1.905441164970398, "learning_rate": 9.950873940677967e-06, "loss": 1.1733, "mean_token_accuracy": 0.7064818814396858, "num_tokens": 301691.0, "step": 372 }, { "epoch": 0.09904661016949153, "grad_norm": 1.4455541372299194, "learning_rate": 9.950609110169492e-06, "loss": 1.3946, "mean_token_accuracy": 0.6605423241853714, "num_tokens": 303576.0, "step": 374 }, { "epoch": 0.09957627118644068, "grad_norm": 1.5487926006317139, "learning_rate": 9.950344279661017e-06, "loss": 1.5571, "mean_token_accuracy": 0.6658943369984627, "num_tokens": 304962.0, "step": 376 }, { "epoch": 0.10010593220338983, "grad_norm": 1.9543153047561646, "learning_rate": 9.950079449152543e-06, "loss": 1.4263, "mean_token_accuracy": 0.6808644533157349, "num_tokens": 306305.0, "step": 378 }, { "epoch": 0.10063559322033898, "grad_norm": 1.3512108325958252, "learning_rate": 9.949814618644068e-06, "loss": 1.1346, "mean_token_accuracy": 0.7231660708785057, "num_tokens": 308071.0, "step": 380 }, { "epoch": 0.10116525423728813, "grad_norm": 1.7263091802597046, "learning_rate": 9.949549788135595e-06, "loss": 1.3227, "mean_token_accuracy": 0.6850630715489388, "num_tokens": 309680.0, "step": 382 }, { "epoch": 0.1016949152542373, "grad_norm": 1.5390876531600952, "learning_rate": 9.94928495762712e-06, "loss": 1.4111, "mean_token_accuracy": 0.6960853487253189, "num_tokens": 311165.0, "step": 384 }, { "epoch": 0.10222457627118645, "grad_norm": 1.2161409854888916, "learning_rate": 9.949020127118645e-06, "loss": 1.079, "mean_token_accuracy": 0.7408205345273018, "num_tokens": 313629.0, "step": 386 }, { "epoch": 0.1027542372881356, "grad_norm": 1.9592559337615967, "learning_rate": 9.948755296610171e-06, "loss": 1.2669, "mean_token_accuracy": 0.7216491661965847, "num_tokens": 315579.0, "step": 388 }, { "epoch": 0.10328389830508475, "grad_norm": 2.2291131019592285, "learning_rate": 9.948490466101696e-06, "loss": 1.5296, "mean_token_accuracy": 0.6495336070656776, "num_tokens": 316894.0, "step": 390 }, { "epoch": 0.1038135593220339, "grad_norm": 1.6749569177627563, "learning_rate": 9.948225635593221e-06, "loss": 1.4591, "mean_token_accuracy": 0.6717566847801208, "num_tokens": 318563.0, "step": 392 }, { "epoch": 0.10434322033898305, "grad_norm": 1.268286943435669, "learning_rate": 9.947960805084746e-06, "loss": 1.3409, "mean_token_accuracy": 0.7098381966352463, "num_tokens": 320336.0, "step": 394 }, { "epoch": 0.1048728813559322, "grad_norm": 1.9920912981033325, "learning_rate": 9.947695974576273e-06, "loss": 1.0609, "mean_token_accuracy": 0.7265462875366211, "num_tokens": 321829.0, "step": 396 }, { "epoch": 0.10540254237288135, "grad_norm": 1.5729392766952515, "learning_rate": 9.947431144067798e-06, "loss": 1.2566, "mean_token_accuracy": 0.7017540335655212, "num_tokens": 323298.0, "step": 398 }, { "epoch": 0.1059322033898305, "grad_norm": 1.4688762426376343, "learning_rate": 9.947166313559322e-06, "loss": 1.066, "mean_token_accuracy": 0.7573279067873955, "num_tokens": 324847.0, "step": 400 }, { "epoch": 0.10646186440677965, "grad_norm": 1.771448016166687, "learning_rate": 9.946901483050847e-06, "loss": 1.5344, "mean_token_accuracy": 0.6811985597014427, "num_tokens": 326343.0, "step": 402 }, { "epoch": 0.10699152542372882, "grad_norm": 1.4460303783416748, "learning_rate": 9.946636652542374e-06, "loss": 0.8707, "mean_token_accuracy": 0.7833942845463753, "num_tokens": 327806.0, "step": 404 }, { "epoch": 0.10752118644067797, "grad_norm": 1.7068620920181274, "learning_rate": 9.946371822033899e-06, "loss": 1.3426, "mean_token_accuracy": 0.702693298459053, "num_tokens": 329305.0, "step": 406 }, { "epoch": 0.10805084745762712, "grad_norm": 1.7135292291641235, "learning_rate": 9.946106991525426e-06, "loss": 1.3615, "mean_token_accuracy": 0.6748966984450817, "num_tokens": 330810.0, "step": 408 }, { "epoch": 0.10858050847457627, "grad_norm": 1.7509855031967163, "learning_rate": 9.94584216101695e-06, "loss": 1.4911, "mean_token_accuracy": 0.6636864989995956, "num_tokens": 332658.0, "step": 410 }, { "epoch": 0.10911016949152542, "grad_norm": 1.6215853691101074, "learning_rate": 9.945577330508475e-06, "loss": 1.3422, "mean_token_accuracy": 0.6625668257474899, "num_tokens": 334311.0, "step": 412 }, { "epoch": 0.10963983050847458, "grad_norm": 2.3820853233337402, "learning_rate": 9.9453125e-06, "loss": 1.3565, "mean_token_accuracy": 0.6814393922686577, "num_tokens": 335732.0, "step": 414 }, { "epoch": 0.11016949152542373, "grad_norm": 2.1386454105377197, "learning_rate": 9.945047669491527e-06, "loss": 1.308, "mean_token_accuracy": 0.7139138132333755, "num_tokens": 337128.0, "step": 416 }, { "epoch": 0.11069915254237288, "grad_norm": 1.796376347541809, "learning_rate": 9.944782838983052e-06, "loss": 1.7154, "mean_token_accuracy": 0.6222399286925793, "num_tokens": 338701.0, "step": 418 }, { "epoch": 0.11122881355932203, "grad_norm": 2.041480541229248, "learning_rate": 9.944518008474577e-06, "loss": 1.5013, "mean_token_accuracy": 0.6676587723195553, "num_tokens": 340161.0, "step": 420 }, { "epoch": 0.11175847457627118, "grad_norm": 1.7185332775115967, "learning_rate": 9.944253177966102e-06, "loss": 1.7238, "mean_token_accuracy": 0.614720243960619, "num_tokens": 341865.0, "step": 422 }, { "epoch": 0.11228813559322035, "grad_norm": 1.629929780960083, "learning_rate": 9.943988347457628e-06, "loss": 1.8522, "mean_token_accuracy": 0.5945881642401218, "num_tokens": 343497.0, "step": 424 }, { "epoch": 0.1128177966101695, "grad_norm": 1.7689826488494873, "learning_rate": 9.943723516949153e-06, "loss": 1.4556, "mean_token_accuracy": 0.6844851188361645, "num_tokens": 344865.0, "step": 426 }, { "epoch": 0.11334745762711865, "grad_norm": 1.6568607091903687, "learning_rate": 9.943458686440678e-06, "loss": 1.417, "mean_token_accuracy": 0.6856525912880898, "num_tokens": 346476.0, "step": 428 }, { "epoch": 0.1138771186440678, "grad_norm": 2.167257070541382, "learning_rate": 9.943193855932203e-06, "loss": 1.6235, "mean_token_accuracy": 0.6385648548603058, "num_tokens": 348110.0, "step": 430 }, { "epoch": 0.11440677966101695, "grad_norm": 1.6102381944656372, "learning_rate": 9.94292902542373e-06, "loss": 1.7613, "mean_token_accuracy": 0.6562721692025661, "num_tokens": 349513.0, "step": 432 }, { "epoch": 0.1149364406779661, "grad_norm": 1.491376519203186, "learning_rate": 9.942664194915255e-06, "loss": 1.4805, "mean_token_accuracy": 0.6825698353350163, "num_tokens": 351087.0, "step": 434 }, { "epoch": 0.11546610169491525, "grad_norm": 2.127281904220581, "learning_rate": 9.942399364406781e-06, "loss": 1.567, "mean_token_accuracy": 0.6603780500590801, "num_tokens": 352867.0, "step": 436 }, { "epoch": 0.1159957627118644, "grad_norm": 2.0487139225006104, "learning_rate": 9.942134533898306e-06, "loss": 1.1488, "mean_token_accuracy": 0.7540232725441456, "num_tokens": 354419.0, "step": 438 }, { "epoch": 0.11652542372881355, "grad_norm": 2.3364503383636475, "learning_rate": 9.941869703389831e-06, "loss": 1.8929, "mean_token_accuracy": 0.590262483805418, "num_tokens": 356057.0, "step": 440 }, { "epoch": 0.1170550847457627, "grad_norm": 1.9606975317001343, "learning_rate": 9.941604872881356e-06, "loss": 1.7041, "mean_token_accuracy": 0.6528899148106575, "num_tokens": 357925.0, "step": 442 }, { "epoch": 0.11758474576271187, "grad_norm": 1.7035855054855347, "learning_rate": 9.941340042372883e-06, "loss": 1.4413, "mean_token_accuracy": 0.6801901943981647, "num_tokens": 359490.0, "step": 444 }, { "epoch": 0.11811440677966102, "grad_norm": 1.4629147052764893, "learning_rate": 9.941075211864408e-06, "loss": 1.3774, "mean_token_accuracy": 0.6951415091753006, "num_tokens": 361023.0, "step": 446 }, { "epoch": 0.11864406779661017, "grad_norm": 2.4775326251983643, "learning_rate": 9.940810381355933e-06, "loss": 1.6041, "mean_token_accuracy": 0.6668714433908463, "num_tokens": 362711.0, "step": 448 }, { "epoch": 0.11917372881355932, "grad_norm": 1.8101361989974976, "learning_rate": 9.940545550847458e-06, "loss": 1.6086, "mean_token_accuracy": 0.6758907176554203, "num_tokens": 364273.0, "step": 450 }, { "epoch": 0.11970338983050847, "grad_norm": 1.6925305128097534, "learning_rate": 9.940280720338984e-06, "loss": 1.6313, "mean_token_accuracy": 0.646467637270689, "num_tokens": 365858.0, "step": 452 }, { "epoch": 0.12023305084745763, "grad_norm": 1.6824592351913452, "learning_rate": 9.940015889830509e-06, "loss": 1.0182, "mean_token_accuracy": 0.7616409212350845, "num_tokens": 367314.0, "step": 454 }, { "epoch": 0.12076271186440678, "grad_norm": 1.728043794631958, "learning_rate": 9.939751059322034e-06, "loss": 1.4693, "mean_token_accuracy": 0.6804414726793766, "num_tokens": 368959.0, "step": 456 }, { "epoch": 0.12129237288135593, "grad_norm": 1.4486781358718872, "learning_rate": 9.939486228813559e-06, "loss": 1.2148, "mean_token_accuracy": 0.7162006124854088, "num_tokens": 370642.0, "step": 458 }, { "epoch": 0.12182203389830508, "grad_norm": 1.1412382125854492, "learning_rate": 9.939221398305086e-06, "loss": 0.8745, "mean_token_accuracy": 0.7825608849525452, "num_tokens": 372370.0, "step": 460 }, { "epoch": 0.12235169491525423, "grad_norm": 1.7980690002441406, "learning_rate": 9.93895656779661e-06, "loss": 1.7688, "mean_token_accuracy": 0.6255956813693047, "num_tokens": 374243.0, "step": 462 }, { "epoch": 0.1228813559322034, "grad_norm": 1.6589163541793823, "learning_rate": 9.938691737288137e-06, "loss": 1.435, "mean_token_accuracy": 0.660824827849865, "num_tokens": 376131.0, "step": 464 }, { "epoch": 0.12341101694915255, "grad_norm": 1.824189305305481, "learning_rate": 9.938426906779662e-06, "loss": 1.179, "mean_token_accuracy": 0.7334876731038094, "num_tokens": 377538.0, "step": 466 }, { "epoch": 0.1239406779661017, "grad_norm": 1.480167269706726, "learning_rate": 9.938162076271187e-06, "loss": 1.0286, "mean_token_accuracy": 0.7682427167892456, "num_tokens": 378887.0, "step": 468 }, { "epoch": 0.12447033898305085, "grad_norm": 1.8163381814956665, "learning_rate": 9.937897245762714e-06, "loss": 1.2161, "mean_token_accuracy": 0.7056666538119316, "num_tokens": 380373.0, "step": 470 }, { "epoch": 0.125, "grad_norm": 1.8242475986480713, "learning_rate": 9.937632415254239e-06, "loss": 1.4143, "mean_token_accuracy": 0.6946615055203438, "num_tokens": 381836.0, "step": 472 }, { "epoch": 0.12552966101694915, "grad_norm": 1.8008042573928833, "learning_rate": 9.937367584745763e-06, "loss": 1.1021, "mean_token_accuracy": 0.7447023391723633, "num_tokens": 383531.0, "step": 474 }, { "epoch": 0.1260593220338983, "grad_norm": 2.015711545944214, "learning_rate": 9.937102754237288e-06, "loss": 1.3821, "mean_token_accuracy": 0.6798715889453888, "num_tokens": 384935.0, "step": 476 }, { "epoch": 0.12658898305084745, "grad_norm": 1.5072296857833862, "learning_rate": 9.936837923728815e-06, "loss": 0.9835, "mean_token_accuracy": 0.760782279074192, "num_tokens": 386553.0, "step": 478 }, { "epoch": 0.1271186440677966, "grad_norm": 2.007265567779541, "learning_rate": 9.93657309322034e-06, "loss": 1.8103, "mean_token_accuracy": 0.6010462641716003, "num_tokens": 387989.0, "step": 480 }, { "epoch": 0.12764830508474576, "grad_norm": 1.6380457878112793, "learning_rate": 9.936308262711865e-06, "loss": 1.3089, "mean_token_accuracy": 0.6814712136983871, "num_tokens": 389581.0, "step": 482 }, { "epoch": 0.1281779661016949, "grad_norm": 1.168702244758606, "learning_rate": 9.93604343220339e-06, "loss": 0.8784, "mean_token_accuracy": 0.7872947454452515, "num_tokens": 391256.0, "step": 484 }, { "epoch": 0.12870762711864406, "grad_norm": 1.6083400249481201, "learning_rate": 9.935778601694916e-06, "loss": 1.6186, "mean_token_accuracy": 0.648200087249279, "num_tokens": 392707.0, "step": 486 }, { "epoch": 0.1292372881355932, "grad_norm": 2.072547197341919, "learning_rate": 9.935513771186441e-06, "loss": 1.3425, "mean_token_accuracy": 0.6844190955162048, "num_tokens": 394165.0, "step": 488 }, { "epoch": 0.12976694915254236, "grad_norm": 1.7350910902023315, "learning_rate": 9.935248940677968e-06, "loss": 1.3164, "mean_token_accuracy": 0.7188730016350746, "num_tokens": 395775.0, "step": 490 }, { "epoch": 0.13029661016949154, "grad_norm": 1.9530750513076782, "learning_rate": 9.934984110169493e-06, "loss": 1.6004, "mean_token_accuracy": 0.6352651007473469, "num_tokens": 397525.0, "step": 492 }, { "epoch": 0.1308262711864407, "grad_norm": 1.7759456634521484, "learning_rate": 9.934719279661018e-06, "loss": 0.9874, "mean_token_accuracy": 0.7526594549417496, "num_tokens": 399043.0, "step": 494 }, { "epoch": 0.13135593220338984, "grad_norm": 1.8589885234832764, "learning_rate": 9.934454449152543e-06, "loss": 1.3307, "mean_token_accuracy": 0.7161327973008156, "num_tokens": 400609.0, "step": 496 }, { "epoch": 0.131885593220339, "grad_norm": 2.1742420196533203, "learning_rate": 9.93418961864407e-06, "loss": 1.6412, "mean_token_accuracy": 0.6395372115075588, "num_tokens": 402112.0, "step": 498 }, { "epoch": 0.13241525423728814, "grad_norm": 2.4230217933654785, "learning_rate": 9.933924788135594e-06, "loss": 1.4681, "step": 500 }, { "epoch": 0.13241525423728814, "eval_loss": 1.3937320709228516, "eval_mean_token_accuracy": 0.6848550191173306, "eval_num_tokens": 404281.0, "eval_runtime": 48.297, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 500 }, { "epoch": 0.1329449152542373, "grad_norm": 1.4254608154296875, "learning_rate": 9.93365995762712e-06, "loss": 0.8884, "mean_token_accuracy": 0.7554112188518047, "num_tokens": 406430.0, "step": 502 }, { "epoch": 0.13347457627118645, "grad_norm": 1.7688559293746948, "learning_rate": 9.933395127118644e-06, "loss": 1.6942, "mean_token_accuracy": 0.6365661732852459, "num_tokens": 408049.0, "step": 504 }, { "epoch": 0.1340042372881356, "grad_norm": 1.425464391708374, "learning_rate": 9.93313029661017e-06, "loss": 1.0052, "mean_token_accuracy": 0.7540885135531425, "num_tokens": 409502.0, "step": 506 }, { "epoch": 0.13453389830508475, "grad_norm": 1.5085105895996094, "learning_rate": 9.932865466101696e-06, "loss": 1.5399, "mean_token_accuracy": 0.6580390445888042, "num_tokens": 411118.0, "step": 508 }, { "epoch": 0.1350635593220339, "grad_norm": 1.3953803777694702, "learning_rate": 9.93260063559322e-06, "loss": 1.2375, "mean_token_accuracy": 0.7172570377588272, "num_tokens": 412928.0, "step": 510 }, { "epoch": 0.13559322033898305, "grad_norm": 1.6118196249008179, "learning_rate": 9.932335805084746e-06, "loss": 1.6304, "mean_token_accuracy": 0.6440053768455982, "num_tokens": 414822.0, "step": 512 }, { "epoch": 0.1361228813559322, "grad_norm": 1.34540855884552, "learning_rate": 9.932070974576272e-06, "loss": 1.2298, "mean_token_accuracy": 0.720926083624363, "num_tokens": 416483.0, "step": 514 }, { "epoch": 0.13665254237288135, "grad_norm": 1.757503867149353, "learning_rate": 9.931806144067797e-06, "loss": 0.9304, "mean_token_accuracy": 0.7754505649209023, "num_tokens": 418104.0, "step": 516 }, { "epoch": 0.1371822033898305, "grad_norm": 1.6745868921279907, "learning_rate": 9.931541313559324e-06, "loss": 1.4355, "mean_token_accuracy": 0.6540173031389713, "num_tokens": 419903.0, "step": 518 }, { "epoch": 0.13771186440677965, "grad_norm": 1.4681719541549683, "learning_rate": 9.931276483050849e-06, "loss": 1.3007, "mean_token_accuracy": 0.7121259197592735, "num_tokens": 421607.0, "step": 520 }, { "epoch": 0.1382415254237288, "grad_norm": 1.5515954494476318, "learning_rate": 9.931011652542374e-06, "loss": 1.5635, "mean_token_accuracy": 0.6485245302319527, "num_tokens": 423501.0, "step": 522 }, { "epoch": 0.13877118644067796, "grad_norm": 1.6643019914627075, "learning_rate": 9.930746822033899e-06, "loss": 1.5898, "mean_token_accuracy": 0.6650684215128422, "num_tokens": 425045.0, "step": 524 }, { "epoch": 0.1393008474576271, "grad_norm": 1.6884112358093262, "learning_rate": 9.930481991525425e-06, "loss": 1.6323, "mean_token_accuracy": 0.6284923739731312, "num_tokens": 426783.0, "step": 526 }, { "epoch": 0.13983050847457626, "grad_norm": 1.982269287109375, "learning_rate": 9.93021716101695e-06, "loss": 1.4609, "mean_token_accuracy": 0.6697079613804817, "num_tokens": 428463.0, "step": 528 }, { "epoch": 0.1403601694915254, "grad_norm": 1.8830633163452148, "learning_rate": 9.929952330508475e-06, "loss": 1.5469, "mean_token_accuracy": 0.6279741264879704, "num_tokens": 430250.0, "step": 530 }, { "epoch": 0.1408898305084746, "grad_norm": 1.507760763168335, "learning_rate": 9.9296875e-06, "loss": 1.0645, "mean_token_accuracy": 0.7404237687587738, "num_tokens": 431806.0, "step": 532 }, { "epoch": 0.14141949152542374, "grad_norm": 1.5673879384994507, "learning_rate": 9.929422669491527e-06, "loss": 1.3647, "mean_token_accuracy": 0.6792694628238678, "num_tokens": 433439.0, "step": 534 }, { "epoch": 0.1419491525423729, "grad_norm": 1.7228164672851562, "learning_rate": 9.929157838983052e-06, "loss": 1.5845, "mean_token_accuracy": 0.6531463265419006, "num_tokens": 434927.0, "step": 536 }, { "epoch": 0.14247881355932204, "grad_norm": 1.3169156312942505, "learning_rate": 9.928893008474576e-06, "loss": 1.5774, "mean_token_accuracy": 0.6649730056524277, "num_tokens": 436635.0, "step": 538 }, { "epoch": 0.1430084745762712, "grad_norm": 1.7581442594528198, "learning_rate": 9.928628177966101e-06, "loss": 1.9434, "mean_token_accuracy": 0.609809048473835, "num_tokens": 438512.0, "step": 540 }, { "epoch": 0.14353813559322035, "grad_norm": 1.3604645729064941, "learning_rate": 9.928363347457628e-06, "loss": 1.4177, "mean_token_accuracy": 0.6686570718884468, "num_tokens": 440159.0, "step": 542 }, { "epoch": 0.1440677966101695, "grad_norm": 1.938342571258545, "learning_rate": 9.928098516949153e-06, "loss": 1.7016, "mean_token_accuracy": 0.6437485031783581, "num_tokens": 442700.0, "step": 544 }, { "epoch": 0.14459745762711865, "grad_norm": 1.5884978771209717, "learning_rate": 9.92783368644068e-06, "loss": 1.0807, "mean_token_accuracy": 0.7392308413982391, "num_tokens": 444255.0, "step": 546 }, { "epoch": 0.1451271186440678, "grad_norm": 1.4827502965927124, "learning_rate": 9.927568855932204e-06, "loss": 1.761, "mean_token_accuracy": 0.6321918219327927, "num_tokens": 446057.0, "step": 548 }, { "epoch": 0.14565677966101695, "grad_norm": 1.528292179107666, "learning_rate": 9.92730402542373e-06, "loss": 1.3911, "mean_token_accuracy": 0.6794048398733139, "num_tokens": 447783.0, "step": 550 }, { "epoch": 0.1461864406779661, "grad_norm": 1.1858106851577759, "learning_rate": 9.927039194915256e-06, "loss": 1.2818, "mean_token_accuracy": 0.7154315710067749, "num_tokens": 449524.0, "step": 552 }, { "epoch": 0.14671610169491525, "grad_norm": 1.5788283348083496, "learning_rate": 9.926774364406781e-06, "loss": 1.713, "mean_token_accuracy": 0.6507054306566715, "num_tokens": 451200.0, "step": 554 }, { "epoch": 0.1472457627118644, "grad_norm": 2.1711974143981934, "learning_rate": 9.926509533898306e-06, "loss": 1.3829, "mean_token_accuracy": 0.6865035593509674, "num_tokens": 452534.0, "step": 556 }, { "epoch": 0.14777542372881355, "grad_norm": 1.8831191062927246, "learning_rate": 9.92624470338983e-06, "loss": 1.5328, "mean_token_accuracy": 0.6559769064188004, "num_tokens": 454257.0, "step": 558 }, { "epoch": 0.1483050847457627, "grad_norm": 1.9336702823638916, "learning_rate": 9.925979872881357e-06, "loss": 1.519, "mean_token_accuracy": 0.6528650224208832, "num_tokens": 456086.0, "step": 560 }, { "epoch": 0.14883474576271186, "grad_norm": 1.840306282043457, "learning_rate": 9.925715042372882e-06, "loss": 1.4837, "mean_token_accuracy": 0.6699738129973412, "num_tokens": 457627.0, "step": 562 }, { "epoch": 0.149364406779661, "grad_norm": 1.494986891746521, "learning_rate": 9.925450211864407e-06, "loss": 1.3884, "mean_token_accuracy": 0.6767890751361847, "num_tokens": 459133.0, "step": 564 }, { "epoch": 0.14989406779661016, "grad_norm": 1.4574816226959229, "learning_rate": 9.925185381355932e-06, "loss": 1.1896, "mean_token_accuracy": 0.7028718441724777, "num_tokens": 460662.0, "step": 566 }, { "epoch": 0.1504237288135593, "grad_norm": 1.8815207481384277, "learning_rate": 9.924920550847459e-06, "loss": 1.3807, "mean_token_accuracy": 0.6590714752674103, "num_tokens": 462270.0, "step": 568 }, { "epoch": 0.15095338983050846, "grad_norm": 1.5782090425491333, "learning_rate": 9.924655720338984e-06, "loss": 1.3545, "mean_token_accuracy": 0.6996451988816261, "num_tokens": 463956.0, "step": 570 }, { "epoch": 0.15148305084745764, "grad_norm": 1.830964207649231, "learning_rate": 9.92439088983051e-06, "loss": 1.6445, "mean_token_accuracy": 0.647131435573101, "num_tokens": 465318.0, "step": 572 }, { "epoch": 0.1520127118644068, "grad_norm": 1.356331706047058, "learning_rate": 9.924126059322035e-06, "loss": 1.0563, "mean_token_accuracy": 0.7395019680261612, "num_tokens": 467079.0, "step": 574 }, { "epoch": 0.15254237288135594, "grad_norm": 2.0672202110290527, "learning_rate": 9.92386122881356e-06, "loss": 1.5871, "mean_token_accuracy": 0.6757973805069923, "num_tokens": 468346.0, "step": 576 }, { "epoch": 0.1530720338983051, "grad_norm": 1.631706714630127, "learning_rate": 9.923596398305085e-06, "loss": 1.1259, "mean_token_accuracy": 0.7157552093267441, "num_tokens": 470606.0, "step": 578 }, { "epoch": 0.15360169491525424, "grad_norm": 1.987979531288147, "learning_rate": 9.923331567796612e-06, "loss": 1.5413, "mean_token_accuracy": 0.6585549339652061, "num_tokens": 472480.0, "step": 580 }, { "epoch": 0.1541313559322034, "grad_norm": 1.4512251615524292, "learning_rate": 9.923066737288137e-06, "loss": 1.7471, "mean_token_accuracy": 0.6203080303966999, "num_tokens": 474132.0, "step": 582 }, { "epoch": 0.15466101694915255, "grad_norm": 1.6794267892837524, "learning_rate": 9.922801906779662e-06, "loss": 1.3117, "mean_token_accuracy": 0.6986596845090389, "num_tokens": 475990.0, "step": 584 }, { "epoch": 0.1551906779661017, "grad_norm": 1.455756425857544, "learning_rate": 9.922537076271187e-06, "loss": 1.281, "mean_token_accuracy": 0.7098699510097504, "num_tokens": 477669.0, "step": 586 }, { "epoch": 0.15572033898305085, "grad_norm": 1.8675779104232788, "learning_rate": 9.922272245762713e-06, "loss": 1.4377, "mean_token_accuracy": 0.6989539265632629, "num_tokens": 479385.0, "step": 588 }, { "epoch": 0.15625, "grad_norm": 2.2066726684570312, "learning_rate": 9.922007415254238e-06, "loss": 1.3799, "mean_token_accuracy": 0.6740908622741699, "num_tokens": 480650.0, "step": 590 }, { "epoch": 0.15677966101694915, "grad_norm": 1.2362158298492432, "learning_rate": 9.921742584745763e-06, "loss": 1.2804, "mean_token_accuracy": 0.7172962352633476, "num_tokens": 482333.0, "step": 592 }, { "epoch": 0.1573093220338983, "grad_norm": 1.629742980003357, "learning_rate": 9.921477754237288e-06, "loss": 1.5335, "mean_token_accuracy": 0.6744911074638367, "num_tokens": 483935.0, "step": 594 }, { "epoch": 0.15783898305084745, "grad_norm": 1.6263574361801147, "learning_rate": 9.921212923728815e-06, "loss": 1.4068, "mean_token_accuracy": 0.6532641872763634, "num_tokens": 485533.0, "step": 596 }, { "epoch": 0.1583686440677966, "grad_norm": 1.6428649425506592, "learning_rate": 9.92094809322034e-06, "loss": 0.9013, "mean_token_accuracy": 0.7872329726815224, "num_tokens": 486901.0, "step": 598 }, { "epoch": 0.15889830508474576, "grad_norm": 2.502692699432373, "learning_rate": 9.920683262711866e-06, "loss": 1.5595, "mean_token_accuracy": 0.6750899478793144, "num_tokens": 488205.0, "step": 600 }, { "epoch": 0.1594279661016949, "grad_norm": 1.6969702243804932, "learning_rate": 9.920418432203391e-06, "loss": 1.0716, "mean_token_accuracy": 0.7309423387050629, "num_tokens": 489876.0, "step": 602 }, { "epoch": 0.15995762711864406, "grad_norm": 1.9947600364685059, "learning_rate": 9.920153601694916e-06, "loss": 1.6787, "mean_token_accuracy": 0.6524284072220325, "num_tokens": 491471.0, "step": 604 }, { "epoch": 0.1604872881355932, "grad_norm": 1.790948510169983, "learning_rate": 9.919888771186441e-06, "loss": 1.3618, "mean_token_accuracy": 0.7094609439373016, "num_tokens": 493172.0, "step": 606 }, { "epoch": 0.16101694915254236, "grad_norm": 1.6518632173538208, "learning_rate": 9.919623940677968e-06, "loss": 1.4295, "mean_token_accuracy": 0.6798008941113949, "num_tokens": 494668.0, "step": 608 }, { "epoch": 0.16154661016949154, "grad_norm": 1.5216853618621826, "learning_rate": 9.919359110169493e-06, "loss": 1.6537, "mean_token_accuracy": 0.6537009552121162, "num_tokens": 496189.0, "step": 610 }, { "epoch": 0.1620762711864407, "grad_norm": 1.6010081768035889, "learning_rate": 9.919094279661017e-06, "loss": 1.7308, "mean_token_accuracy": 0.623883068561554, "num_tokens": 497884.0, "step": 612 }, { "epoch": 0.16260593220338984, "grad_norm": 1.929654598236084, "learning_rate": 9.918829449152542e-06, "loss": 1.422, "mean_token_accuracy": 0.6817982718348503, "num_tokens": 499475.0, "step": 614 }, { "epoch": 0.163135593220339, "grad_norm": 1.7829991579055786, "learning_rate": 9.918564618644069e-06, "loss": 1.556, "mean_token_accuracy": 0.6841001771390438, "num_tokens": 501462.0, "step": 616 }, { "epoch": 0.16366525423728814, "grad_norm": 1.643609881401062, "learning_rate": 9.918299788135594e-06, "loss": 1.394, "mean_token_accuracy": 0.6880982592701912, "num_tokens": 502931.0, "step": 618 }, { "epoch": 0.1641949152542373, "grad_norm": 1.7028234004974365, "learning_rate": 9.918034957627119e-06, "loss": 1.4286, "mean_token_accuracy": 0.6648978516459465, "num_tokens": 504538.0, "step": 620 }, { "epoch": 0.16472457627118645, "grad_norm": 1.5374736785888672, "learning_rate": 9.917770127118644e-06, "loss": 1.0224, "mean_token_accuracy": 0.731379933655262, "num_tokens": 507198.0, "step": 622 }, { "epoch": 0.1652542372881356, "grad_norm": 1.7910871505737305, "learning_rate": 9.91750529661017e-06, "loss": 1.5101, "mean_token_accuracy": 0.6880385279655457, "num_tokens": 508711.0, "step": 624 }, { "epoch": 0.16578389830508475, "grad_norm": 1.428447961807251, "learning_rate": 9.917240466101695e-06, "loss": 1.0697, "mean_token_accuracy": 0.7503379434347153, "num_tokens": 510357.0, "step": 626 }, { "epoch": 0.1663135593220339, "grad_norm": 1.4624168872833252, "learning_rate": 9.916975635593222e-06, "loss": 1.2787, "mean_token_accuracy": 0.6913928911089897, "num_tokens": 511828.0, "step": 628 }, { "epoch": 0.16684322033898305, "grad_norm": 1.9187239408493042, "learning_rate": 9.916710805084745e-06, "loss": 1.431, "mean_token_accuracy": 0.6832051128149033, "num_tokens": 513401.0, "step": 630 }, { "epoch": 0.1673728813559322, "grad_norm": 1.8455389738082886, "learning_rate": 9.916445974576272e-06, "loss": 1.3779, "mean_token_accuracy": 0.6800517365336418, "num_tokens": 515100.0, "step": 632 }, { "epoch": 0.16790254237288135, "grad_norm": 2.2066054344177246, "learning_rate": 9.916181144067798e-06, "loss": 1.5302, "mean_token_accuracy": 0.682159747928381, "num_tokens": 516523.0, "step": 634 }, { "epoch": 0.1684322033898305, "grad_norm": 1.5787866115570068, "learning_rate": 9.915916313559323e-06, "loss": 1.4817, "mean_token_accuracy": 0.6866355240345001, "num_tokens": 517989.0, "step": 636 }, { "epoch": 0.16896186440677965, "grad_norm": 1.9450236558914185, "learning_rate": 9.915651483050848e-06, "loss": 1.2764, "mean_token_accuracy": 0.6959664523601532, "num_tokens": 519530.0, "step": 638 }, { "epoch": 0.1694915254237288, "grad_norm": 2.0343222618103027, "learning_rate": 9.915386652542373e-06, "loss": 1.3253, "mean_token_accuracy": 0.6795613840222359, "num_tokens": 521056.0, "step": 640 }, { "epoch": 0.17002118644067796, "grad_norm": 1.9958717823028564, "learning_rate": 9.9151218220339e-06, "loss": 1.3949, "mean_token_accuracy": 0.6779462322592735, "num_tokens": 522587.0, "step": 642 }, { "epoch": 0.1705508474576271, "grad_norm": 2.0700485706329346, "learning_rate": 9.914856991525425e-06, "loss": 1.4484, "mean_token_accuracy": 0.6816198527812958, "num_tokens": 524194.0, "step": 644 }, { "epoch": 0.17108050847457626, "grad_norm": 2.0017640590667725, "learning_rate": 9.91459216101695e-06, "loss": 1.5875, "mean_token_accuracy": 0.606354620307684, "num_tokens": 525889.0, "step": 646 }, { "epoch": 0.1716101694915254, "grad_norm": 1.8611994981765747, "learning_rate": 9.914327330508475e-06, "loss": 1.8224, "mean_token_accuracy": 0.6287704706192017, "num_tokens": 527553.0, "step": 648 }, { "epoch": 0.1721398305084746, "grad_norm": 1.3863378763198853, "learning_rate": 9.914062500000001e-06, "loss": 0.897, "mean_token_accuracy": 0.7716369926929474, "num_tokens": 529372.0, "step": 650 }, { "epoch": 0.17266949152542374, "grad_norm": 1.9474018812179565, "learning_rate": 9.913797669491526e-06, "loss": 1.489, "mean_token_accuracy": 0.6766289249062538, "num_tokens": 531038.0, "step": 652 }, { "epoch": 0.1731991525423729, "grad_norm": 1.3115308284759521, "learning_rate": 9.913532838983053e-06, "loss": 1.0256, "mean_token_accuracy": 0.7329429388046265, "num_tokens": 532740.0, "step": 654 }, { "epoch": 0.17372881355932204, "grad_norm": 1.8103431463241577, "learning_rate": 9.913268008474578e-06, "loss": 1.4023, "mean_token_accuracy": 0.6692689433693886, "num_tokens": 534288.0, "step": 656 }, { "epoch": 0.1742584745762712, "grad_norm": 1.635090947151184, "learning_rate": 9.913003177966103e-06, "loss": 1.273, "mean_token_accuracy": 0.6873618364334106, "num_tokens": 535983.0, "step": 658 }, { "epoch": 0.17478813559322035, "grad_norm": 1.5008151531219482, "learning_rate": 9.912738347457628e-06, "loss": 0.933, "mean_token_accuracy": 0.7589449509978294, "num_tokens": 537520.0, "step": 660 }, { "epoch": 0.1753177966101695, "grad_norm": 1.6944507360458374, "learning_rate": 9.912473516949154e-06, "loss": 1.5795, "mean_token_accuracy": 0.6514994464814663, "num_tokens": 539229.0, "step": 662 }, { "epoch": 0.17584745762711865, "grad_norm": 1.8555341958999634, "learning_rate": 9.91220868644068e-06, "loss": 1.5626, "mean_token_accuracy": 0.6455405205488205, "num_tokens": 540965.0, "step": 664 }, { "epoch": 0.1763771186440678, "grad_norm": 1.6152762174606323, "learning_rate": 9.911943855932204e-06, "loss": 1.1467, "mean_token_accuracy": 0.7344977930188179, "num_tokens": 542603.0, "step": 666 }, { "epoch": 0.17690677966101695, "grad_norm": 2.622927665710449, "learning_rate": 9.911679025423729e-06, "loss": 1.8007, "mean_token_accuracy": 0.6262625567615032, "num_tokens": 544116.0, "step": 668 }, { "epoch": 0.1774364406779661, "grad_norm": 1.7333749532699585, "learning_rate": 9.911414194915256e-06, "loss": 1.437, "mean_token_accuracy": 0.6801304072141647, "num_tokens": 545644.0, "step": 670 }, { "epoch": 0.17796610169491525, "grad_norm": 1.8761110305786133, "learning_rate": 9.91114936440678e-06, "loss": 1.3994, "mean_token_accuracy": 0.6868420764803886, "num_tokens": 547130.0, "step": 672 }, { "epoch": 0.1784957627118644, "grad_norm": 1.9201432466506958, "learning_rate": 9.910884533898306e-06, "loss": 1.3443, "mean_token_accuracy": 0.693937636911869, "num_tokens": 548857.0, "step": 674 }, { "epoch": 0.17902542372881355, "grad_norm": 1.764388918876648, "learning_rate": 9.91061970338983e-06, "loss": 1.6383, "mean_token_accuracy": 0.652241662144661, "num_tokens": 550575.0, "step": 676 }, { "epoch": 0.1795550847457627, "grad_norm": 1.9486324787139893, "learning_rate": 9.910354872881357e-06, "loss": 1.4343, "mean_token_accuracy": 0.6672681868076324, "num_tokens": 552418.0, "step": 678 }, { "epoch": 0.18008474576271186, "grad_norm": 1.5014735460281372, "learning_rate": 9.910090042372882e-06, "loss": 1.4633, "mean_token_accuracy": 0.6691960729658604, "num_tokens": 554163.0, "step": 680 }, { "epoch": 0.180614406779661, "grad_norm": 1.4077109098434448, "learning_rate": 9.909825211864409e-06, "loss": 1.1125, "mean_token_accuracy": 0.7354493886232376, "num_tokens": 555897.0, "step": 682 }, { "epoch": 0.18114406779661016, "grad_norm": 1.7360155582427979, "learning_rate": 9.909560381355932e-06, "loss": 1.3808, "mean_token_accuracy": 0.6810082942247391, "num_tokens": 557574.0, "step": 684 }, { "epoch": 0.1816737288135593, "grad_norm": 1.7283200025558472, "learning_rate": 9.909295550847458e-06, "loss": 1.5952, "mean_token_accuracy": 0.6660092622041702, "num_tokens": 559227.0, "step": 686 }, { "epoch": 0.18220338983050846, "grad_norm": 2.0276718139648438, "learning_rate": 9.909030720338983e-06, "loss": 1.0681, "mean_token_accuracy": 0.7440655007958412, "num_tokens": 560605.0, "step": 688 }, { "epoch": 0.18273305084745764, "grad_norm": 1.5560239553451538, "learning_rate": 9.90876588983051e-06, "loss": 1.6534, "mean_token_accuracy": 0.6576598063111305, "num_tokens": 562474.0, "step": 690 }, { "epoch": 0.1832627118644068, "grad_norm": 1.9064395427703857, "learning_rate": 9.908501059322035e-06, "loss": 1.6865, "mean_token_accuracy": 0.6079551056027412, "num_tokens": 564237.0, "step": 692 }, { "epoch": 0.18379237288135594, "grad_norm": 1.8480714559555054, "learning_rate": 9.90823622881356e-06, "loss": 1.4833, "mean_token_accuracy": 0.6792490519583225, "num_tokens": 565796.0, "step": 694 }, { "epoch": 0.1843220338983051, "grad_norm": 1.5126444101333618, "learning_rate": 9.907971398305085e-06, "loss": 1.1436, "mean_token_accuracy": 0.7322905771434307, "num_tokens": 567496.0, "step": 696 }, { "epoch": 0.18485169491525424, "grad_norm": 1.5506970882415771, "learning_rate": 9.907706567796611e-06, "loss": 1.1717, "mean_token_accuracy": 0.7439185716211796, "num_tokens": 569095.0, "step": 698 }, { "epoch": 0.1853813559322034, "grad_norm": 1.9071749448776245, "learning_rate": 9.907441737288136e-06, "loss": 1.453, "mean_token_accuracy": 0.6874568089842796, "num_tokens": 570439.0, "step": 700 }, { "epoch": 0.18591101694915255, "grad_norm": 1.5063233375549316, "learning_rate": 9.907176906779661e-06, "loss": 1.2225, "mean_token_accuracy": 0.7340446561574936, "num_tokens": 572235.0, "step": 702 }, { "epoch": 0.1864406779661017, "grad_norm": 1.5849815607070923, "learning_rate": 9.906912076271186e-06, "loss": 1.3985, "mean_token_accuracy": 0.6863158419728279, "num_tokens": 574174.0, "step": 704 }, { "epoch": 0.18697033898305085, "grad_norm": 1.5781581401824951, "learning_rate": 9.906647245762713e-06, "loss": 1.435, "mean_token_accuracy": 0.6915807798504829, "num_tokens": 575673.0, "step": 706 }, { "epoch": 0.1875, "grad_norm": 1.686885118484497, "learning_rate": 9.906382415254238e-06, "loss": 1.7979, "mean_token_accuracy": 0.625814463943243, "num_tokens": 577312.0, "step": 708 }, { "epoch": 0.18802966101694915, "grad_norm": 1.7670502662658691, "learning_rate": 9.906117584745764e-06, "loss": 1.5625, "mean_token_accuracy": 0.6432202085852623, "num_tokens": 578909.0, "step": 710 }, { "epoch": 0.1885593220338983, "grad_norm": 1.566864252090454, "learning_rate": 9.905852754237288e-06, "loss": 1.5184, "mean_token_accuracy": 0.6637701541185379, "num_tokens": 580780.0, "step": 712 }, { "epoch": 0.18908898305084745, "grad_norm": 1.510916829109192, "learning_rate": 9.905587923728814e-06, "loss": 1.5175, "mean_token_accuracy": 0.6721879169344902, "num_tokens": 582420.0, "step": 714 }, { "epoch": 0.1896186440677966, "grad_norm": 1.6208821535110474, "learning_rate": 9.90532309322034e-06, "loss": 1.669, "mean_token_accuracy": 0.6271258033812046, "num_tokens": 584200.0, "step": 716 }, { "epoch": 0.19014830508474576, "grad_norm": 1.4100314378738403, "learning_rate": 9.905058262711866e-06, "loss": 1.1588, "mean_token_accuracy": 0.7327063009142876, "num_tokens": 585845.0, "step": 718 }, { "epoch": 0.1906779661016949, "grad_norm": 1.7890677452087402, "learning_rate": 9.90479343220339e-06, "loss": 1.5157, "mean_token_accuracy": 0.6764539703726768, "num_tokens": 587253.0, "step": 720 }, { "epoch": 0.19120762711864406, "grad_norm": 1.6761451959609985, "learning_rate": 9.904528601694916e-06, "loss": 1.5736, "mean_token_accuracy": 0.6361852772533894, "num_tokens": 589744.0, "step": 722 }, { "epoch": 0.1917372881355932, "grad_norm": 1.4527208805084229, "learning_rate": 9.904263771186442e-06, "loss": 1.2379, "mean_token_accuracy": 0.7172387093305588, "num_tokens": 591366.0, "step": 724 }, { "epoch": 0.19226694915254236, "grad_norm": 1.5540848970413208, "learning_rate": 9.903998940677967e-06, "loss": 1.1632, "mean_token_accuracy": 0.7226495146751404, "num_tokens": 592799.0, "step": 726 }, { "epoch": 0.19279661016949154, "grad_norm": 1.670442819595337, "learning_rate": 9.903734110169492e-06, "loss": 1.0162, "mean_token_accuracy": 0.7573948875069618, "num_tokens": 594372.0, "step": 728 }, { "epoch": 0.1933262711864407, "grad_norm": 1.2652817964553833, "learning_rate": 9.903469279661017e-06, "loss": 1.2827, "mean_token_accuracy": 0.7027871906757355, "num_tokens": 596147.0, "step": 730 }, { "epoch": 0.19385593220338984, "grad_norm": 1.6668962240219116, "learning_rate": 9.903204449152544e-06, "loss": 1.7409, "mean_token_accuracy": 0.6316470950841904, "num_tokens": 597727.0, "step": 732 }, { "epoch": 0.194385593220339, "grad_norm": 1.5527966022491455, "learning_rate": 9.902939618644069e-06, "loss": 0.8351, "mean_token_accuracy": 0.7822057828307152, "num_tokens": 599584.0, "step": 734 }, { "epoch": 0.19491525423728814, "grad_norm": 1.546166181564331, "learning_rate": 9.902674788135595e-06, "loss": 1.2532, "mean_token_accuracy": 0.703591838479042, "num_tokens": 601274.0, "step": 736 }, { "epoch": 0.1954449152542373, "grad_norm": 2.4726860523223877, "learning_rate": 9.902409957627118e-06, "loss": 1.842, "mean_token_accuracy": 0.5978826954960823, "num_tokens": 603521.0, "step": 738 }, { "epoch": 0.19597457627118645, "grad_norm": 2.5067038536071777, "learning_rate": 9.902145127118645e-06, "loss": 1.049, "mean_token_accuracy": 0.7238490805029869, "num_tokens": 605013.0, "step": 740 }, { "epoch": 0.1965042372881356, "grad_norm": 1.5234863758087158, "learning_rate": 9.90188029661017e-06, "loss": 0.8312, "mean_token_accuracy": 0.7964414954185486, "num_tokens": 606592.0, "step": 742 }, { "epoch": 0.19703389830508475, "grad_norm": 2.1281423568725586, "learning_rate": 9.901615466101697e-06, "loss": 1.683, "mean_token_accuracy": 0.6453238800168037, "num_tokens": 608144.0, "step": 744 }, { "epoch": 0.1975635593220339, "grad_norm": 1.6550722122192383, "learning_rate": 9.901350635593222e-06, "loss": 1.2779, "mean_token_accuracy": 0.7138424478471279, "num_tokens": 610092.0, "step": 746 }, { "epoch": 0.19809322033898305, "grad_norm": 1.9432944059371948, "learning_rate": 9.901085805084747e-06, "loss": 1.9289, "mean_token_accuracy": 0.5986232869327068, "num_tokens": 611988.0, "step": 748 }, { "epoch": 0.1986228813559322, "grad_norm": 1.9672948122024536, "learning_rate": 9.900820974576271e-06, "loss": 1.3305, "step": 750 }, { "epoch": 0.1986228813559322, "eval_loss": 1.3764935731887817, "eval_mean_token_accuracy": 0.6869819544546016, "eval_num_tokens": 613448.0, "eval_runtime": 48.3014, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 750 }, { "epoch": 0.19915254237288135, "grad_norm": 1.67292058467865, "learning_rate": 9.900556144067798e-06, "loss": 1.3302, "mean_token_accuracy": 0.6987408623099327, "num_tokens": 615036.0, "step": 752 }, { "epoch": 0.1996822033898305, "grad_norm": 1.6544362306594849, "learning_rate": 9.900291313559323e-06, "loss": 1.4695, "mean_token_accuracy": 0.6656528860330582, "num_tokens": 616882.0, "step": 754 }, { "epoch": 0.20021186440677965, "grad_norm": 1.816240668296814, "learning_rate": 9.900026483050848e-06, "loss": 1.5203, "mean_token_accuracy": 0.6938999071717262, "num_tokens": 618243.0, "step": 756 }, { "epoch": 0.2007415254237288, "grad_norm": 1.7089686393737793, "learning_rate": 9.899761652542373e-06, "loss": 1.5997, "mean_token_accuracy": 0.6461131870746613, "num_tokens": 619984.0, "step": 758 }, { "epoch": 0.20127118644067796, "grad_norm": 2.9222395420074463, "learning_rate": 9.8994968220339e-06, "loss": 1.6646, "mean_token_accuracy": 0.6899576112627983, "num_tokens": 621202.0, "step": 760 }, { "epoch": 0.2018008474576271, "grad_norm": 1.566623330116272, "learning_rate": 9.899231991525424e-06, "loss": 1.5367, "mean_token_accuracy": 0.6846042834222317, "num_tokens": 622717.0, "step": 762 }, { "epoch": 0.20233050847457626, "grad_norm": 1.3215099573135376, "learning_rate": 9.898967161016951e-06, "loss": 1.4196, "mean_token_accuracy": 0.6695416495203972, "num_tokens": 624573.0, "step": 764 }, { "epoch": 0.2028601694915254, "grad_norm": 1.878130316734314, "learning_rate": 9.898702330508474e-06, "loss": 1.3362, "mean_token_accuracy": 0.676585279405117, "num_tokens": 626291.0, "step": 766 }, { "epoch": 0.2033898305084746, "grad_norm": 1.7538425922393799, "learning_rate": 9.898437500000001e-06, "loss": 1.2571, "mean_token_accuracy": 0.7186030186712742, "num_tokens": 627870.0, "step": 768 }, { "epoch": 0.20391949152542374, "grad_norm": 1.7217211723327637, "learning_rate": 9.898172669491526e-06, "loss": 1.1665, "mean_token_accuracy": 0.7540055401623249, "num_tokens": 629234.0, "step": 770 }, { "epoch": 0.2044491525423729, "grad_norm": 1.942010760307312, "learning_rate": 9.897907838983052e-06, "loss": 1.3084, "mean_token_accuracy": 0.6719639897346497, "num_tokens": 630695.0, "step": 772 }, { "epoch": 0.20497881355932204, "grad_norm": 1.5221027135849, "learning_rate": 9.897643008474577e-06, "loss": 1.1076, "mean_token_accuracy": 0.7385289296507835, "num_tokens": 632142.0, "step": 774 }, { "epoch": 0.2055084745762712, "grad_norm": 1.2886099815368652, "learning_rate": 9.897378177966102e-06, "loss": 1.0747, "mean_token_accuracy": 0.7373541370034218, "num_tokens": 633552.0, "step": 776 }, { "epoch": 0.20603813559322035, "grad_norm": 1.8021152019500732, "learning_rate": 9.897113347457627e-06, "loss": 1.3992, "mean_token_accuracy": 0.6819315850734711, "num_tokens": 635162.0, "step": 778 }, { "epoch": 0.2065677966101695, "grad_norm": 2.327587604522705, "learning_rate": 9.896848516949154e-06, "loss": 1.3769, "mean_token_accuracy": 0.6759792640805244, "num_tokens": 636878.0, "step": 780 }, { "epoch": 0.20709745762711865, "grad_norm": 1.6020047664642334, "learning_rate": 9.896583686440679e-06, "loss": 1.4378, "mean_token_accuracy": 0.6793088689446449, "num_tokens": 638333.0, "step": 782 }, { "epoch": 0.2076271186440678, "grad_norm": 2.0522637367248535, "learning_rate": 9.896318855932204e-06, "loss": 1.6867, "mean_token_accuracy": 0.6206050626933575, "num_tokens": 640000.0, "step": 784 }, { "epoch": 0.20815677966101695, "grad_norm": 2.053615093231201, "learning_rate": 9.896054025423729e-06, "loss": 1.5023, "mean_token_accuracy": 0.6514757052063942, "num_tokens": 641625.0, "step": 786 }, { "epoch": 0.2086864406779661, "grad_norm": 1.9899702072143555, "learning_rate": 9.895789194915255e-06, "loss": 1.8809, "mean_token_accuracy": 0.6127825453877449, "num_tokens": 643166.0, "step": 788 }, { "epoch": 0.20921610169491525, "grad_norm": 1.9851408004760742, "learning_rate": 9.89552436440678e-06, "loss": 1.473, "mean_token_accuracy": 0.6846022494137287, "num_tokens": 644639.0, "step": 790 }, { "epoch": 0.2097457627118644, "grad_norm": 1.8240432739257812, "learning_rate": 9.895259533898305e-06, "loss": 1.3266, "mean_token_accuracy": 0.6963711529970169, "num_tokens": 646079.0, "step": 792 }, { "epoch": 0.21027542372881355, "grad_norm": 1.5919053554534912, "learning_rate": 9.89499470338983e-06, "loss": 1.4093, "mean_token_accuracy": 0.6931180506944656, "num_tokens": 647734.0, "step": 794 }, { "epoch": 0.2108050847457627, "grad_norm": 1.7393685579299927, "learning_rate": 9.894729872881357e-06, "loss": 1.8492, "mean_token_accuracy": 0.608059961348772, "num_tokens": 649732.0, "step": 796 }, { "epoch": 0.21133474576271186, "grad_norm": 1.5470993518829346, "learning_rate": 9.894465042372882e-06, "loss": 1.4964, "mean_token_accuracy": 0.650240883231163, "num_tokens": 651403.0, "step": 798 }, { "epoch": 0.211864406779661, "grad_norm": 1.6468613147735596, "learning_rate": 9.894200211864408e-06, "loss": 1.3177, "mean_token_accuracy": 0.7019313052296638, "num_tokens": 653025.0, "step": 800 }, { "epoch": 0.21239406779661016, "grad_norm": 1.3837547302246094, "learning_rate": 9.893935381355933e-06, "loss": 1.2216, "mean_token_accuracy": 0.7490173727273941, "num_tokens": 654852.0, "step": 802 }, { "epoch": 0.2129237288135593, "grad_norm": 1.8165363073349, "learning_rate": 9.893670550847458e-06, "loss": 1.6881, "mean_token_accuracy": 0.6443322971463203, "num_tokens": 656485.0, "step": 804 }, { "epoch": 0.21345338983050846, "grad_norm": 1.441816806793213, "learning_rate": 9.893405720338985e-06, "loss": 1.0644, "mean_token_accuracy": 0.7395388931035995, "num_tokens": 658058.0, "step": 806 }, { "epoch": 0.21398305084745764, "grad_norm": 1.6078441143035889, "learning_rate": 9.89314088983051e-06, "loss": 1.7347, "mean_token_accuracy": 0.6145856343209743, "num_tokens": 659619.0, "step": 808 }, { "epoch": 0.2145127118644068, "grad_norm": 1.5485670566558838, "learning_rate": 9.892876059322035e-06, "loss": 1.4545, "mean_token_accuracy": 0.6602029949426651, "num_tokens": 661007.0, "step": 810 }, { "epoch": 0.21504237288135594, "grad_norm": 2.2125513553619385, "learning_rate": 9.89261122881356e-06, "loss": 1.594, "mean_token_accuracy": 0.6529589742422104, "num_tokens": 662518.0, "step": 812 }, { "epoch": 0.2155720338983051, "grad_norm": 1.7385833263397217, "learning_rate": 9.892346398305086e-06, "loss": 1.4622, "mean_token_accuracy": 0.6689644083380699, "num_tokens": 664098.0, "step": 814 }, { "epoch": 0.21610169491525424, "grad_norm": 1.9781430959701538, "learning_rate": 9.892081567796611e-06, "loss": 1.414, "mean_token_accuracy": 0.6920876279473305, "num_tokens": 665412.0, "step": 816 }, { "epoch": 0.2166313559322034, "grad_norm": 1.586085319519043, "learning_rate": 9.891816737288138e-06, "loss": 1.3183, "mean_token_accuracy": 0.6971094273030758, "num_tokens": 667134.0, "step": 818 }, { "epoch": 0.21716101694915255, "grad_norm": 2.593465566635132, "learning_rate": 9.891551906779661e-06, "loss": 1.8869, "mean_token_accuracy": 0.5817256383597851, "num_tokens": 668959.0, "step": 820 }, { "epoch": 0.2176906779661017, "grad_norm": 1.776039719581604, "learning_rate": 9.891287076271188e-06, "loss": 1.1603, "mean_token_accuracy": 0.7356222346425056, "num_tokens": 670351.0, "step": 822 }, { "epoch": 0.21822033898305085, "grad_norm": 1.316954493522644, "learning_rate": 9.891022245762712e-06, "loss": 1.1347, "mean_token_accuracy": 0.7337861992418766, "num_tokens": 671902.0, "step": 824 }, { "epoch": 0.21875, "grad_norm": 1.666424036026001, "learning_rate": 9.890757415254239e-06, "loss": 1.4024, "mean_token_accuracy": 0.661447960883379, "num_tokens": 673637.0, "step": 826 }, { "epoch": 0.21927966101694915, "grad_norm": 1.6114070415496826, "learning_rate": 9.890492584745764e-06, "loss": 1.6244, "mean_token_accuracy": 0.6376644745469093, "num_tokens": 675427.0, "step": 828 }, { "epoch": 0.2198093220338983, "grad_norm": 1.533165454864502, "learning_rate": 9.890227754237289e-06, "loss": 1.1141, "mean_token_accuracy": 0.7346767783164978, "num_tokens": 677047.0, "step": 830 }, { "epoch": 0.22033898305084745, "grad_norm": 1.6453176736831665, "learning_rate": 9.889962923728814e-06, "loss": 1.8128, "mean_token_accuracy": 0.6008205339312553, "num_tokens": 679486.0, "step": 832 }, { "epoch": 0.2208686440677966, "grad_norm": 1.96952223777771, "learning_rate": 9.88969809322034e-06, "loss": 1.4666, "mean_token_accuracy": 0.6719990745186806, "num_tokens": 681105.0, "step": 834 }, { "epoch": 0.22139830508474576, "grad_norm": 1.8385611772537231, "learning_rate": 9.889433262711865e-06, "loss": 1.2095, "mean_token_accuracy": 0.7362895607948303, "num_tokens": 682536.0, "step": 836 }, { "epoch": 0.2219279661016949, "grad_norm": 1.6007664203643799, "learning_rate": 9.88916843220339e-06, "loss": 1.724, "mean_token_accuracy": 0.6339913345873356, "num_tokens": 684164.0, "step": 838 }, { "epoch": 0.22245762711864406, "grad_norm": 1.8647794723510742, "learning_rate": 9.888903601694915e-06, "loss": 1.3766, "mean_token_accuracy": 0.6854364275932312, "num_tokens": 685811.0, "step": 840 }, { "epoch": 0.2229872881355932, "grad_norm": 1.4930920600891113, "learning_rate": 9.888638771186442e-06, "loss": 1.3568, "mean_token_accuracy": 0.6933001577854156, "num_tokens": 687646.0, "step": 842 }, { "epoch": 0.22351694915254236, "grad_norm": 1.5800026655197144, "learning_rate": 9.888373940677967e-06, "loss": 1.2048, "mean_token_accuracy": 0.7328377664089203, "num_tokens": 689183.0, "step": 844 }, { "epoch": 0.22404661016949154, "grad_norm": 1.7218165397644043, "learning_rate": 9.888109110169492e-06, "loss": 1.324, "mean_token_accuracy": 0.688602901995182, "num_tokens": 690998.0, "step": 846 }, { "epoch": 0.2245762711864407, "grad_norm": 1.571707010269165, "learning_rate": 9.887844279661017e-06, "loss": 1.0893, "mean_token_accuracy": 0.7312567308545113, "num_tokens": 692787.0, "step": 848 }, { "epoch": 0.22510593220338984, "grad_norm": 1.5324918031692505, "learning_rate": 9.887579449152543e-06, "loss": 1.7163, "mean_token_accuracy": 0.6094905883073807, "num_tokens": 694537.0, "step": 850 }, { "epoch": 0.225635593220339, "grad_norm": 1.6797510385513306, "learning_rate": 9.887314618644068e-06, "loss": 1.2942, "mean_token_accuracy": 0.6817330047488213, "num_tokens": 696225.0, "step": 852 }, { "epoch": 0.22616525423728814, "grad_norm": 1.5455447435379028, "learning_rate": 9.887049788135595e-06, "loss": 0.7944, "mean_token_accuracy": 0.7742331326007843, "num_tokens": 697737.0, "step": 854 }, { "epoch": 0.2266949152542373, "grad_norm": 1.8442729711532593, "learning_rate": 9.88678495762712e-06, "loss": 1.5358, "mean_token_accuracy": 0.6309470646083355, "num_tokens": 699479.0, "step": 856 }, { "epoch": 0.22722457627118645, "grad_norm": 1.7454932928085327, "learning_rate": 9.886520127118645e-06, "loss": 1.1927, "mean_token_accuracy": 0.7433787733316422, "num_tokens": 700925.0, "step": 858 }, { "epoch": 0.2277542372881356, "grad_norm": 1.482830286026001, "learning_rate": 9.88625529661017e-06, "loss": 1.1506, "mean_token_accuracy": 0.7201443314552307, "num_tokens": 702849.0, "step": 860 }, { "epoch": 0.22828389830508475, "grad_norm": 1.8712430000305176, "learning_rate": 9.885990466101696e-06, "loss": 1.6697, "mean_token_accuracy": 0.6278537064790726, "num_tokens": 704564.0, "step": 862 }, { "epoch": 0.2288135593220339, "grad_norm": 1.452398657798767, "learning_rate": 9.885725635593221e-06, "loss": 1.0437, "mean_token_accuracy": 0.7666813135147095, "num_tokens": 706428.0, "step": 864 }, { "epoch": 0.22934322033898305, "grad_norm": 1.7786182165145874, "learning_rate": 9.885460805084746e-06, "loss": 1.808, "mean_token_accuracy": 0.6123015992343426, "num_tokens": 708009.0, "step": 866 }, { "epoch": 0.2298728813559322, "grad_norm": 1.741838812828064, "learning_rate": 9.885195974576271e-06, "loss": 1.5572, "mean_token_accuracy": 0.6583544425666332, "num_tokens": 709698.0, "step": 868 }, { "epoch": 0.23040254237288135, "grad_norm": 1.9558377265930176, "learning_rate": 9.884931144067798e-06, "loss": 1.4883, "mean_token_accuracy": 0.66124027967453, "num_tokens": 711023.0, "step": 870 }, { "epoch": 0.2309322033898305, "grad_norm": 1.3367283344268799, "learning_rate": 9.884666313559323e-06, "loss": 1.084, "mean_token_accuracy": 0.7244335934519768, "num_tokens": 712788.0, "step": 872 }, { "epoch": 0.23146186440677965, "grad_norm": 1.8574358224868774, "learning_rate": 9.884401483050848e-06, "loss": 1.8543, "mean_token_accuracy": 0.6091218516230583, "num_tokens": 714736.0, "step": 874 }, { "epoch": 0.2319915254237288, "grad_norm": 1.5605871677398682, "learning_rate": 9.884136652542372e-06, "loss": 1.6965, "mean_token_accuracy": 0.646085936576128, "num_tokens": 716564.0, "step": 876 }, { "epoch": 0.23252118644067796, "grad_norm": 1.6599310636520386, "learning_rate": 9.883871822033899e-06, "loss": 1.4363, "mean_token_accuracy": 0.6618439145386219, "num_tokens": 717924.0, "step": 878 }, { "epoch": 0.2330508474576271, "grad_norm": 1.99509859085083, "learning_rate": 9.883606991525424e-06, "loss": 1.2951, "mean_token_accuracy": 0.7020512111485004, "num_tokens": 719592.0, "step": 880 }, { "epoch": 0.23358050847457626, "grad_norm": 1.6877232789993286, "learning_rate": 9.88334216101695e-06, "loss": 1.6888, "mean_token_accuracy": 0.6274920105934143, "num_tokens": 721150.0, "step": 882 }, { "epoch": 0.2341101694915254, "grad_norm": 1.4110190868377686, "learning_rate": 9.883077330508476e-06, "loss": 1.6508, "mean_token_accuracy": 0.6056394129991531, "num_tokens": 722673.0, "step": 884 }, { "epoch": 0.2346398305084746, "grad_norm": 2.230031967163086, "learning_rate": 9.8828125e-06, "loss": 1.1566, "mean_token_accuracy": 0.7280313149094582, "num_tokens": 724040.0, "step": 886 }, { "epoch": 0.23516949152542374, "grad_norm": 1.6916923522949219, "learning_rate": 9.882547669491527e-06, "loss": 1.2483, "mean_token_accuracy": 0.6892464645206928, "num_tokens": 726329.0, "step": 888 }, { "epoch": 0.2356991525423729, "grad_norm": 2.096622943878174, "learning_rate": 9.882282838983052e-06, "loss": 1.7348, "mean_token_accuracy": 0.6377063244581223, "num_tokens": 727866.0, "step": 890 }, { "epoch": 0.23622881355932204, "grad_norm": 1.7750911712646484, "learning_rate": 9.882018008474577e-06, "loss": 1.247, "mean_token_accuracy": 0.7035304307937622, "num_tokens": 729227.0, "step": 892 }, { "epoch": 0.2367584745762712, "grad_norm": 1.8865658044815063, "learning_rate": 9.881753177966102e-06, "loss": 1.1878, "mean_token_accuracy": 0.7144695520401001, "num_tokens": 730696.0, "step": 894 }, { "epoch": 0.23728813559322035, "grad_norm": 1.6451306343078613, "learning_rate": 9.881488347457629e-06, "loss": 1.6296, "mean_token_accuracy": 0.6138842441141605, "num_tokens": 732350.0, "step": 896 }, { "epoch": 0.2378177966101695, "grad_norm": 2.187988042831421, "learning_rate": 9.881223516949153e-06, "loss": 1.9474, "mean_token_accuracy": 0.6156736761331558, "num_tokens": 733861.0, "step": 898 }, { "epoch": 0.23834745762711865, "grad_norm": 1.6960551738739014, "learning_rate": 9.880958686440678e-06, "loss": 1.1295, "mean_token_accuracy": 0.7365912720561028, "num_tokens": 735289.0, "step": 900 }, { "epoch": 0.2388771186440678, "grad_norm": 1.2837649583816528, "learning_rate": 9.880693855932203e-06, "loss": 0.8974, "mean_token_accuracy": 0.7695788219571114, "num_tokens": 736970.0, "step": 902 }, { "epoch": 0.23940677966101695, "grad_norm": 1.6076020002365112, "learning_rate": 9.88042902542373e-06, "loss": 1.5206, "mean_token_accuracy": 0.6642968878149986, "num_tokens": 738631.0, "step": 904 }, { "epoch": 0.2399364406779661, "grad_norm": 1.6444157361984253, "learning_rate": 9.880164194915255e-06, "loss": 1.4631, "mean_token_accuracy": 0.6602422222495079, "num_tokens": 740522.0, "step": 906 }, { "epoch": 0.24046610169491525, "grad_norm": 1.6546685695648193, "learning_rate": 9.879899364406782e-06, "loss": 1.2791, "mean_token_accuracy": 0.7295465245842934, "num_tokens": 742015.0, "step": 908 }, { "epoch": 0.2409957627118644, "grad_norm": 1.3703263998031616, "learning_rate": 9.879634533898306e-06, "loss": 1.1016, "mean_token_accuracy": 0.7556537464261055, "num_tokens": 743585.0, "step": 910 }, { "epoch": 0.24152542372881355, "grad_norm": 1.8638229370117188, "learning_rate": 9.879369703389831e-06, "loss": 1.5195, "mean_token_accuracy": 0.6825337782502174, "num_tokens": 744917.0, "step": 912 }, { "epoch": 0.2420550847457627, "grad_norm": 1.5646971464157104, "learning_rate": 9.879104872881356e-06, "loss": 1.2497, "mean_token_accuracy": 0.7153870314359665, "num_tokens": 746595.0, "step": 914 }, { "epoch": 0.24258474576271186, "grad_norm": 1.9500586986541748, "learning_rate": 9.878840042372883e-06, "loss": 1.2401, "mean_token_accuracy": 0.726530060172081, "num_tokens": 748019.0, "step": 916 }, { "epoch": 0.243114406779661, "grad_norm": 2.1237471103668213, "learning_rate": 9.878575211864408e-06, "loss": 1.4388, "mean_token_accuracy": 0.6843199878931046, "num_tokens": 749312.0, "step": 918 }, { "epoch": 0.24364406779661016, "grad_norm": 1.4335144758224487, "learning_rate": 9.878310381355933e-06, "loss": 0.908, "mean_token_accuracy": 0.7851855307817459, "num_tokens": 750802.0, "step": 920 }, { "epoch": 0.2441737288135593, "grad_norm": 1.8187919855117798, "learning_rate": 9.878045550847458e-06, "loss": 1.385, "mean_token_accuracy": 0.6895303130149841, "num_tokens": 752216.0, "step": 922 }, { "epoch": 0.24470338983050846, "grad_norm": 1.9854286909103394, "learning_rate": 9.877780720338984e-06, "loss": 1.2638, "mean_token_accuracy": 0.718742735683918, "num_tokens": 753729.0, "step": 924 }, { "epoch": 0.24523305084745764, "grad_norm": 1.7246047258377075, "learning_rate": 9.87751588983051e-06, "loss": 1.6978, "mean_token_accuracy": 0.6461331844329834, "num_tokens": 755430.0, "step": 926 }, { "epoch": 0.2457627118644068, "grad_norm": 1.4070392847061157, "learning_rate": 9.877251059322034e-06, "loss": 1.1509, "mean_token_accuracy": 0.7433924078941345, "num_tokens": 756982.0, "step": 928 }, { "epoch": 0.24629237288135594, "grad_norm": 1.951919436454773, "learning_rate": 9.876986228813559e-06, "loss": 1.6013, "mean_token_accuracy": 0.6647916324436665, "num_tokens": 758318.0, "step": 930 }, { "epoch": 0.2468220338983051, "grad_norm": 2.0848686695098877, "learning_rate": 9.876721398305086e-06, "loss": 1.7586, "mean_token_accuracy": 0.6105613596737385, "num_tokens": 759832.0, "step": 932 }, { "epoch": 0.24735169491525424, "grad_norm": 1.5069047212600708, "learning_rate": 9.87645656779661e-06, "loss": 1.3063, "mean_token_accuracy": 0.689956970512867, "num_tokens": 761473.0, "step": 934 }, { "epoch": 0.2478813559322034, "grad_norm": 1.715346336364746, "learning_rate": 9.876191737288137e-06, "loss": 1.2125, "mean_token_accuracy": 0.7332047894597054, "num_tokens": 763298.0, "step": 936 }, { "epoch": 0.24841101694915255, "grad_norm": 1.7571088075637817, "learning_rate": 9.875926906779662e-06, "loss": 1.5652, "mean_token_accuracy": 0.6576407365500927, "num_tokens": 764937.0, "step": 938 }, { "epoch": 0.2489406779661017, "grad_norm": 1.631527066230774, "learning_rate": 9.875662076271187e-06, "loss": 1.5439, "mean_token_accuracy": 0.6645934619009495, "num_tokens": 766395.0, "step": 940 }, { "epoch": 0.24947033898305085, "grad_norm": 1.453840970993042, "learning_rate": 9.875397245762712e-06, "loss": 0.9008, "mean_token_accuracy": 0.7897345796227455, "num_tokens": 767864.0, "step": 942 }, { "epoch": 0.25, "grad_norm": 1.8358123302459717, "learning_rate": 9.875132415254239e-06, "loss": 1.9708, "mean_token_accuracy": 0.5993053615093231, "num_tokens": 769267.0, "step": 944 }, { "epoch": 0.2505296610169492, "grad_norm": 1.9180582761764526, "learning_rate": 9.874867584745764e-06, "loss": 1.2993, "mean_token_accuracy": 0.710658747702837, "num_tokens": 770841.0, "step": 946 }, { "epoch": 0.2510593220338983, "grad_norm": 1.9476042985916138, "learning_rate": 9.874602754237289e-06, "loss": 1.5845, "mean_token_accuracy": 0.674192413687706, "num_tokens": 772499.0, "step": 948 }, { "epoch": 0.2515889830508475, "grad_norm": 1.7461069822311401, "learning_rate": 9.874337923728813e-06, "loss": 1.45, "mean_token_accuracy": 0.6734370738267899, "num_tokens": 774172.0, "step": 950 }, { "epoch": 0.2521186440677966, "grad_norm": 1.7804850339889526, "learning_rate": 9.87407309322034e-06, "loss": 1.4757, "mean_token_accuracy": 0.664439857006073, "num_tokens": 775744.0, "step": 952 }, { "epoch": 0.2526483050847458, "grad_norm": 1.4479492902755737, "learning_rate": 9.873808262711865e-06, "loss": 1.1237, "mean_token_accuracy": 0.7434381544589996, "num_tokens": 777350.0, "step": 954 }, { "epoch": 0.2531779661016949, "grad_norm": 1.6421482563018799, "learning_rate": 9.87354343220339e-06, "loss": 1.6404, "mean_token_accuracy": 0.658412791788578, "num_tokens": 779045.0, "step": 956 }, { "epoch": 0.2537076271186441, "grad_norm": 2.470076084136963, "learning_rate": 9.873278601694915e-06, "loss": 1.5901, "mean_token_accuracy": 0.6618145033717155, "num_tokens": 780475.0, "step": 958 }, { "epoch": 0.2542372881355932, "grad_norm": 1.3713234663009644, "learning_rate": 9.873013771186442e-06, "loss": 0.7639, "mean_token_accuracy": 0.8192535787820816, "num_tokens": 781878.0, "step": 960 }, { "epoch": 0.2547669491525424, "grad_norm": 1.6466646194458008, "learning_rate": 9.872748940677966e-06, "loss": 1.4536, "mean_token_accuracy": 0.6986314430832863, "num_tokens": 783236.0, "step": 962 }, { "epoch": 0.2552966101694915, "grad_norm": 1.638144850730896, "learning_rate": 9.872484110169493e-06, "loss": 1.1962, "mean_token_accuracy": 0.7052483931183815, "num_tokens": 784945.0, "step": 964 }, { "epoch": 0.2558262711864407, "grad_norm": 2.0043282508850098, "learning_rate": 9.872219279661018e-06, "loss": 1.5115, "mean_token_accuracy": 0.6559064164757729, "num_tokens": 786485.0, "step": 966 }, { "epoch": 0.2563559322033898, "grad_norm": 1.9453219175338745, "learning_rate": 9.871954449152543e-06, "loss": 1.292, "mean_token_accuracy": 0.707102507352829, "num_tokens": 788127.0, "step": 968 }, { "epoch": 0.256885593220339, "grad_norm": 1.6720777750015259, "learning_rate": 9.871689618644068e-06, "loss": 1.3746, "mean_token_accuracy": 0.7034286521375179, "num_tokens": 789465.0, "step": 970 }, { "epoch": 0.2574152542372881, "grad_norm": 1.8307738304138184, "learning_rate": 9.871424788135594e-06, "loss": 1.1927, "mean_token_accuracy": 0.748045027256012, "num_tokens": 790945.0, "step": 972 }, { "epoch": 0.2579449152542373, "grad_norm": 2.219668388366699, "learning_rate": 9.87115995762712e-06, "loss": 1.3262, "mean_token_accuracy": 0.7000508457422256, "num_tokens": 792492.0, "step": 974 }, { "epoch": 0.2584745762711864, "grad_norm": 1.7805217504501343, "learning_rate": 9.870895127118644e-06, "loss": 1.2906, "mean_token_accuracy": 0.7118720263242722, "num_tokens": 793826.0, "step": 976 }, { "epoch": 0.2590042372881356, "grad_norm": 1.7917307615280151, "learning_rate": 9.870630296610171e-06, "loss": 1.8719, "mean_token_accuracy": 0.5969331040978432, "num_tokens": 795659.0, "step": 978 }, { "epoch": 0.2595338983050847, "grad_norm": 1.8204587697982788, "learning_rate": 9.870365466101696e-06, "loss": 0.7459, "mean_token_accuracy": 0.8033374026417732, "num_tokens": 797150.0, "step": 980 }, { "epoch": 0.2600635593220339, "grad_norm": 1.222103238105774, "learning_rate": 9.87010063559322e-06, "loss": 1.3142, "mean_token_accuracy": 0.7006101720035076, "num_tokens": 799653.0, "step": 982 }, { "epoch": 0.2605932203389831, "grad_norm": 1.7179161310195923, "learning_rate": 9.869835805084746e-06, "loss": 1.6765, "mean_token_accuracy": 0.6219578310847282, "num_tokens": 801172.0, "step": 984 }, { "epoch": 0.2611228813559322, "grad_norm": 1.6882188320159912, "learning_rate": 9.869570974576272e-06, "loss": 1.661, "mean_token_accuracy": 0.6302271634340286, "num_tokens": 803167.0, "step": 986 }, { "epoch": 0.2616525423728814, "grad_norm": 1.8238940238952637, "learning_rate": 9.869306144067797e-06, "loss": 1.0804, "mean_token_accuracy": 0.7264656499028206, "num_tokens": 804599.0, "step": 988 }, { "epoch": 0.2621822033898305, "grad_norm": 2.109239339828491, "learning_rate": 9.869041313559324e-06, "loss": 1.5479, "mean_token_accuracy": 0.6643270552158356, "num_tokens": 806201.0, "step": 990 }, { "epoch": 0.2627118644067797, "grad_norm": 1.8601099252700806, "learning_rate": 9.868776483050849e-06, "loss": 1.2781, "mean_token_accuracy": 0.7229215204715729, "num_tokens": 807618.0, "step": 992 }, { "epoch": 0.2632415254237288, "grad_norm": 1.493497371673584, "learning_rate": 9.868511652542374e-06, "loss": 0.9718, "mean_token_accuracy": 0.7580265253782272, "num_tokens": 809239.0, "step": 994 }, { "epoch": 0.263771186440678, "grad_norm": 1.3679717779159546, "learning_rate": 9.868246822033899e-06, "loss": 1.0487, "mean_token_accuracy": 0.7497910559177399, "num_tokens": 810984.0, "step": 996 }, { "epoch": 0.2643008474576271, "grad_norm": 1.6029260158538818, "learning_rate": 9.867981991525425e-06, "loss": 1.4459, "mean_token_accuracy": 0.687916710972786, "num_tokens": 812550.0, "step": 998 }, { "epoch": 0.2648305084745763, "grad_norm": 1.6561225652694702, "learning_rate": 9.86771716101695e-06, "loss": 1.5934, "step": 1000 }, { "epoch": 0.2648305084745763, "eval_loss": 1.3626646995544434, "eval_mean_token_accuracy": 0.6930809021963702, "eval_num_tokens": 814223.0, "eval_runtime": 48.3116, "eval_samples_per_second": 6.375, "eval_steps_per_second": 6.375, "step": 1000 }, { "epoch": 0.2653601694915254, "grad_norm": 1.918026089668274, "learning_rate": 9.867452330508475e-06, "loss": 1.6222, "mean_token_accuracy": 0.6558754052966833, "num_tokens": 815774.0, "step": 1002 }, { "epoch": 0.2658898305084746, "grad_norm": 2.125101089477539, "learning_rate": 9.8671875e-06, "loss": 1.4291, "mean_token_accuracy": 0.6843815669417381, "num_tokens": 817260.0, "step": 1004 }, { "epoch": 0.2664194915254237, "grad_norm": 2.0670318603515625, "learning_rate": 9.866922669491527e-06, "loss": 1.5027, "mean_token_accuracy": 0.6831289604306221, "num_tokens": 818509.0, "step": 1006 }, { "epoch": 0.2669491525423729, "grad_norm": 1.6808688640594482, "learning_rate": 9.866657838983052e-06, "loss": 1.2729, "mean_token_accuracy": 0.6865945309400558, "num_tokens": 819938.0, "step": 1008 }, { "epoch": 0.267478813559322, "grad_norm": 1.5677694082260132, "learning_rate": 9.866393008474577e-06, "loss": 1.6439, "mean_token_accuracy": 0.652267586439848, "num_tokens": 821561.0, "step": 1010 }, { "epoch": 0.2680084745762712, "grad_norm": 1.8035222291946411, "learning_rate": 9.866128177966102e-06, "loss": 1.4981, "mean_token_accuracy": 0.6711120270192623, "num_tokens": 822856.0, "step": 1012 }, { "epoch": 0.2685381355932203, "grad_norm": 1.7792528867721558, "learning_rate": 9.865863347457628e-06, "loss": 1.3401, "mean_token_accuracy": 0.6982848644256592, "num_tokens": 824433.0, "step": 1014 }, { "epoch": 0.2690677966101695, "grad_norm": 1.5496729612350464, "learning_rate": 9.865598516949153e-06, "loss": 1.2926, "mean_token_accuracy": 0.7049727663397789, "num_tokens": 826324.0, "step": 1016 }, { "epoch": 0.2695974576271186, "grad_norm": 1.5117305517196655, "learning_rate": 9.86533368644068e-06, "loss": 1.1845, "mean_token_accuracy": 0.7220450788736343, "num_tokens": 828083.0, "step": 1018 }, { "epoch": 0.2701271186440678, "grad_norm": 2.304896354675293, "learning_rate": 9.865068855932205e-06, "loss": 1.5842, "mean_token_accuracy": 0.6676731929183006, "num_tokens": 829575.0, "step": 1020 }, { "epoch": 0.2706567796610169, "grad_norm": 1.7282378673553467, "learning_rate": 9.86480402542373e-06, "loss": 1.261, "mean_token_accuracy": 0.7051106169819832, "num_tokens": 831071.0, "step": 1022 }, { "epoch": 0.2711864406779661, "grad_norm": 2.151214122772217, "learning_rate": 9.864539194915255e-06, "loss": 1.4886, "mean_token_accuracy": 0.6813904941082001, "num_tokens": 832579.0, "step": 1024 }, { "epoch": 0.2717161016949153, "grad_norm": 1.8755675554275513, "learning_rate": 9.864274364406781e-06, "loss": 1.7253, "mean_token_accuracy": 0.6416633874177933, "num_tokens": 834205.0, "step": 1026 }, { "epoch": 0.2722457627118644, "grad_norm": 1.5570834875106812, "learning_rate": 9.864009533898306e-06, "loss": 1.0832, "mean_token_accuracy": 0.7604283094406128, "num_tokens": 835655.0, "step": 1028 }, { "epoch": 0.2727754237288136, "grad_norm": 1.7016854286193848, "learning_rate": 9.863744703389831e-06, "loss": 1.7151, "mean_token_accuracy": 0.6311546862125397, "num_tokens": 837451.0, "step": 1030 }, { "epoch": 0.2733050847457627, "grad_norm": 1.6157912015914917, "learning_rate": 9.863479872881356e-06, "loss": 1.6072, "mean_token_accuracy": 0.6661215573549271, "num_tokens": 839085.0, "step": 1032 }, { "epoch": 0.2738347457627119, "grad_norm": 1.6119229793548584, "learning_rate": 9.863215042372883e-06, "loss": 1.3133, "mean_token_accuracy": 0.698796235024929, "num_tokens": 840725.0, "step": 1034 }, { "epoch": 0.274364406779661, "grad_norm": 1.9877371788024902, "learning_rate": 9.862950211864407e-06, "loss": 1.719, "mean_token_accuracy": 0.6158713586628437, "num_tokens": 842177.0, "step": 1036 }, { "epoch": 0.2748940677966102, "grad_norm": 1.9416371583938599, "learning_rate": 9.862685381355932e-06, "loss": 1.8178, "mean_token_accuracy": 0.6176969781517982, "num_tokens": 843776.0, "step": 1038 }, { "epoch": 0.2754237288135593, "grad_norm": 1.7687972784042358, "learning_rate": 9.862420550847457e-06, "loss": 1.4423, "mean_token_accuracy": 0.6831441223621368, "num_tokens": 845512.0, "step": 1040 }, { "epoch": 0.2759533898305085, "grad_norm": 2.3040425777435303, "learning_rate": 9.862155720338984e-06, "loss": 1.7773, "mean_token_accuracy": 0.6653873175382614, "num_tokens": 846941.0, "step": 1042 }, { "epoch": 0.2764830508474576, "grad_norm": 1.5629597902297974, "learning_rate": 9.861890889830509e-06, "loss": 1.236, "mean_token_accuracy": 0.7115985527634621, "num_tokens": 848637.0, "step": 1044 }, { "epoch": 0.2770127118644068, "grad_norm": 2.040011167526245, "learning_rate": 9.861626059322036e-06, "loss": 1.7574, "mean_token_accuracy": 0.6005527824163437, "num_tokens": 850228.0, "step": 1046 }, { "epoch": 0.2775423728813559, "grad_norm": 1.598115086555481, "learning_rate": 9.86136122881356e-06, "loss": 1.2076, "mean_token_accuracy": 0.743260320276022, "num_tokens": 851791.0, "step": 1048 }, { "epoch": 0.2780720338983051, "grad_norm": 1.7736403942108154, "learning_rate": 9.861096398305085e-06, "loss": 1.3156, "mean_token_accuracy": 0.7120776250958443, "num_tokens": 853151.0, "step": 1050 }, { "epoch": 0.2786016949152542, "grad_norm": 1.57033371925354, "learning_rate": 9.86083156779661e-06, "loss": 1.0519, "mean_token_accuracy": 0.7404533848166466, "num_tokens": 854901.0, "step": 1052 }, { "epoch": 0.2791313559322034, "grad_norm": 1.7348589897155762, "learning_rate": 9.860566737288137e-06, "loss": 1.6174, "mean_token_accuracy": 0.6427013576030731, "num_tokens": 856528.0, "step": 1054 }, { "epoch": 0.2796610169491525, "grad_norm": 1.4861253499984741, "learning_rate": 9.860301906779662e-06, "loss": 1.5772, "mean_token_accuracy": 0.6460726261138916, "num_tokens": 858470.0, "step": 1056 }, { "epoch": 0.2801906779661017, "grad_norm": 2.1389541625976562, "learning_rate": 9.860037076271187e-06, "loss": 1.3292, "mean_token_accuracy": 0.697709709405899, "num_tokens": 860294.0, "step": 1058 }, { "epoch": 0.2807203389830508, "grad_norm": 1.529287576675415, "learning_rate": 9.859772245762713e-06, "loss": 1.8831, "mean_token_accuracy": 0.5975403115153313, "num_tokens": 862150.0, "step": 1060 }, { "epoch": 0.28125, "grad_norm": 1.7682559490203857, "learning_rate": 9.859507415254238e-06, "loss": 0.9156, "mean_token_accuracy": 0.7777522727847099, "num_tokens": 863705.0, "step": 1062 }, { "epoch": 0.2817796610169492, "grad_norm": 1.9458725452423096, "learning_rate": 9.859242584745763e-06, "loss": 1.3824, "mean_token_accuracy": 0.6982241049408913, "num_tokens": 865139.0, "step": 1064 }, { "epoch": 0.2823093220338983, "grad_norm": 1.9038639068603516, "learning_rate": 9.858977754237288e-06, "loss": 1.3013, "mean_token_accuracy": 0.7069234922528267, "num_tokens": 866898.0, "step": 1066 }, { "epoch": 0.2828389830508475, "grad_norm": 1.4200489521026611, "learning_rate": 9.858712923728815e-06, "loss": 1.3784, "mean_token_accuracy": 0.6864609941840172, "num_tokens": 868488.0, "step": 1068 }, { "epoch": 0.2833686440677966, "grad_norm": 1.2671781778335571, "learning_rate": 9.85844809322034e-06, "loss": 1.0729, "mean_token_accuracy": 0.7471098676323891, "num_tokens": 869821.0, "step": 1070 }, { "epoch": 0.2838983050847458, "grad_norm": 1.4353896379470825, "learning_rate": 9.858183262711866e-06, "loss": 1.8305, "mean_token_accuracy": 0.6046427376568317, "num_tokens": 871905.0, "step": 1072 }, { "epoch": 0.2844279661016949, "grad_norm": 2.1029610633850098, "learning_rate": 9.857918432203391e-06, "loss": 1.0166, "mean_token_accuracy": 0.7519918829202652, "num_tokens": 873126.0, "step": 1074 }, { "epoch": 0.2849576271186441, "grad_norm": 1.4235508441925049, "learning_rate": 9.857653601694916e-06, "loss": 1.8023, "mean_token_accuracy": 0.6317195035517216, "num_tokens": 874878.0, "step": 1076 }, { "epoch": 0.2854872881355932, "grad_norm": 1.7381980419158936, "learning_rate": 9.857388771186441e-06, "loss": 1.4403, "mean_token_accuracy": 0.6786594241857529, "num_tokens": 876616.0, "step": 1078 }, { "epoch": 0.2860169491525424, "grad_norm": 1.9798003435134888, "learning_rate": 9.857123940677968e-06, "loss": 1.5715, "mean_token_accuracy": 0.6431739926338196, "num_tokens": 878294.0, "step": 1080 }, { "epoch": 0.2865466101694915, "grad_norm": 1.627997875213623, "learning_rate": 9.856859110169493e-06, "loss": 1.2114, "mean_token_accuracy": 0.705564983189106, "num_tokens": 879748.0, "step": 1082 }, { "epoch": 0.2870762711864407, "grad_norm": 1.5913255214691162, "learning_rate": 9.856594279661018e-06, "loss": 1.2407, "mean_token_accuracy": 0.699594184756279, "num_tokens": 881740.0, "step": 1084 }, { "epoch": 0.2876059322033898, "grad_norm": 1.8063733577728271, "learning_rate": 9.856329449152543e-06, "loss": 0.9711, "mean_token_accuracy": 0.7482531070709229, "num_tokens": 883346.0, "step": 1086 }, { "epoch": 0.288135593220339, "grad_norm": 1.852990746498108, "learning_rate": 9.85606461864407e-06, "loss": 1.4297, "mean_token_accuracy": 0.6756062135100365, "num_tokens": 885009.0, "step": 1088 }, { "epoch": 0.2886652542372881, "grad_norm": 1.3330026865005493, "learning_rate": 9.855799788135594e-06, "loss": 0.8496, "mean_token_accuracy": 0.7783206179738045, "num_tokens": 886500.0, "step": 1090 }, { "epoch": 0.2891949152542373, "grad_norm": 1.7366153001785278, "learning_rate": 9.855534957627119e-06, "loss": 1.189, "mean_token_accuracy": 0.7315552607178688, "num_tokens": 888133.0, "step": 1092 }, { "epoch": 0.2897245762711864, "grad_norm": 1.5914069414138794, "learning_rate": 9.855270127118644e-06, "loss": 1.1634, "mean_token_accuracy": 0.7147088274359703, "num_tokens": 889842.0, "step": 1094 }, { "epoch": 0.2902542372881356, "grad_norm": 1.235314130783081, "learning_rate": 9.85500529661017e-06, "loss": 1.5745, "mean_token_accuracy": 0.659779854118824, "num_tokens": 892768.0, "step": 1096 }, { "epoch": 0.2907838983050847, "grad_norm": 1.8171411752700806, "learning_rate": 9.854740466101696e-06, "loss": 1.5027, "mean_token_accuracy": 0.674339234828949, "num_tokens": 894276.0, "step": 1098 }, { "epoch": 0.2913135593220339, "grad_norm": 1.2087987661361694, "learning_rate": 9.854475635593222e-06, "loss": 1.0093, "mean_token_accuracy": 0.7624877840280533, "num_tokens": 895840.0, "step": 1100 }, { "epoch": 0.2918432203389831, "grad_norm": 1.9954752922058105, "learning_rate": 9.854210805084747e-06, "loss": 1.6257, "mean_token_accuracy": 0.6489362567663193, "num_tokens": 897375.0, "step": 1102 }, { "epoch": 0.2923728813559322, "grad_norm": 1.6857082843780518, "learning_rate": 9.853945974576272e-06, "loss": 1.6075, "mean_token_accuracy": 0.6598218828439713, "num_tokens": 898831.0, "step": 1104 }, { "epoch": 0.2929025423728814, "grad_norm": 1.4884827136993408, "learning_rate": 9.853681144067797e-06, "loss": 1.1474, "mean_token_accuracy": 0.7441422790288925, "num_tokens": 900350.0, "step": 1106 }, { "epoch": 0.2934322033898305, "grad_norm": 1.959697961807251, "learning_rate": 9.853416313559324e-06, "loss": 1.2555, "mean_token_accuracy": 0.7143794819712639, "num_tokens": 902019.0, "step": 1108 }, { "epoch": 0.2939618644067797, "grad_norm": 1.6969259977340698, "learning_rate": 9.853151483050848e-06, "loss": 1.7004, "mean_token_accuracy": 0.6368567273020744, "num_tokens": 903479.0, "step": 1110 }, { "epoch": 0.2944915254237288, "grad_norm": 1.7868616580963135, "learning_rate": 9.852886652542373e-06, "loss": 1.5796, "mean_token_accuracy": 0.6622754484415054, "num_tokens": 905000.0, "step": 1112 }, { "epoch": 0.295021186440678, "grad_norm": 1.6237331628799438, "learning_rate": 9.852621822033898e-06, "loss": 1.3848, "mean_token_accuracy": 0.7057696245610714, "num_tokens": 906642.0, "step": 1114 }, { "epoch": 0.2955508474576271, "grad_norm": 1.74051833152771, "learning_rate": 9.852356991525425e-06, "loss": 1.5552, "mean_token_accuracy": 0.659699946641922, "num_tokens": 907968.0, "step": 1116 }, { "epoch": 0.2960805084745763, "grad_norm": 2.099501132965088, "learning_rate": 9.85209216101695e-06, "loss": 1.6924, "mean_token_accuracy": 0.6346505507826805, "num_tokens": 909606.0, "step": 1118 }, { "epoch": 0.2966101694915254, "grad_norm": 1.8409360647201538, "learning_rate": 9.851827330508475e-06, "loss": 1.3874, "mean_token_accuracy": 0.7007378712296486, "num_tokens": 910946.0, "step": 1120 }, { "epoch": 0.2971398305084746, "grad_norm": 1.4940913915634155, "learning_rate": 9.8515625e-06, "loss": 1.0387, "mean_token_accuracy": 0.7574049234390259, "num_tokens": 912530.0, "step": 1122 }, { "epoch": 0.2976694915254237, "grad_norm": 1.8489896059036255, "learning_rate": 9.851297669491526e-06, "loss": 1.5977, "mean_token_accuracy": 0.677609272301197, "num_tokens": 914057.0, "step": 1124 }, { "epoch": 0.2981991525423729, "grad_norm": 1.7445460557937622, "learning_rate": 9.851032838983051e-06, "loss": 1.6016, "mean_token_accuracy": 0.6623365357518196, "num_tokens": 915749.0, "step": 1126 }, { "epoch": 0.298728813559322, "grad_norm": 1.9617334604263306, "learning_rate": 9.850768008474578e-06, "loss": 1.2972, "mean_token_accuracy": 0.6973803713917732, "num_tokens": 917232.0, "step": 1128 }, { "epoch": 0.2992584745762712, "grad_norm": 1.6376395225524902, "learning_rate": 9.850503177966101e-06, "loss": 1.4243, "mean_token_accuracy": 0.6772843822836876, "num_tokens": 918897.0, "step": 1130 }, { "epoch": 0.2997881355932203, "grad_norm": 1.8939847946166992, "learning_rate": 9.850238347457628e-06, "loss": 1.2877, "mean_token_accuracy": 0.7321081608533859, "num_tokens": 920412.0, "step": 1132 }, { "epoch": 0.3003177966101695, "grad_norm": 2.0617454051971436, "learning_rate": 9.849973516949153e-06, "loss": 1.6823, "mean_token_accuracy": 0.6516198515892029, "num_tokens": 921853.0, "step": 1134 }, { "epoch": 0.3008474576271186, "grad_norm": 1.9127477407455444, "learning_rate": 9.84970868644068e-06, "loss": 1.5196, "mean_token_accuracy": 0.7124830707907677, "num_tokens": 923276.0, "step": 1136 }, { "epoch": 0.3013771186440678, "grad_norm": 1.593559741973877, "learning_rate": 9.849443855932204e-06, "loss": 1.2297, "mean_token_accuracy": 0.7292300760746002, "num_tokens": 924918.0, "step": 1138 }, { "epoch": 0.3019067796610169, "grad_norm": 1.8690749406814575, "learning_rate": 9.84917902542373e-06, "loss": 1.6993, "mean_token_accuracy": 0.6305676996707916, "num_tokens": 926614.0, "step": 1140 }, { "epoch": 0.3024364406779661, "grad_norm": 1.6167528629302979, "learning_rate": 9.848914194915256e-06, "loss": 1.0545, "mean_token_accuracy": 0.738220825791359, "num_tokens": 928077.0, "step": 1142 }, { "epoch": 0.3029661016949153, "grad_norm": 1.8012675046920776, "learning_rate": 9.84864936440678e-06, "loss": 1.146, "mean_token_accuracy": 0.7238287553191185, "num_tokens": 929725.0, "step": 1144 }, { "epoch": 0.3034957627118644, "grad_norm": 1.8527559041976929, "learning_rate": 9.848384533898306e-06, "loss": 1.2967, "mean_token_accuracy": 0.7122826986014843, "num_tokens": 931394.0, "step": 1146 }, { "epoch": 0.3040254237288136, "grad_norm": 1.5782470703125, "learning_rate": 9.84811970338983e-06, "loss": 1.3699, "mean_token_accuracy": 0.695412315428257, "num_tokens": 932957.0, "step": 1148 }, { "epoch": 0.3045550847457627, "grad_norm": 1.6463804244995117, "learning_rate": 9.847854872881357e-06, "loss": 1.3264, "mean_token_accuracy": 0.7065656036138535, "num_tokens": 934445.0, "step": 1150 }, { "epoch": 0.3050847457627119, "grad_norm": 1.5157073736190796, "learning_rate": 9.847590042372882e-06, "loss": 0.8107, "mean_token_accuracy": 0.7996777668595314, "num_tokens": 935868.0, "step": 1152 }, { "epoch": 0.305614406779661, "grad_norm": 1.5399096012115479, "learning_rate": 9.847325211864409e-06, "loss": 1.4711, "mean_token_accuracy": 0.7007943354547024, "num_tokens": 937561.0, "step": 1154 }, { "epoch": 0.3061440677966102, "grad_norm": 1.4424047470092773, "learning_rate": 9.847060381355934e-06, "loss": 1.1738, "mean_token_accuracy": 0.7461091205477715, "num_tokens": 939302.0, "step": 1156 }, { "epoch": 0.3066737288135593, "grad_norm": 2.079011917114258, "learning_rate": 9.846795550847459e-06, "loss": 1.6744, "mean_token_accuracy": 0.6401412189006805, "num_tokens": 940740.0, "step": 1158 }, { "epoch": 0.3072033898305085, "grad_norm": 1.7716330289840698, "learning_rate": 9.846530720338984e-06, "loss": 1.2721, "mean_token_accuracy": 0.7152782678604126, "num_tokens": 942299.0, "step": 1160 }, { "epoch": 0.3077330508474576, "grad_norm": 1.5095833539962769, "learning_rate": 9.84626588983051e-06, "loss": 1.1895, "mean_token_accuracy": 0.6979699656367302, "num_tokens": 943972.0, "step": 1162 }, { "epoch": 0.3082627118644068, "grad_norm": 1.5136301517486572, "learning_rate": 9.846001059322035e-06, "loss": 1.4394, "mean_token_accuracy": 0.6718713119626045, "num_tokens": 945993.0, "step": 1164 }, { "epoch": 0.3087923728813559, "grad_norm": 1.678294062614441, "learning_rate": 9.84573622881356e-06, "loss": 1.838, "mean_token_accuracy": 0.6062478795647621, "num_tokens": 947773.0, "step": 1166 }, { "epoch": 0.3093220338983051, "grad_norm": 1.4388192892074585, "learning_rate": 9.845471398305085e-06, "loss": 1.4053, "mean_token_accuracy": 0.6915944404900074, "num_tokens": 949336.0, "step": 1168 }, { "epoch": 0.3098516949152542, "grad_norm": 1.7442442178726196, "learning_rate": 9.845206567796612e-06, "loss": 1.5781, "mean_token_accuracy": 0.6489049941301346, "num_tokens": 950994.0, "step": 1170 }, { "epoch": 0.3103813559322034, "grad_norm": 1.8274072408676147, "learning_rate": 9.844941737288137e-06, "loss": 1.6196, "mean_token_accuracy": 0.6328502558171749, "num_tokens": 952740.0, "step": 1172 }, { "epoch": 0.3109110169491525, "grad_norm": 1.7543781995773315, "learning_rate": 9.844676906779661e-06, "loss": 1.5018, "mean_token_accuracy": 0.6851766780018806, "num_tokens": 954544.0, "step": 1174 }, { "epoch": 0.3114406779661017, "grad_norm": 1.8017784357070923, "learning_rate": 9.844412076271186e-06, "loss": 1.3494, "mean_token_accuracy": 0.696676567196846, "num_tokens": 956221.0, "step": 1176 }, { "epoch": 0.3119703389830508, "grad_norm": 1.5847026109695435, "learning_rate": 9.844147245762713e-06, "loss": 1.4499, "mean_token_accuracy": 0.6702465862035751, "num_tokens": 957856.0, "step": 1178 }, { "epoch": 0.3125, "grad_norm": 1.6465368270874023, "learning_rate": 9.843882415254238e-06, "loss": 1.2066, "mean_token_accuracy": 0.7089572846889496, "num_tokens": 959517.0, "step": 1180 }, { "epoch": 0.3130296610169492, "grad_norm": 1.8675180673599243, "learning_rate": 9.843617584745765e-06, "loss": 1.1872, "mean_token_accuracy": 0.7351469844579697, "num_tokens": 961077.0, "step": 1182 }, { "epoch": 0.3135593220338983, "grad_norm": 1.499443769454956, "learning_rate": 9.843352754237288e-06, "loss": 1.2299, "mean_token_accuracy": 0.7249677181243896, "num_tokens": 962578.0, "step": 1184 }, { "epoch": 0.3140889830508475, "grad_norm": 1.9284266233444214, "learning_rate": 9.843087923728814e-06, "loss": 1.4248, "mean_token_accuracy": 0.689951166510582, "num_tokens": 963934.0, "step": 1186 }, { "epoch": 0.3146186440677966, "grad_norm": 2.0840179920196533, "learning_rate": 9.84282309322034e-06, "loss": 1.2256, "mean_token_accuracy": 0.7177025973796844, "num_tokens": 965338.0, "step": 1188 }, { "epoch": 0.3151483050847458, "grad_norm": 1.53691828250885, "learning_rate": 9.842558262711866e-06, "loss": 1.2547, "mean_token_accuracy": 0.6826934069395065, "num_tokens": 966791.0, "step": 1190 }, { "epoch": 0.3156779661016949, "grad_norm": 1.5026233196258545, "learning_rate": 9.842293432203391e-06, "loss": 1.1826, "mean_token_accuracy": 0.7514768913388252, "num_tokens": 968392.0, "step": 1192 }, { "epoch": 0.3162076271186441, "grad_norm": 1.9934942722320557, "learning_rate": 9.842028601694916e-06, "loss": 1.6898, "mean_token_accuracy": 0.6300688609480858, "num_tokens": 969767.0, "step": 1194 }, { "epoch": 0.3167372881355932, "grad_norm": 1.7152061462402344, "learning_rate": 9.84176377118644e-06, "loss": 1.2877, "mean_token_accuracy": 0.7086796388030052, "num_tokens": 971342.0, "step": 1196 }, { "epoch": 0.3172669491525424, "grad_norm": 1.7544559240341187, "learning_rate": 9.841498940677967e-06, "loss": 1.8537, "mean_token_accuracy": 0.6230918914079666, "num_tokens": 973119.0, "step": 1198 }, { "epoch": 0.3177966101694915, "grad_norm": 1.335146427154541, "learning_rate": 9.841234110169492e-06, "loss": 1.4055, "mean_token_accuracy": 0.6814668960869312, "num_tokens": 974932.0, "step": 1200 }, { "epoch": 0.3183262711864407, "grad_norm": 1.8149960041046143, "learning_rate": 9.840969279661017e-06, "loss": 1.2923, "mean_token_accuracy": 0.6976901218295097, "num_tokens": 976500.0, "step": 1202 }, { "epoch": 0.3188559322033898, "grad_norm": 2.104412794113159, "learning_rate": 9.840704449152542e-06, "loss": 1.9378, "mean_token_accuracy": 0.6095275059342384, "num_tokens": 978002.0, "step": 1204 }, { "epoch": 0.319385593220339, "grad_norm": 1.839156150817871, "learning_rate": 9.840439618644069e-06, "loss": 1.5563, "mean_token_accuracy": 0.6979890689253807, "num_tokens": 979589.0, "step": 1206 }, { "epoch": 0.3199152542372881, "grad_norm": 1.912338376045227, "learning_rate": 9.840174788135594e-06, "loss": 1.1418, "mean_token_accuracy": 0.7297976166009903, "num_tokens": 981135.0, "step": 1208 }, { "epoch": 0.3204449152542373, "grad_norm": 1.7511762380599976, "learning_rate": 9.83990995762712e-06, "loss": 1.7937, "mean_token_accuracy": 0.6152028441429138, "num_tokens": 982877.0, "step": 1210 }, { "epoch": 0.3209745762711864, "grad_norm": 1.3896794319152832, "learning_rate": 9.839645127118644e-06, "loss": 1.0501, "mean_token_accuracy": 0.7536274641752243, "num_tokens": 984464.0, "step": 1212 }, { "epoch": 0.3215042372881356, "grad_norm": 1.7348734140396118, "learning_rate": 9.83938029661017e-06, "loss": 1.6578, "mean_token_accuracy": 0.6305435672402382, "num_tokens": 986145.0, "step": 1214 }, { "epoch": 0.3220338983050847, "grad_norm": 1.5393718481063843, "learning_rate": 9.839115466101695e-06, "loss": 1.0957, "mean_token_accuracy": 0.7778049632906914, "num_tokens": 987653.0, "step": 1216 }, { "epoch": 0.3225635593220339, "grad_norm": 1.9854894876480103, "learning_rate": 9.838850635593222e-06, "loss": 1.5106, "mean_token_accuracy": 0.6708530634641647, "num_tokens": 989130.0, "step": 1218 }, { "epoch": 0.3230932203389831, "grad_norm": 1.6296554803848267, "learning_rate": 9.838585805084747e-06, "loss": 1.331, "mean_token_accuracy": 0.6747524589300156, "num_tokens": 991462.0, "step": 1220 }, { "epoch": 0.3236228813559322, "grad_norm": 1.7956199645996094, "learning_rate": 9.838320974576272e-06, "loss": 1.613, "mean_token_accuracy": 0.6604634150862694, "num_tokens": 992956.0, "step": 1222 }, { "epoch": 0.3241525423728814, "grad_norm": 1.5881457328796387, "learning_rate": 9.838056144067798e-06, "loss": 1.5176, "mean_token_accuracy": 0.6561891958117485, "num_tokens": 994855.0, "step": 1224 }, { "epoch": 0.3246822033898305, "grad_norm": 1.9409314393997192, "learning_rate": 9.837791313559323e-06, "loss": 1.4397, "mean_token_accuracy": 0.6634356752038002, "num_tokens": 996290.0, "step": 1226 }, { "epoch": 0.3252118644067797, "grad_norm": 1.4217227697372437, "learning_rate": 9.837526483050848e-06, "loss": 1.1873, "mean_token_accuracy": 0.7231696397066116, "num_tokens": 997727.0, "step": 1228 }, { "epoch": 0.3257415254237288, "grad_norm": 1.440622091293335, "learning_rate": 9.837261652542373e-06, "loss": 1.3898, "mean_token_accuracy": 0.714287880808115, "num_tokens": 999380.0, "step": 1230 }, { "epoch": 0.326271186440678, "grad_norm": 2.310304641723633, "learning_rate": 9.8369968220339e-06, "loss": 1.8626, "mean_token_accuracy": 0.6133618727326393, "num_tokens": 1000860.0, "step": 1232 }, { "epoch": 0.3268008474576271, "grad_norm": 1.6475896835327148, "learning_rate": 9.836731991525425e-06, "loss": 1.2706, "mean_token_accuracy": 0.6975463628768921, "num_tokens": 1002834.0, "step": 1234 }, { "epoch": 0.3273305084745763, "grad_norm": 1.5437289476394653, "learning_rate": 9.836467161016951e-06, "loss": 1.07, "mean_token_accuracy": 0.7239202186465263, "num_tokens": 1004521.0, "step": 1236 }, { "epoch": 0.3278601694915254, "grad_norm": 1.7069344520568848, "learning_rate": 9.836202330508474e-06, "loss": 1.1277, "mean_token_accuracy": 0.7298260405659676, "num_tokens": 1005756.0, "step": 1238 }, { "epoch": 0.3283898305084746, "grad_norm": 1.5779974460601807, "learning_rate": 9.835937500000001e-06, "loss": 0.9196, "mean_token_accuracy": 0.7654735743999481, "num_tokens": 1007172.0, "step": 1240 }, { "epoch": 0.3289194915254237, "grad_norm": 1.6774365901947021, "learning_rate": 9.835672669491526e-06, "loss": 1.2483, "mean_token_accuracy": 0.7025940045714378, "num_tokens": 1009204.0, "step": 1242 }, { "epoch": 0.3294491525423729, "grad_norm": 1.2723779678344727, "learning_rate": 9.835407838983053e-06, "loss": 1.4057, "mean_token_accuracy": 0.694402351975441, "num_tokens": 1011020.0, "step": 1244 }, { "epoch": 0.329978813559322, "grad_norm": 1.8068181276321411, "learning_rate": 9.835143008474578e-06, "loss": 1.1901, "mean_token_accuracy": 0.7124124988913536, "num_tokens": 1012384.0, "step": 1246 }, { "epoch": 0.3305084745762712, "grad_norm": 2.412879228591919, "learning_rate": 9.834878177966102e-06, "loss": 1.5006, "mean_token_accuracy": 0.6756228804588318, "num_tokens": 1013986.0, "step": 1248 }, { "epoch": 0.3310381355932203, "grad_norm": 1.3485826253890991, "learning_rate": 9.834613347457627e-06, "loss": 0.8722, "step": 1250 }, { "epoch": 0.3310381355932203, "eval_loss": 1.3493969440460205, "eval_mean_token_accuracy": 0.6941682522173052, "eval_num_tokens": 1015476.0, "eval_runtime": 48.3332, "eval_samples_per_second": 6.372, "eval_steps_per_second": 6.372, "step": 1250 }, { "epoch": 0.3315677966101695, "grad_norm": 1.5027204751968384, "learning_rate": 9.834348516949154e-06, "loss": 1.6668, "mean_token_accuracy": 0.7170548364520073, "num_tokens": 1017095.0, "step": 1252 }, { "epoch": 0.3320974576271186, "grad_norm": 2.1687419414520264, "learning_rate": 9.834083686440679e-06, "loss": 1.9821, "mean_token_accuracy": 0.5863998606801033, "num_tokens": 1018426.0, "step": 1254 }, { "epoch": 0.3326271186440678, "grad_norm": 1.8429185152053833, "learning_rate": 9.833818855932204e-06, "loss": 1.0906, "mean_token_accuracy": 0.7330270856618881, "num_tokens": 1019945.0, "step": 1256 }, { "epoch": 0.3331567796610169, "grad_norm": 2.105743169784546, "learning_rate": 9.833554025423729e-06, "loss": 1.3927, "mean_token_accuracy": 0.6879642456769943, "num_tokens": 1021707.0, "step": 1258 }, { "epoch": 0.3336864406779661, "grad_norm": 1.7824991941452026, "learning_rate": 9.833289194915255e-06, "loss": 1.4478, "mean_token_accuracy": 0.6863528564572334, "num_tokens": 1023094.0, "step": 1260 }, { "epoch": 0.3342161016949153, "grad_norm": 1.775935173034668, "learning_rate": 9.83302436440678e-06, "loss": 1.4045, "mean_token_accuracy": 0.7021141275763512, "num_tokens": 1024541.0, "step": 1262 }, { "epoch": 0.3347457627118644, "grad_norm": 1.979688048362732, "learning_rate": 9.832759533898307e-06, "loss": 1.8469, "mean_token_accuracy": 0.6374040693044662, "num_tokens": 1026223.0, "step": 1264 }, { "epoch": 0.3352754237288136, "grad_norm": 1.7393985986709595, "learning_rate": 9.83249470338983e-06, "loss": 1.3497, "mean_token_accuracy": 0.7019542083144188, "num_tokens": 1027554.0, "step": 1266 }, { "epoch": 0.3358050847457627, "grad_norm": 1.870712399482727, "learning_rate": 9.832229872881357e-06, "loss": 1.5156, "mean_token_accuracy": 0.6606012061238289, "num_tokens": 1029500.0, "step": 1268 }, { "epoch": 0.3363347457627119, "grad_norm": 2.0372982025146484, "learning_rate": 9.831965042372882e-06, "loss": 1.7533, "mean_token_accuracy": 0.635370846837759, "num_tokens": 1031286.0, "step": 1270 }, { "epoch": 0.336864406779661, "grad_norm": 2.0919110774993896, "learning_rate": 9.831700211864408e-06, "loss": 1.2757, "mean_token_accuracy": 0.7187880873680115, "num_tokens": 1032761.0, "step": 1272 }, { "epoch": 0.3373940677966102, "grad_norm": 1.580164909362793, "learning_rate": 9.831435381355933e-06, "loss": 1.2109, "mean_token_accuracy": 0.7121895775198936, "num_tokens": 1035360.0, "step": 1274 }, { "epoch": 0.3379237288135593, "grad_norm": 1.4186867475509644, "learning_rate": 9.831170550847458e-06, "loss": 1.2528, "mean_token_accuracy": 0.7272329330444336, "num_tokens": 1036901.0, "step": 1276 }, { "epoch": 0.3384533898305085, "grad_norm": 1.6295344829559326, "learning_rate": 9.830905720338983e-06, "loss": 1.1808, "mean_token_accuracy": 0.7068195641040802, "num_tokens": 1038325.0, "step": 1278 }, { "epoch": 0.3389830508474576, "grad_norm": 2.0853092670440674, "learning_rate": 9.83064088983051e-06, "loss": 1.8123, "mean_token_accuracy": 0.6213087439537048, "num_tokens": 1039826.0, "step": 1280 }, { "epoch": 0.3395127118644068, "grad_norm": 1.8167997598648071, "learning_rate": 9.830376059322035e-06, "loss": 1.5003, "mean_token_accuracy": 0.6684191673994064, "num_tokens": 1041085.0, "step": 1282 }, { "epoch": 0.3400423728813559, "grad_norm": 1.9275468587875366, "learning_rate": 9.83011122881356e-06, "loss": 1.6496, "mean_token_accuracy": 0.6765923351049423, "num_tokens": 1042593.0, "step": 1284 }, { "epoch": 0.3405720338983051, "grad_norm": 1.9118810892105103, "learning_rate": 9.829846398305085e-06, "loss": 1.6758, "mean_token_accuracy": 0.6289241313934326, "num_tokens": 1044171.0, "step": 1286 }, { "epoch": 0.3411016949152542, "grad_norm": 1.5817738771438599, "learning_rate": 9.829581567796611e-06, "loss": 1.5159, "mean_token_accuracy": 0.6549912542104721, "num_tokens": 1045756.0, "step": 1288 }, { "epoch": 0.3416313559322034, "grad_norm": 1.8156007528305054, "learning_rate": 9.829316737288136e-06, "loss": 1.4458, "mean_token_accuracy": 0.6865904480218887, "num_tokens": 1047224.0, "step": 1290 }, { "epoch": 0.3421610169491525, "grad_norm": 1.7833994626998901, "learning_rate": 9.829051906779661e-06, "loss": 1.5898, "mean_token_accuracy": 0.6517780423164368, "num_tokens": 1048728.0, "step": 1292 }, { "epoch": 0.3426906779661017, "grad_norm": 1.9012064933776855, "learning_rate": 9.828787076271186e-06, "loss": 1.3805, "mean_token_accuracy": 0.6959599480032921, "num_tokens": 1050142.0, "step": 1294 }, { "epoch": 0.3432203389830508, "grad_norm": 1.6693214178085327, "learning_rate": 9.828522245762713e-06, "loss": 1.4319, "mean_token_accuracy": 0.703242838382721, "num_tokens": 1051765.0, "step": 1296 }, { "epoch": 0.34375, "grad_norm": 1.8357373476028442, "learning_rate": 9.828257415254238e-06, "loss": 1.4049, "mean_token_accuracy": 0.7030894532799721, "num_tokens": 1053157.0, "step": 1298 }, { "epoch": 0.3442796610169492, "grad_norm": 1.6614954471588135, "learning_rate": 9.827992584745764e-06, "loss": 1.1435, "mean_token_accuracy": 0.7373182848095894, "num_tokens": 1054824.0, "step": 1300 }, { "epoch": 0.3448093220338983, "grad_norm": 1.530680537223816, "learning_rate": 9.827727754237289e-06, "loss": 1.4071, "mean_token_accuracy": 0.6797198504209518, "num_tokens": 1056515.0, "step": 1302 }, { "epoch": 0.3453389830508475, "grad_norm": 1.6749030351638794, "learning_rate": 9.827462923728814e-06, "loss": 1.6956, "mean_token_accuracy": 0.6489793062210083, "num_tokens": 1058179.0, "step": 1304 }, { "epoch": 0.3458686440677966, "grad_norm": 1.8178738355636597, "learning_rate": 9.827198093220339e-06, "loss": 1.427, "mean_token_accuracy": 0.6895484775304794, "num_tokens": 1059728.0, "step": 1306 }, { "epoch": 0.3463983050847458, "grad_norm": 1.726342797279358, "learning_rate": 9.826933262711866e-06, "loss": 1.2739, "mean_token_accuracy": 0.7158145681023598, "num_tokens": 1061438.0, "step": 1308 }, { "epoch": 0.3469279661016949, "grad_norm": 1.717726707458496, "learning_rate": 9.82666843220339e-06, "loss": 1.3158, "mean_token_accuracy": 0.7018631398677826, "num_tokens": 1062837.0, "step": 1310 }, { "epoch": 0.3474576271186441, "grad_norm": 1.8794881105422974, "learning_rate": 9.826403601694915e-06, "loss": 1.4238, "mean_token_accuracy": 0.6863047704100609, "num_tokens": 1064560.0, "step": 1312 }, { "epoch": 0.3479872881355932, "grad_norm": 1.3744890689849854, "learning_rate": 9.826138771186442e-06, "loss": 1.1098, "mean_token_accuracy": 0.7373677864670753, "num_tokens": 1066114.0, "step": 1314 }, { "epoch": 0.3485169491525424, "grad_norm": 2.0578293800354004, "learning_rate": 9.825873940677967e-06, "loss": 1.1908, "mean_token_accuracy": 0.7216816022992134, "num_tokens": 1067585.0, "step": 1316 }, { "epoch": 0.3490466101694915, "grad_norm": 1.6551405191421509, "learning_rate": 9.825609110169494e-06, "loss": 0.9468, "mean_token_accuracy": 0.7818041741847992, "num_tokens": 1069034.0, "step": 1318 }, { "epoch": 0.3495762711864407, "grad_norm": 1.650794506072998, "learning_rate": 9.825344279661017e-06, "loss": 1.5909, "mean_token_accuracy": 0.6676042452454567, "num_tokens": 1070802.0, "step": 1320 }, { "epoch": 0.3501059322033898, "grad_norm": 1.6060055494308472, "learning_rate": 9.825079449152543e-06, "loss": 1.5133, "mean_token_accuracy": 0.6439795941114426, "num_tokens": 1072474.0, "step": 1322 }, { "epoch": 0.350635593220339, "grad_norm": 1.260301113128662, "learning_rate": 9.824814618644068e-06, "loss": 1.2519, "mean_token_accuracy": 0.7232806384563446, "num_tokens": 1074080.0, "step": 1324 }, { "epoch": 0.3511652542372881, "grad_norm": 1.5792280435562134, "learning_rate": 9.824549788135595e-06, "loss": 1.3399, "mean_token_accuracy": 0.7090883329510689, "num_tokens": 1075703.0, "step": 1326 }, { "epoch": 0.3516949152542373, "grad_norm": 2.022716999053955, "learning_rate": 9.82428495762712e-06, "loss": 1.0539, "mean_token_accuracy": 0.7478301748633385, "num_tokens": 1077076.0, "step": 1328 }, { "epoch": 0.3522245762711864, "grad_norm": 1.67029869556427, "learning_rate": 9.824020127118645e-06, "loss": 1.4874, "mean_token_accuracy": 0.6731366589665413, "num_tokens": 1078676.0, "step": 1330 }, { "epoch": 0.3527542372881356, "grad_norm": 1.5002703666687012, "learning_rate": 9.82375529661017e-06, "loss": 1.4975, "mean_token_accuracy": 0.6637377478182316, "num_tokens": 1080308.0, "step": 1332 }, { "epoch": 0.3532838983050847, "grad_norm": 1.296995759010315, "learning_rate": 9.823490466101696e-06, "loss": 1.3933, "mean_token_accuracy": 0.700856477022171, "num_tokens": 1081942.0, "step": 1334 }, { "epoch": 0.3538135593220339, "grad_norm": 1.8836252689361572, "learning_rate": 9.823225635593221e-06, "loss": 1.5411, "mean_token_accuracy": 0.6571707986295223, "num_tokens": 1083559.0, "step": 1336 }, { "epoch": 0.3543432203389831, "grad_norm": 2.013718605041504, "learning_rate": 9.822960805084746e-06, "loss": 1.655, "mean_token_accuracy": 0.6409268975257874, "num_tokens": 1084970.0, "step": 1338 }, { "epoch": 0.3548728813559322, "grad_norm": 1.539192795753479, "learning_rate": 9.822695974576271e-06, "loss": 1.3677, "mean_token_accuracy": 0.6880576834082603, "num_tokens": 1086374.0, "step": 1340 }, { "epoch": 0.3554025423728814, "grad_norm": 1.434101939201355, "learning_rate": 9.822431144067798e-06, "loss": 1.3137, "mean_token_accuracy": 0.7379110530018806, "num_tokens": 1087782.0, "step": 1342 }, { "epoch": 0.3559322033898305, "grad_norm": 1.592491865158081, "learning_rate": 9.822166313559323e-06, "loss": 1.2864, "mean_token_accuracy": 0.7089529037475586, "num_tokens": 1089381.0, "step": 1344 }, { "epoch": 0.3564618644067797, "grad_norm": 1.6607918739318848, "learning_rate": 9.821901483050848e-06, "loss": 0.97, "mean_token_accuracy": 0.7580206841230392, "num_tokens": 1090691.0, "step": 1346 }, { "epoch": 0.3569915254237288, "grad_norm": 1.6522917747497559, "learning_rate": 9.821636652542373e-06, "loss": 1.5447, "mean_token_accuracy": 0.6661188751459122, "num_tokens": 1092282.0, "step": 1348 }, { "epoch": 0.357521186440678, "grad_norm": 1.923994779586792, "learning_rate": 9.8213718220339e-06, "loss": 1.3626, "mean_token_accuracy": 0.7332223691046238, "num_tokens": 1093830.0, "step": 1350 }, { "epoch": 0.3580508474576271, "grad_norm": 1.507966160774231, "learning_rate": 9.821106991525424e-06, "loss": 1.4642, "mean_token_accuracy": 0.6839348897337914, "num_tokens": 1095794.0, "step": 1352 }, { "epoch": 0.3585805084745763, "grad_norm": 1.8993146419525146, "learning_rate": 9.82084216101695e-06, "loss": 1.2266, "mean_token_accuracy": 0.7428520172834396, "num_tokens": 1097354.0, "step": 1354 }, { "epoch": 0.3591101694915254, "grad_norm": 1.7444735765457153, "learning_rate": 9.820577330508476e-06, "loss": 1.43, "mean_token_accuracy": 0.6711094751954079, "num_tokens": 1099011.0, "step": 1356 }, { "epoch": 0.3596398305084746, "grad_norm": 1.703978180885315, "learning_rate": 9.8203125e-06, "loss": 1.7032, "mean_token_accuracy": 0.635410737246275, "num_tokens": 1100674.0, "step": 1358 }, { "epoch": 0.3601694915254237, "grad_norm": 1.442445993423462, "learning_rate": 9.820047669491526e-06, "loss": 1.1984, "mean_token_accuracy": 0.7305633276700974, "num_tokens": 1102508.0, "step": 1360 }, { "epoch": 0.3606991525423729, "grad_norm": 1.598874807357788, "learning_rate": 9.819782838983052e-06, "loss": 1.2875, "mean_token_accuracy": 0.671860683709383, "num_tokens": 1104233.0, "step": 1362 }, { "epoch": 0.361228813559322, "grad_norm": 1.9381864070892334, "learning_rate": 9.819518008474577e-06, "loss": 1.6134, "mean_token_accuracy": 0.6518260389566422, "num_tokens": 1105697.0, "step": 1364 }, { "epoch": 0.3617584745762712, "grad_norm": 1.624312162399292, "learning_rate": 9.819253177966102e-06, "loss": 1.7433, "mean_token_accuracy": 0.6194727048277855, "num_tokens": 1107236.0, "step": 1366 }, { "epoch": 0.3622881355932203, "grad_norm": 1.8892614841461182, "learning_rate": 9.818988347457627e-06, "loss": 1.084, "mean_token_accuracy": 0.7386355027556419, "num_tokens": 1108497.0, "step": 1368 }, { "epoch": 0.3628177966101695, "grad_norm": 1.8476266860961914, "learning_rate": 9.818723516949154e-06, "loss": 0.9219, "mean_token_accuracy": 0.7562163695693016, "num_tokens": 1110500.0, "step": 1370 }, { "epoch": 0.3633474576271186, "grad_norm": 1.4750697612762451, "learning_rate": 9.818458686440679e-06, "loss": 1.2905, "mean_token_accuracy": 0.6843155398964882, "num_tokens": 1112231.0, "step": 1372 }, { "epoch": 0.3638771186440678, "grad_norm": 1.582957148551941, "learning_rate": 9.818193855932204e-06, "loss": 1.1592, "mean_token_accuracy": 0.7339979112148285, "num_tokens": 1113618.0, "step": 1374 }, { "epoch": 0.3644067796610169, "grad_norm": 1.308117389678955, "learning_rate": 9.817929025423728e-06, "loss": 0.9233, "mean_token_accuracy": 0.7671405300498009, "num_tokens": 1115301.0, "step": 1376 }, { "epoch": 0.3649364406779661, "grad_norm": 1.618547797203064, "learning_rate": 9.817664194915255e-06, "loss": 1.1395, "mean_token_accuracy": 0.7533803209662437, "num_tokens": 1116764.0, "step": 1378 }, { "epoch": 0.3654661016949153, "grad_norm": 0.987407922744751, "learning_rate": 9.81739936440678e-06, "loss": 1.0018, "mean_token_accuracy": 0.7190834656357765, "num_tokens": 1118778.0, "step": 1380 }, { "epoch": 0.3659957627118644, "grad_norm": 1.8736051321029663, "learning_rate": 9.817134533898307e-06, "loss": 1.468, "mean_token_accuracy": 0.6704383157193661, "num_tokens": 1120635.0, "step": 1382 }, { "epoch": 0.3665254237288136, "grad_norm": 1.3499995470046997, "learning_rate": 9.816869703389832e-06, "loss": 1.2001, "mean_token_accuracy": 0.7170996814966202, "num_tokens": 1122109.0, "step": 1384 }, { "epoch": 0.3670550847457627, "grad_norm": 1.4757649898529053, "learning_rate": 9.816604872881356e-06, "loss": 1.3542, "mean_token_accuracy": 0.685465969145298, "num_tokens": 1123855.0, "step": 1386 }, { "epoch": 0.3675847457627119, "grad_norm": 1.3246570825576782, "learning_rate": 9.816340042372881e-06, "loss": 0.8035, "mean_token_accuracy": 0.7883419618010521, "num_tokens": 1125126.0, "step": 1388 }, { "epoch": 0.368114406779661, "grad_norm": 1.7484229803085327, "learning_rate": 9.816075211864408e-06, "loss": 1.5304, "mean_token_accuracy": 0.6612164750695229, "num_tokens": 1126496.0, "step": 1390 }, { "epoch": 0.3686440677966102, "grad_norm": 1.6453028917312622, "learning_rate": 9.815810381355933e-06, "loss": 0.9475, "mean_token_accuracy": 0.77399031072855, "num_tokens": 1128116.0, "step": 1392 }, { "epoch": 0.3691737288135593, "grad_norm": 1.7378848791122437, "learning_rate": 9.815545550847458e-06, "loss": 0.8751, "mean_token_accuracy": 0.7826608568429947, "num_tokens": 1129800.0, "step": 1394 }, { "epoch": 0.3697033898305085, "grad_norm": 1.8532065153121948, "learning_rate": 9.815280720338985e-06, "loss": 1.1688, "mean_token_accuracy": 0.7154687196016312, "num_tokens": 1131265.0, "step": 1396 }, { "epoch": 0.3702330508474576, "grad_norm": 1.7123346328735352, "learning_rate": 9.81501588983051e-06, "loss": 1.5678, "mean_token_accuracy": 0.6813063621520996, "num_tokens": 1132791.0, "step": 1398 }, { "epoch": 0.3707627118644068, "grad_norm": 1.9690678119659424, "learning_rate": 9.814751059322034e-06, "loss": 1.251, "mean_token_accuracy": 0.7332608103752136, "num_tokens": 1134211.0, "step": 1400 }, { "epoch": 0.3712923728813559, "grad_norm": 2.0257019996643066, "learning_rate": 9.81448622881356e-06, "loss": 1.5224, "mean_token_accuracy": 0.6583033800125122, "num_tokens": 1135943.0, "step": 1402 }, { "epoch": 0.3718220338983051, "grad_norm": 2.462228536605835, "learning_rate": 9.814221398305086e-06, "loss": 1.7989, "mean_token_accuracy": 0.6125912964344025, "num_tokens": 1137452.0, "step": 1404 }, { "epoch": 0.3723516949152542, "grad_norm": 1.8961074352264404, "learning_rate": 9.81395656779661e-06, "loss": 1.5779, "mean_token_accuracy": 0.6513666212558746, "num_tokens": 1138860.0, "step": 1406 }, { "epoch": 0.3728813559322034, "grad_norm": 2.3757221698760986, "learning_rate": 9.813691737288137e-06, "loss": 1.3879, "mean_token_accuracy": 0.6808147504925728, "num_tokens": 1140281.0, "step": 1408 }, { "epoch": 0.3734110169491525, "grad_norm": 1.4853894710540771, "learning_rate": 9.813426906779662e-06, "loss": 1.543, "mean_token_accuracy": 0.6794335842132568, "num_tokens": 1142036.0, "step": 1410 }, { "epoch": 0.3739406779661017, "grad_norm": 1.5034842491149902, "learning_rate": 9.813162076271187e-06, "loss": 1.0096, "mean_token_accuracy": 0.7605720013380051, "num_tokens": 1143430.0, "step": 1412 }, { "epoch": 0.3744703389830508, "grad_norm": 1.8408870697021484, "learning_rate": 9.812897245762712e-06, "loss": 1.4991, "mean_token_accuracy": 0.6645800471305847, "num_tokens": 1144866.0, "step": 1414 }, { "epoch": 0.375, "grad_norm": 1.8396223783493042, "learning_rate": 9.812632415254239e-06, "loss": 1.5654, "mean_token_accuracy": 0.6280973702669144, "num_tokens": 1146723.0, "step": 1416 }, { "epoch": 0.3755296610169492, "grad_norm": 1.379517912864685, "learning_rate": 9.812367584745764e-06, "loss": 0.9468, "mean_token_accuracy": 0.772427998483181, "num_tokens": 1148021.0, "step": 1418 }, { "epoch": 0.3760593220338983, "grad_norm": 1.6864473819732666, "learning_rate": 9.812102754237289e-06, "loss": 1.0732, "mean_token_accuracy": 0.7366140261292458, "num_tokens": 1149419.0, "step": 1420 }, { "epoch": 0.3765889830508475, "grad_norm": 1.7352112531661987, "learning_rate": 9.811837923728814e-06, "loss": 1.8677, "mean_token_accuracy": 0.6091560572385788, "num_tokens": 1151040.0, "step": 1422 }, { "epoch": 0.3771186440677966, "grad_norm": 2.201453685760498, "learning_rate": 9.81157309322034e-06, "loss": 1.4183, "mean_token_accuracy": 0.6683969646692276, "num_tokens": 1152627.0, "step": 1424 }, { "epoch": 0.3776483050847458, "grad_norm": 1.7740710973739624, "learning_rate": 9.811308262711865e-06, "loss": 1.4568, "mean_token_accuracy": 0.701896995306015, "num_tokens": 1154107.0, "step": 1426 }, { "epoch": 0.3781779661016949, "grad_norm": 1.6332138776779175, "learning_rate": 9.81104343220339e-06, "loss": 1.4032, "mean_token_accuracy": 0.6910844631493092, "num_tokens": 1155748.0, "step": 1428 }, { "epoch": 0.3787076271186441, "grad_norm": 1.8542091846466064, "learning_rate": 9.810778601694915e-06, "loss": 1.4314, "mean_token_accuracy": 0.6639650017023087, "num_tokens": 1157331.0, "step": 1430 }, { "epoch": 0.3792372881355932, "grad_norm": 2.0234577655792236, "learning_rate": 9.810513771186442e-06, "loss": 1.295, "mean_token_accuracy": 0.7073369920253754, "num_tokens": 1158910.0, "step": 1432 }, { "epoch": 0.3797669491525424, "grad_norm": 1.791063666343689, "learning_rate": 9.810248940677967e-06, "loss": 1.4829, "mean_token_accuracy": 0.6849559769034386, "num_tokens": 1160423.0, "step": 1434 }, { "epoch": 0.3802966101694915, "grad_norm": 1.6156909465789795, "learning_rate": 9.809984110169493e-06, "loss": 1.4179, "mean_token_accuracy": 0.7092466652393341, "num_tokens": 1161947.0, "step": 1436 }, { "epoch": 0.3808262711864407, "grad_norm": 1.94063138961792, "learning_rate": 9.809719279661018e-06, "loss": 1.3149, "mean_token_accuracy": 0.7113095782697201, "num_tokens": 1163265.0, "step": 1438 }, { "epoch": 0.3813559322033898, "grad_norm": 2.145890235900879, "learning_rate": 9.809454449152543e-06, "loss": 1.6444, "mean_token_accuracy": 0.635151993483305, "num_tokens": 1164919.0, "step": 1440 }, { "epoch": 0.381885593220339, "grad_norm": 1.8890119791030884, "learning_rate": 9.809189618644068e-06, "loss": 1.6791, "mean_token_accuracy": 0.6292555332183838, "num_tokens": 1166671.0, "step": 1442 }, { "epoch": 0.3824152542372881, "grad_norm": 1.8272807598114014, "learning_rate": 9.808924788135595e-06, "loss": 1.4807, "mean_token_accuracy": 0.6665691584348679, "num_tokens": 1168345.0, "step": 1444 }, { "epoch": 0.3829449152542373, "grad_norm": 2.3270390033721924, "learning_rate": 9.80865995762712e-06, "loss": 1.512, "mean_token_accuracy": 0.6936564408242702, "num_tokens": 1169916.0, "step": 1446 }, { "epoch": 0.3834745762711864, "grad_norm": 1.7452815771102905, "learning_rate": 9.808395127118645e-06, "loss": 1.4589, "mean_token_accuracy": 0.6602049581706524, "num_tokens": 1171482.0, "step": 1448 }, { "epoch": 0.3840042372881356, "grad_norm": 1.6493669748306274, "learning_rate": 9.80813029661017e-06, "loss": 1.2923, "mean_token_accuracy": 0.7041742131114006, "num_tokens": 1172929.0, "step": 1450 }, { "epoch": 0.3845338983050847, "grad_norm": 1.757163405418396, "learning_rate": 9.807865466101696e-06, "loss": 1.5842, "mean_token_accuracy": 0.6926102414727211, "num_tokens": 1174466.0, "step": 1452 }, { "epoch": 0.3850635593220339, "grad_norm": 1.656572937965393, "learning_rate": 9.807600635593221e-06, "loss": 1.7011, "mean_token_accuracy": 0.6274440065026283, "num_tokens": 1175998.0, "step": 1454 }, { "epoch": 0.3855932203389831, "grad_norm": 1.5688221454620361, "learning_rate": 9.807335805084746e-06, "loss": 1.9801, "mean_token_accuracy": 0.5886626616120338, "num_tokens": 1177765.0, "step": 1456 }, { "epoch": 0.3861228813559322, "grad_norm": 1.5039767026901245, "learning_rate": 9.807070974576271e-06, "loss": 1.3298, "mean_token_accuracy": 0.7000409439206123, "num_tokens": 1179315.0, "step": 1458 }, { "epoch": 0.3866525423728814, "grad_norm": 1.7097017765045166, "learning_rate": 9.806806144067797e-06, "loss": 1.3956, "mean_token_accuracy": 0.7114788070321083, "num_tokens": 1180969.0, "step": 1460 }, { "epoch": 0.3871822033898305, "grad_norm": 1.7143226861953735, "learning_rate": 9.806541313559322e-06, "loss": 1.5324, "mean_token_accuracy": 0.6636186316609383, "num_tokens": 1182853.0, "step": 1462 }, { "epoch": 0.3877118644067797, "grad_norm": 1.727990746498108, "learning_rate": 9.806276483050849e-06, "loss": 1.4364, "mean_token_accuracy": 0.6970421969890594, "num_tokens": 1184340.0, "step": 1464 }, { "epoch": 0.3882415254237288, "grad_norm": 1.5631020069122314, "learning_rate": 9.806011652542374e-06, "loss": 1.1571, "mean_token_accuracy": 0.7280039489269257, "num_tokens": 1185865.0, "step": 1466 }, { "epoch": 0.388771186440678, "grad_norm": 1.598578929901123, "learning_rate": 9.805746822033899e-06, "loss": 1.1733, "mean_token_accuracy": 0.7362649291753769, "num_tokens": 1187449.0, "step": 1468 }, { "epoch": 0.3893008474576271, "grad_norm": 1.7904332876205444, "learning_rate": 9.805481991525424e-06, "loss": 1.8743, "mean_token_accuracy": 0.6042033843696117, "num_tokens": 1189008.0, "step": 1470 }, { "epoch": 0.3898305084745763, "grad_norm": 1.7442598342895508, "learning_rate": 9.80521716101695e-06, "loss": 1.9402, "mean_token_accuracy": 0.6063378863036633, "num_tokens": 1190939.0, "step": 1472 }, { "epoch": 0.3903601694915254, "grad_norm": 2.1300361156463623, "learning_rate": 9.804952330508475e-06, "loss": 1.0689, "mean_token_accuracy": 0.7497252225875854, "num_tokens": 1192325.0, "step": 1474 }, { "epoch": 0.3908898305084746, "grad_norm": 1.5279004573822021, "learning_rate": 9.8046875e-06, "loss": 1.6833, "mean_token_accuracy": 0.6188357323408127, "num_tokens": 1194313.0, "step": 1476 }, { "epoch": 0.3914194915254237, "grad_norm": 1.6100242137908936, "learning_rate": 9.804422669491527e-06, "loss": 1.1739, "mean_token_accuracy": 0.7441292405128479, "num_tokens": 1195907.0, "step": 1478 }, { "epoch": 0.3919491525423729, "grad_norm": 1.6625827550888062, "learning_rate": 9.804157838983052e-06, "loss": 1.3983, "mean_token_accuracy": 0.6815410777926445, "num_tokens": 1197527.0, "step": 1480 }, { "epoch": 0.392478813559322, "grad_norm": 1.521419644355774, "learning_rate": 9.803893008474577e-06, "loss": 1.5085, "mean_token_accuracy": 0.6911141499876976, "num_tokens": 1198991.0, "step": 1482 }, { "epoch": 0.3930084745762712, "grad_norm": 1.8664625883102417, "learning_rate": 9.803628177966102e-06, "loss": 1.235, "mean_token_accuracy": 0.6946830078959465, "num_tokens": 1200801.0, "step": 1484 }, { "epoch": 0.3935381355932203, "grad_norm": 1.808510661125183, "learning_rate": 9.803363347457628e-06, "loss": 1.3601, "mean_token_accuracy": 0.696448877453804, "num_tokens": 1202523.0, "step": 1486 }, { "epoch": 0.3940677966101695, "grad_norm": 1.5490500926971436, "learning_rate": 9.803098516949153e-06, "loss": 1.2584, "mean_token_accuracy": 0.6944599002599716, "num_tokens": 1204166.0, "step": 1488 }, { "epoch": 0.3945974576271186, "grad_norm": 1.6964247226715088, "learning_rate": 9.80283368644068e-06, "loss": 1.5958, "mean_token_accuracy": 0.6666499078273773, "num_tokens": 1205945.0, "step": 1490 }, { "epoch": 0.3951271186440678, "grad_norm": 1.4197559356689453, "learning_rate": 9.802568855932205e-06, "loss": 0.9691, "mean_token_accuracy": 0.7615212872624397, "num_tokens": 1207360.0, "step": 1492 }, { "epoch": 0.3956567796610169, "grad_norm": 1.7901649475097656, "learning_rate": 9.80230402542373e-06, "loss": 1.5098, "mean_token_accuracy": 0.664769671857357, "num_tokens": 1208857.0, "step": 1494 }, { "epoch": 0.3961864406779661, "grad_norm": 1.9837372303009033, "learning_rate": 9.802039194915255e-06, "loss": 1.5927, "mean_token_accuracy": 0.6660583093762398, "num_tokens": 1210214.0, "step": 1496 }, { "epoch": 0.3967161016949153, "grad_norm": 1.6710950136184692, "learning_rate": 9.801774364406781e-06, "loss": 1.3736, "mean_token_accuracy": 0.6990925036370754, "num_tokens": 1211834.0, "step": 1498 }, { "epoch": 0.3972457627118644, "grad_norm": 1.3454054594039917, "learning_rate": 9.801509533898306e-06, "loss": 1.2701, "step": 1500 }, { "epoch": 0.3972457627118644, "eval_loss": 1.3444522619247437, "eval_mean_token_accuracy": 0.6955855406530491, "eval_num_tokens": 1213349.0, "eval_runtime": 48.294, "eval_samples_per_second": 6.378, "eval_steps_per_second": 6.378, "step": 1500 }, { "epoch": 0.3977754237288136, "grad_norm": 1.9824166297912598, "learning_rate": 9.801244703389831e-06, "loss": 1.6389, "mean_token_accuracy": 0.6930784750729799, "num_tokens": 1215125.0, "step": 1502 }, { "epoch": 0.3983050847457627, "grad_norm": 1.8930481672286987, "learning_rate": 9.800979872881356e-06, "loss": 1.6953, "mean_token_accuracy": 0.6512252688407898, "num_tokens": 1216536.0, "step": 1504 }, { "epoch": 0.3988347457627119, "grad_norm": 1.3300050497055054, "learning_rate": 9.800715042372883e-06, "loss": 1.1727, "mean_token_accuracy": 0.7024733871221542, "num_tokens": 1218105.0, "step": 1506 }, { "epoch": 0.399364406779661, "grad_norm": 1.7348800897598267, "learning_rate": 9.800450211864408e-06, "loss": 1.3462, "mean_token_accuracy": 0.6950579658150673, "num_tokens": 1219783.0, "step": 1508 }, { "epoch": 0.3998940677966102, "grad_norm": 1.6764739751815796, "learning_rate": 9.800185381355933e-06, "loss": 1.329, "mean_token_accuracy": 0.7096819058060646, "num_tokens": 1221345.0, "step": 1510 }, { "epoch": 0.4004237288135593, "grad_norm": 1.93403160572052, "learning_rate": 9.799920550847458e-06, "loss": 1.6324, "mean_token_accuracy": 0.6307551749050617, "num_tokens": 1222938.0, "step": 1512 }, { "epoch": 0.4009533898305085, "grad_norm": 1.628515601158142, "learning_rate": 9.799655720338984e-06, "loss": 1.6643, "mean_token_accuracy": 0.6470360159873962, "num_tokens": 1224449.0, "step": 1514 }, { "epoch": 0.4014830508474576, "grad_norm": 1.4391148090362549, "learning_rate": 9.799390889830509e-06, "loss": 1.3268, "mean_token_accuracy": 0.7054315134882927, "num_tokens": 1226151.0, "step": 1516 }, { "epoch": 0.4020127118644068, "grad_norm": 1.5161540508270264, "learning_rate": 9.799126059322036e-06, "loss": 0.9561, "mean_token_accuracy": 0.753193773329258, "num_tokens": 1227747.0, "step": 1518 }, { "epoch": 0.4025423728813559, "grad_norm": 1.961499810218811, "learning_rate": 9.79886122881356e-06, "loss": 1.2272, "mean_token_accuracy": 0.7214312478899956, "num_tokens": 1229159.0, "step": 1520 }, { "epoch": 0.4030720338983051, "grad_norm": 1.7978200912475586, "learning_rate": 9.798596398305086e-06, "loss": 1.0604, "mean_token_accuracy": 0.7444168701767921, "num_tokens": 1230740.0, "step": 1522 }, { "epoch": 0.4036016949152542, "grad_norm": 1.750644326210022, "learning_rate": 9.79833156779661e-06, "loss": 1.0159, "mean_token_accuracy": 0.7490158379077911, "num_tokens": 1232356.0, "step": 1524 }, { "epoch": 0.4041313559322034, "grad_norm": 2.066990613937378, "learning_rate": 9.798066737288137e-06, "loss": 1.6128, "mean_token_accuracy": 0.6461581811308861, "num_tokens": 1233851.0, "step": 1526 }, { "epoch": 0.4046610169491525, "grad_norm": 1.6663082838058472, "learning_rate": 9.797801906779662e-06, "loss": 0.9955, "mean_token_accuracy": 0.7524760514497757, "num_tokens": 1235442.0, "step": 1528 }, { "epoch": 0.4051906779661017, "grad_norm": 1.8017226457595825, "learning_rate": 9.797537076271187e-06, "loss": 1.3627, "mean_token_accuracy": 0.6951692551374435, "num_tokens": 1236854.0, "step": 1530 }, { "epoch": 0.4057203389830508, "grad_norm": 1.6061216592788696, "learning_rate": 9.797272245762712e-06, "loss": 1.102, "mean_token_accuracy": 0.74770238250494, "num_tokens": 1238575.0, "step": 1532 }, { "epoch": 0.40625, "grad_norm": 1.4825836420059204, "learning_rate": 9.797007415254238e-06, "loss": 1.5959, "mean_token_accuracy": 0.6324769482016563, "num_tokens": 1240331.0, "step": 1534 }, { "epoch": 0.4067796610169492, "grad_norm": 1.7276365756988525, "learning_rate": 9.796742584745763e-06, "loss": 1.3396, "mean_token_accuracy": 0.699121467769146, "num_tokens": 1242059.0, "step": 1536 }, { "epoch": 0.4073093220338983, "grad_norm": 1.6184040307998657, "learning_rate": 9.796477754237288e-06, "loss": 1.5542, "mean_token_accuracy": 0.663873665034771, "num_tokens": 1243885.0, "step": 1538 }, { "epoch": 0.4078389830508475, "grad_norm": 1.9388718605041504, "learning_rate": 9.796212923728813e-06, "loss": 1.4712, "mean_token_accuracy": 0.6748085394501686, "num_tokens": 1245526.0, "step": 1540 }, { "epoch": 0.4083686440677966, "grad_norm": 1.386449933052063, "learning_rate": 9.79594809322034e-06, "loss": 1.1931, "mean_token_accuracy": 0.7297926843166351, "num_tokens": 1247300.0, "step": 1542 }, { "epoch": 0.4088983050847458, "grad_norm": 1.2258257865905762, "learning_rate": 9.795683262711865e-06, "loss": 1.0642, "mean_token_accuracy": 0.7679282799363136, "num_tokens": 1248919.0, "step": 1544 }, { "epoch": 0.4094279661016949, "grad_norm": 1.6368130445480347, "learning_rate": 9.795418432203391e-06, "loss": 1.5017, "mean_token_accuracy": 0.6697660908102989, "num_tokens": 1250760.0, "step": 1546 }, { "epoch": 0.4099576271186441, "grad_norm": 2.251760244369507, "learning_rate": 9.795153601694916e-06, "loss": 1.4824, "mean_token_accuracy": 0.6787116229534149, "num_tokens": 1252099.0, "step": 1548 }, { "epoch": 0.4104872881355932, "grad_norm": 2.1314713954925537, "learning_rate": 9.794888771186441e-06, "loss": 1.6782, "mean_token_accuracy": 0.6329032555222511, "num_tokens": 1253777.0, "step": 1550 }, { "epoch": 0.4110169491525424, "grad_norm": 1.7045215368270874, "learning_rate": 9.794623940677966e-06, "loss": 1.1925, "mean_token_accuracy": 0.7374773472547531, "num_tokens": 1255166.0, "step": 1552 }, { "epoch": 0.4115466101694915, "grad_norm": 1.384372591972351, "learning_rate": 9.794359110169493e-06, "loss": 0.7494, "mean_token_accuracy": 0.8033151403069496, "num_tokens": 1256582.0, "step": 1554 }, { "epoch": 0.4120762711864407, "grad_norm": 2.235893487930298, "learning_rate": 9.794094279661018e-06, "loss": 1.3989, "mean_token_accuracy": 0.7033841833472252, "num_tokens": 1258085.0, "step": 1556 }, { "epoch": 0.4126059322033898, "grad_norm": 1.6253751516342163, "learning_rate": 9.793829449152543e-06, "loss": 1.3422, "mean_token_accuracy": 0.7014637589454651, "num_tokens": 1259427.0, "step": 1558 }, { "epoch": 0.413135593220339, "grad_norm": 1.6126431226730347, "learning_rate": 9.793564618644068e-06, "loss": 1.4768, "mean_token_accuracy": 0.6606144458055496, "num_tokens": 1261305.0, "step": 1560 }, { "epoch": 0.4136652542372881, "grad_norm": 1.6089359521865845, "learning_rate": 9.793299788135594e-06, "loss": 1.1773, "mean_token_accuracy": 0.7375832423567772, "num_tokens": 1262672.0, "step": 1562 }, { "epoch": 0.4141949152542373, "grad_norm": 1.4223570823669434, "learning_rate": 9.79303495762712e-06, "loss": 1.0401, "mean_token_accuracy": 0.7467959597706795, "num_tokens": 1264148.0, "step": 1564 }, { "epoch": 0.4147245762711864, "grad_norm": 1.9851726293563843, "learning_rate": 9.792770127118644e-06, "loss": 1.7253, "mean_token_accuracy": 0.6353067830204964, "num_tokens": 1265904.0, "step": 1566 }, { "epoch": 0.4152542372881356, "grad_norm": 1.7446296215057373, "learning_rate": 9.79250529661017e-06, "loss": 1.3408, "mean_token_accuracy": 0.698583822697401, "num_tokens": 1268037.0, "step": 1568 }, { "epoch": 0.4157838983050847, "grad_norm": 2.250060558319092, "learning_rate": 9.792240466101696e-06, "loss": 1.4502, "mean_token_accuracy": 0.6765835657715797, "num_tokens": 1269379.0, "step": 1570 }, { "epoch": 0.4163135593220339, "grad_norm": 2.355703592300415, "learning_rate": 9.791975635593222e-06, "loss": 1.8168, "mean_token_accuracy": 0.623588465154171, "num_tokens": 1270893.0, "step": 1572 }, { "epoch": 0.4168432203389831, "grad_norm": 1.7303513288497925, "learning_rate": 9.791710805084747e-06, "loss": 1.385, "mean_token_accuracy": 0.7018922045826912, "num_tokens": 1272277.0, "step": 1574 }, { "epoch": 0.4173728813559322, "grad_norm": 1.925964593887329, "learning_rate": 9.791445974576272e-06, "loss": 0.8786, "mean_token_accuracy": 0.7712843269109726, "num_tokens": 1273546.0, "step": 1576 }, { "epoch": 0.4179025423728814, "grad_norm": 1.4376938343048096, "learning_rate": 9.791181144067797e-06, "loss": 0.8927, "mean_token_accuracy": 0.7681245133280754, "num_tokens": 1274987.0, "step": 1578 }, { "epoch": 0.4184322033898305, "grad_norm": 1.6777201890945435, "learning_rate": 9.790916313559324e-06, "loss": 1.3639, "mean_token_accuracy": 0.6965077668428421, "num_tokens": 1276733.0, "step": 1580 }, { "epoch": 0.4189618644067797, "grad_norm": 1.195625901222229, "learning_rate": 9.790651483050849e-06, "loss": 0.8363, "mean_token_accuracy": 0.7895467132329941, "num_tokens": 1278704.0, "step": 1582 }, { "epoch": 0.4194915254237288, "grad_norm": 1.8651041984558105, "learning_rate": 9.790386652542374e-06, "loss": 1.6715, "mean_token_accuracy": 0.6283318474888802, "num_tokens": 1280437.0, "step": 1584 }, { "epoch": 0.420021186440678, "grad_norm": 1.6108791828155518, "learning_rate": 9.790121822033899e-06, "loss": 1.3234, "mean_token_accuracy": 0.6945262104272842, "num_tokens": 1282246.0, "step": 1586 }, { "epoch": 0.4205508474576271, "grad_norm": 1.5220295190811157, "learning_rate": 9.789856991525425e-06, "loss": 1.0465, "mean_token_accuracy": 0.7741525433957577, "num_tokens": 1283844.0, "step": 1588 }, { "epoch": 0.4210805084745763, "grad_norm": 1.7936218976974487, "learning_rate": 9.78959216101695e-06, "loss": 1.5347, "mean_token_accuracy": 0.6462323255836964, "num_tokens": 1285531.0, "step": 1590 }, { "epoch": 0.4216101694915254, "grad_norm": 1.3674556016921997, "learning_rate": 9.789327330508475e-06, "loss": 1.2088, "mean_token_accuracy": 0.7438443526625633, "num_tokens": 1286929.0, "step": 1592 }, { "epoch": 0.4221398305084746, "grad_norm": 1.4865379333496094, "learning_rate": 9.7890625e-06, "loss": 0.9193, "mean_token_accuracy": 0.7883015424013138, "num_tokens": 1288807.0, "step": 1594 }, { "epoch": 0.4226694915254237, "grad_norm": 1.843950629234314, "learning_rate": 9.788797669491527e-06, "loss": 1.0691, "mean_token_accuracy": 0.7501823678612709, "num_tokens": 1290163.0, "step": 1596 }, { "epoch": 0.4231991525423729, "grad_norm": 1.669131875038147, "learning_rate": 9.788532838983051e-06, "loss": 1.2328, "mean_token_accuracy": 0.7069834545254707, "num_tokens": 1291553.0, "step": 1598 }, { "epoch": 0.423728813559322, "grad_norm": 1.3807752132415771, "learning_rate": 9.788268008474578e-06, "loss": 1.3645, "mean_token_accuracy": 0.6859980039298534, "num_tokens": 1293308.0, "step": 1600 }, { "epoch": 0.4242584745762712, "grad_norm": 1.6034599542617798, "learning_rate": 9.788003177966103e-06, "loss": 1.3293, "mean_token_accuracy": 0.6789006367325783, "num_tokens": 1294720.0, "step": 1602 }, { "epoch": 0.4247881355932203, "grad_norm": 1.5239371061325073, "learning_rate": 9.787738347457628e-06, "loss": 0.8685, "mean_token_accuracy": 0.7827233150601387, "num_tokens": 1296367.0, "step": 1604 }, { "epoch": 0.4253177966101695, "grad_norm": 1.4315828084945679, "learning_rate": 9.787473516949153e-06, "loss": 1.3472, "mean_token_accuracy": 0.6953053027391434, "num_tokens": 1298046.0, "step": 1606 }, { "epoch": 0.4258474576271186, "grad_norm": 1.583893060684204, "learning_rate": 9.78720868644068e-06, "loss": 1.3996, "mean_token_accuracy": 0.6867002546787262, "num_tokens": 1299720.0, "step": 1608 }, { "epoch": 0.4263771186440678, "grad_norm": 2.1103620529174805, "learning_rate": 9.786943855932204e-06, "loss": 1.2172, "mean_token_accuracy": 0.7087855115532875, "num_tokens": 1301697.0, "step": 1610 }, { "epoch": 0.4269067796610169, "grad_norm": 1.53180730342865, "learning_rate": 9.78667902542373e-06, "loss": 1.4858, "mean_token_accuracy": 0.6771590709686279, "num_tokens": 1303395.0, "step": 1612 }, { "epoch": 0.4274364406779661, "grad_norm": 1.9592207670211792, "learning_rate": 9.786414194915254e-06, "loss": 1.4334, "mean_token_accuracy": 0.6753050684928894, "num_tokens": 1304834.0, "step": 1614 }, { "epoch": 0.4279661016949153, "grad_norm": 1.323996901512146, "learning_rate": 9.786149364406781e-06, "loss": 1.209, "mean_token_accuracy": 0.6912725567817688, "num_tokens": 1306626.0, "step": 1616 }, { "epoch": 0.4284957627118644, "grad_norm": 1.3585567474365234, "learning_rate": 9.785884533898306e-06, "loss": 0.8617, "mean_token_accuracy": 0.7819287702441216, "num_tokens": 1308250.0, "step": 1618 }, { "epoch": 0.4290254237288136, "grad_norm": 1.394649624824524, "learning_rate": 9.78561970338983e-06, "loss": 1.1257, "mean_token_accuracy": 0.7308533787727356, "num_tokens": 1309906.0, "step": 1620 }, { "epoch": 0.4295550847457627, "grad_norm": 1.6092897653579712, "learning_rate": 9.785354872881356e-06, "loss": 1.0719, "mean_token_accuracy": 0.741239883005619, "num_tokens": 1311626.0, "step": 1622 }, { "epoch": 0.4300847457627119, "grad_norm": 1.4813737869262695, "learning_rate": 9.785090042372882e-06, "loss": 1.686, "mean_token_accuracy": 0.637068510055542, "num_tokens": 1313133.0, "step": 1624 }, { "epoch": 0.430614406779661, "grad_norm": 1.4408694505691528, "learning_rate": 9.784825211864407e-06, "loss": 1.1948, "mean_token_accuracy": 0.7626707181334496, "num_tokens": 1314786.0, "step": 1626 }, { "epoch": 0.4311440677966102, "grad_norm": 2.463191509246826, "learning_rate": 9.784560381355934e-06, "loss": 1.7533, "mean_token_accuracy": 0.6374395489692688, "num_tokens": 1316467.0, "step": 1628 }, { "epoch": 0.4316737288135593, "grad_norm": 1.5128530263900757, "learning_rate": 9.784295550847457e-06, "loss": 0.8151, "mean_token_accuracy": 0.7932421267032623, "num_tokens": 1317964.0, "step": 1630 }, { "epoch": 0.4322033898305085, "grad_norm": 1.7706862688064575, "learning_rate": 9.784030720338984e-06, "loss": 1.6214, "mean_token_accuracy": 0.6610933393239975, "num_tokens": 1319541.0, "step": 1632 }, { "epoch": 0.4327330508474576, "grad_norm": 1.5175938606262207, "learning_rate": 9.783765889830509e-06, "loss": 1.1273, "mean_token_accuracy": 0.7146413177251816, "num_tokens": 1321146.0, "step": 1634 }, { "epoch": 0.4332627118644068, "grad_norm": 1.5122501850128174, "learning_rate": 9.783501059322035e-06, "loss": 1.3591, "mean_token_accuracy": 0.7049078345298767, "num_tokens": 1322898.0, "step": 1636 }, { "epoch": 0.4337923728813559, "grad_norm": 2.34224009513855, "learning_rate": 9.78323622881356e-06, "loss": 1.1147, "mean_token_accuracy": 0.7392509058117867, "num_tokens": 1324464.0, "step": 1638 }, { "epoch": 0.4343220338983051, "grad_norm": 1.7911791801452637, "learning_rate": 9.782971398305085e-06, "loss": 1.2362, "mean_token_accuracy": 0.7129024714231491, "num_tokens": 1325906.0, "step": 1640 }, { "epoch": 0.4348516949152542, "grad_norm": 1.6146841049194336, "learning_rate": 9.78270656779661e-06, "loss": 0.955, "mean_token_accuracy": 0.761175125837326, "num_tokens": 1327632.0, "step": 1642 }, { "epoch": 0.4353813559322034, "grad_norm": 1.536130666732788, "learning_rate": 9.782441737288137e-06, "loss": 1.2226, "mean_token_accuracy": 0.7074310854077339, "num_tokens": 1329213.0, "step": 1644 }, { "epoch": 0.4359110169491525, "grad_norm": 1.3472086191177368, "learning_rate": 9.782176906779662e-06, "loss": 1.1005, "mean_token_accuracy": 0.7456482127308846, "num_tokens": 1330907.0, "step": 1646 }, { "epoch": 0.4364406779661017, "grad_norm": 1.832914113998413, "learning_rate": 9.781912076271187e-06, "loss": 1.4767, "mean_token_accuracy": 0.6495065316557884, "num_tokens": 1332685.0, "step": 1648 }, { "epoch": 0.4369703389830508, "grad_norm": 1.6807862520217896, "learning_rate": 9.781647245762713e-06, "loss": 1.656, "mean_token_accuracy": 0.6623229086399078, "num_tokens": 1334346.0, "step": 1650 }, { "epoch": 0.4375, "grad_norm": 2.473912477493286, "learning_rate": 9.781382415254238e-06, "loss": 1.9018, "mean_token_accuracy": 0.6264105625450611, "num_tokens": 1335780.0, "step": 1652 }, { "epoch": 0.4380296610169492, "grad_norm": 1.7851107120513916, "learning_rate": 9.781117584745765e-06, "loss": 1.7737, "mean_token_accuracy": 0.6218627989292145, "num_tokens": 1337491.0, "step": 1654 }, { "epoch": 0.4385593220338983, "grad_norm": 2.1331710815429688, "learning_rate": 9.78085275423729e-06, "loss": 1.7339, "mean_token_accuracy": 0.6182245388627052, "num_tokens": 1339225.0, "step": 1656 }, { "epoch": 0.4390889830508475, "grad_norm": 1.449155569076538, "learning_rate": 9.780587923728815e-06, "loss": 1.3738, "mean_token_accuracy": 0.6788248345255852, "num_tokens": 1341077.0, "step": 1658 }, { "epoch": 0.4396186440677966, "grad_norm": 1.5949594974517822, "learning_rate": 9.78032309322034e-06, "loss": 1.4424, "mean_token_accuracy": 0.6611275225877762, "num_tokens": 1342543.0, "step": 1660 }, { "epoch": 0.4401483050847458, "grad_norm": 1.9192930459976196, "learning_rate": 9.780058262711866e-06, "loss": 1.5797, "mean_token_accuracy": 0.6744851432740688, "num_tokens": 1344165.0, "step": 1662 }, { "epoch": 0.4406779661016949, "grad_norm": 1.7334280014038086, "learning_rate": 9.779793432203391e-06, "loss": 1.1702, "mean_token_accuracy": 0.7221393696963787, "num_tokens": 1345878.0, "step": 1664 }, { "epoch": 0.4412076271186441, "grad_norm": 1.7608201503753662, "learning_rate": 9.779528601694916e-06, "loss": 1.509, "mean_token_accuracy": 0.6857901737093925, "num_tokens": 1347405.0, "step": 1666 }, { "epoch": 0.4417372881355932, "grad_norm": 2.2070603370666504, "learning_rate": 9.779263771186441e-06, "loss": 1.6415, "mean_token_accuracy": 0.6455109193921089, "num_tokens": 1349093.0, "step": 1668 }, { "epoch": 0.4422669491525424, "grad_norm": 1.6954015493392944, "learning_rate": 9.778998940677968e-06, "loss": 1.4325, "mean_token_accuracy": 0.6836129277944565, "num_tokens": 1350676.0, "step": 1670 }, { "epoch": 0.4427966101694915, "grad_norm": 1.7437763214111328, "learning_rate": 9.778734110169492e-06, "loss": 1.0913, "mean_token_accuracy": 0.7209212556481361, "num_tokens": 1352296.0, "step": 1672 }, { "epoch": 0.4433262711864407, "grad_norm": 1.851339340209961, "learning_rate": 9.778469279661017e-06, "loss": 1.7667, "mean_token_accuracy": 0.6233443580567837, "num_tokens": 1353814.0, "step": 1674 }, { "epoch": 0.4438559322033898, "grad_norm": 2.1765217781066895, "learning_rate": 9.778204449152542e-06, "loss": 1.4426, "mean_token_accuracy": 0.6668160483241081, "num_tokens": 1355354.0, "step": 1676 }, { "epoch": 0.444385593220339, "grad_norm": 1.7040940523147583, "learning_rate": 9.777939618644069e-06, "loss": 1.4157, "mean_token_accuracy": 0.6715026162564754, "num_tokens": 1357037.0, "step": 1678 }, { "epoch": 0.4449152542372881, "grad_norm": 1.7655174732208252, "learning_rate": 9.777674788135594e-06, "loss": 1.6425, "mean_token_accuracy": 0.6241929307579994, "num_tokens": 1358689.0, "step": 1680 }, { "epoch": 0.4454449152542373, "grad_norm": 1.892505168914795, "learning_rate": 9.77740995762712e-06, "loss": 1.484, "mean_token_accuracy": 0.6648385003209114, "num_tokens": 1360522.0, "step": 1682 }, { "epoch": 0.4459745762711864, "grad_norm": 1.5646804571151733, "learning_rate": 9.777145127118644e-06, "loss": 0.9555, "mean_token_accuracy": 0.7481755092740059, "num_tokens": 1362110.0, "step": 1684 }, { "epoch": 0.4465042372881356, "grad_norm": 1.5597354173660278, "learning_rate": 9.77688029661017e-06, "loss": 1.0873, "mean_token_accuracy": 0.739130474627018, "num_tokens": 1364131.0, "step": 1686 }, { "epoch": 0.4470338983050847, "grad_norm": 1.9801877737045288, "learning_rate": 9.776615466101695e-06, "loss": 1.5169, "mean_token_accuracy": 0.666796587407589, "num_tokens": 1365782.0, "step": 1688 }, { "epoch": 0.4475635593220339, "grad_norm": 1.425750494003296, "learning_rate": 9.776350635593222e-06, "loss": 1.3877, "mean_token_accuracy": 0.6746315136551857, "num_tokens": 1367526.0, "step": 1690 }, { "epoch": 0.4480932203389831, "grad_norm": 2.177065134048462, "learning_rate": 9.776085805084747e-06, "loss": 1.6928, "mean_token_accuracy": 0.6058543622493744, "num_tokens": 1369388.0, "step": 1692 }, { "epoch": 0.4486228813559322, "grad_norm": 1.8494915962219238, "learning_rate": 9.775820974576272e-06, "loss": 1.4095, "mean_token_accuracy": 0.6882108524441719, "num_tokens": 1370839.0, "step": 1694 }, { "epoch": 0.4491525423728814, "grad_norm": 1.4884287118911743, "learning_rate": 9.775556144067797e-06, "loss": 1.3993, "mean_token_accuracy": 0.661358430981636, "num_tokens": 1372653.0, "step": 1696 }, { "epoch": 0.4496822033898305, "grad_norm": 1.862597942352295, "learning_rate": 9.775291313559323e-06, "loss": 1.1075, "mean_token_accuracy": 0.7233971878886223, "num_tokens": 1374352.0, "step": 1698 }, { "epoch": 0.4502118644067797, "grad_norm": 1.629762887954712, "learning_rate": 9.775026483050848e-06, "loss": 1.1623, "mean_token_accuracy": 0.7391564771533012, "num_tokens": 1376926.0, "step": 1700 }, { "epoch": 0.4507415254237288, "grad_norm": 1.6403207778930664, "learning_rate": 9.774761652542373e-06, "loss": 1.4761, "mean_token_accuracy": 0.684290885925293, "num_tokens": 1378249.0, "step": 1702 }, { "epoch": 0.451271186440678, "grad_norm": 1.3956843614578247, "learning_rate": 9.774496822033898e-06, "loss": 1.2144, "mean_token_accuracy": 0.7261741533875465, "num_tokens": 1380282.0, "step": 1704 }, { "epoch": 0.4518008474576271, "grad_norm": 1.4056516885757446, "learning_rate": 9.774231991525425e-06, "loss": 1.3326, "mean_token_accuracy": 0.6916126608848572, "num_tokens": 1382102.0, "step": 1706 }, { "epoch": 0.4523305084745763, "grad_norm": 1.955591082572937, "learning_rate": 9.77396716101695e-06, "loss": 1.3762, "mean_token_accuracy": 0.684043250977993, "num_tokens": 1383681.0, "step": 1708 }, { "epoch": 0.4528601694915254, "grad_norm": 2.9356017112731934, "learning_rate": 9.773702330508476e-06, "loss": 1.5016, "mean_token_accuracy": 0.6896270588040352, "num_tokens": 1385029.0, "step": 1710 }, { "epoch": 0.4533898305084746, "grad_norm": 1.7857393026351929, "learning_rate": 9.7734375e-06, "loss": 1.6348, "mean_token_accuracy": 0.6673773638904095, "num_tokens": 1386557.0, "step": 1712 }, { "epoch": 0.4539194915254237, "grad_norm": 1.759330153465271, "learning_rate": 9.773172669491526e-06, "loss": 1.2343, "mean_token_accuracy": 0.7206433713436127, "num_tokens": 1388290.0, "step": 1714 }, { "epoch": 0.4544491525423729, "grad_norm": 1.8785123825073242, "learning_rate": 9.772907838983051e-06, "loss": 0.9437, "mean_token_accuracy": 0.7731888890266418, "num_tokens": 1389982.0, "step": 1716 }, { "epoch": 0.454978813559322, "grad_norm": 1.606225848197937, "learning_rate": 9.772643008474578e-06, "loss": 1.6002, "mean_token_accuracy": 0.644604966044426, "num_tokens": 1391658.0, "step": 1718 }, { "epoch": 0.4555084745762712, "grad_norm": 1.590033769607544, "learning_rate": 9.772378177966103e-06, "loss": 1.5457, "mean_token_accuracy": 0.6622063964605331, "num_tokens": 1393384.0, "step": 1720 }, { "epoch": 0.4560381355932203, "grad_norm": 1.7249886989593506, "learning_rate": 9.772113347457628e-06, "loss": 1.7109, "mean_token_accuracy": 0.6318425014615059, "num_tokens": 1394836.0, "step": 1722 }, { "epoch": 0.4565677966101695, "grad_norm": 1.547715187072754, "learning_rate": 9.771848516949153e-06, "loss": 1.1148, "mean_token_accuracy": 0.7476969212293625, "num_tokens": 1396453.0, "step": 1724 }, { "epoch": 0.4570974576271186, "grad_norm": 2.0752530097961426, "learning_rate": 9.771583686440679e-06, "loss": 1.3072, "mean_token_accuracy": 0.6946291476488113, "num_tokens": 1398014.0, "step": 1726 }, { "epoch": 0.4576271186440678, "grad_norm": 1.7325001955032349, "learning_rate": 9.771318855932204e-06, "loss": 1.5894, "mean_token_accuracy": 0.6434069201350212, "num_tokens": 1399725.0, "step": 1728 }, { "epoch": 0.4581567796610169, "grad_norm": 2.063993453979492, "learning_rate": 9.771054025423729e-06, "loss": 1.1337, "mean_token_accuracy": 0.7276319414377213, "num_tokens": 1400998.0, "step": 1730 }, { "epoch": 0.4586864406779661, "grad_norm": 1.5335273742675781, "learning_rate": 9.770789194915256e-06, "loss": 1.2195, "mean_token_accuracy": 0.7259647436439991, "num_tokens": 1402628.0, "step": 1732 }, { "epoch": 0.4592161016949153, "grad_norm": 1.610114336013794, "learning_rate": 9.77052436440678e-06, "loss": 1.1447, "mean_token_accuracy": 0.7125775814056396, "num_tokens": 1404051.0, "step": 1734 }, { "epoch": 0.4597457627118644, "grad_norm": 1.2068850994110107, "learning_rate": 9.770259533898307e-06, "loss": 1.1352, "mean_token_accuracy": 0.7399440333247185, "num_tokens": 1405805.0, "step": 1736 }, { "epoch": 0.4602754237288136, "grad_norm": 1.8459019660949707, "learning_rate": 9.76999470338983e-06, "loss": 1.9467, "mean_token_accuracy": 0.6135747469961643, "num_tokens": 1407524.0, "step": 1738 }, { "epoch": 0.4608050847457627, "grad_norm": 1.6394835710525513, "learning_rate": 9.769729872881357e-06, "loss": 1.5664, "mean_token_accuracy": 0.672406829893589, "num_tokens": 1408971.0, "step": 1740 }, { "epoch": 0.4613347457627119, "grad_norm": 1.5407962799072266, "learning_rate": 9.769465042372882e-06, "loss": 1.4291, "mean_token_accuracy": 0.6819615513086319, "num_tokens": 1410449.0, "step": 1742 }, { "epoch": 0.461864406779661, "grad_norm": 2.2277190685272217, "learning_rate": 9.769200211864409e-06, "loss": 1.5312, "mean_token_accuracy": 0.6592047587037086, "num_tokens": 1411894.0, "step": 1744 }, { "epoch": 0.4623940677966102, "grad_norm": 1.280754566192627, "learning_rate": 9.768935381355934e-06, "loss": 0.8944, "mean_token_accuracy": 0.7983585298061371, "num_tokens": 1413259.0, "step": 1746 }, { "epoch": 0.4629237288135593, "grad_norm": 1.3700469732284546, "learning_rate": 9.768670550847458e-06, "loss": 0.8676, "mean_token_accuracy": 0.7700074836611748, "num_tokens": 1414919.0, "step": 1748 }, { "epoch": 0.4634533898305085, "grad_norm": 1.2917717695236206, "learning_rate": 9.768405720338983e-06, "loss": 1.1769, "step": 1750 }, { "epoch": 0.4634533898305085, "eval_loss": 1.3407189846038818, "eval_mean_token_accuracy": 0.6955669045254782, "eval_num_tokens": 1416384.0, "eval_runtime": 48.2787, "eval_samples_per_second": 6.38, "eval_steps_per_second": 6.38, "step": 1750 }, { "epoch": 0.4639830508474576, "grad_norm": 2.09487247467041, "learning_rate": 9.76814088983051e-06, "loss": 1.3841, "mean_token_accuracy": 0.7171919420361519, "num_tokens": 1417810.0, "step": 1752 }, { "epoch": 0.4645127118644068, "grad_norm": 1.7902021408081055, "learning_rate": 9.767876059322035e-06, "loss": 1.2997, "mean_token_accuracy": 0.7031099647283554, "num_tokens": 1419344.0, "step": 1754 }, { "epoch": 0.4650423728813559, "grad_norm": 1.6086153984069824, "learning_rate": 9.76761122881356e-06, "loss": 1.2305, "mean_token_accuracy": 0.7105413600802422, "num_tokens": 1420779.0, "step": 1756 }, { "epoch": 0.4655720338983051, "grad_norm": 1.6392362117767334, "learning_rate": 9.767346398305085e-06, "loss": 1.6204, "mean_token_accuracy": 0.6515418253839016, "num_tokens": 1422678.0, "step": 1758 }, { "epoch": 0.4661016949152542, "grad_norm": 1.920868992805481, "learning_rate": 9.767081567796611e-06, "loss": 1.7063, "mean_token_accuracy": 0.6372577473521233, "num_tokens": 1424272.0, "step": 1760 }, { "epoch": 0.4666313559322034, "grad_norm": 2.0388927459716797, "learning_rate": 9.766816737288136e-06, "loss": 1.5462, "mean_token_accuracy": 0.6747164912521839, "num_tokens": 1425550.0, "step": 1762 }, { "epoch": 0.4671610169491525, "grad_norm": 1.7637664079666138, "learning_rate": 9.766551906779663e-06, "loss": 1.8391, "mean_token_accuracy": 0.6242223009467125, "num_tokens": 1427380.0, "step": 1764 }, { "epoch": 0.4676906779661017, "grad_norm": 1.5074515342712402, "learning_rate": 9.766287076271186e-06, "loss": 1.3903, "mean_token_accuracy": 0.6962914615869522, "num_tokens": 1429063.0, "step": 1766 }, { "epoch": 0.4682203389830508, "grad_norm": 1.6282706260681152, "learning_rate": 9.766022245762713e-06, "loss": 1.3572, "mean_token_accuracy": 0.6795931607484818, "num_tokens": 1430767.0, "step": 1768 }, { "epoch": 0.46875, "grad_norm": 1.601631999015808, "learning_rate": 9.765757415254238e-06, "loss": 1.1246, "mean_token_accuracy": 0.723406046628952, "num_tokens": 1432503.0, "step": 1770 }, { "epoch": 0.4692796610169492, "grad_norm": 1.8385792970657349, "learning_rate": 9.765492584745764e-06, "loss": 1.5298, "mean_token_accuracy": 0.6761112734675407, "num_tokens": 1434051.0, "step": 1772 }, { "epoch": 0.4698093220338983, "grad_norm": 1.8144336938858032, "learning_rate": 9.76522775423729e-06, "loss": 1.2165, "mean_token_accuracy": 0.7063615173101425, "num_tokens": 1435558.0, "step": 1774 }, { "epoch": 0.4703389830508475, "grad_norm": 2.007408380508423, "learning_rate": 9.764962923728814e-06, "loss": 1.3891, "mean_token_accuracy": 0.6856327280402184, "num_tokens": 1437101.0, "step": 1776 }, { "epoch": 0.4708686440677966, "grad_norm": 1.5158873796463013, "learning_rate": 9.764698093220339e-06, "loss": 1.5665, "mean_token_accuracy": 0.6522149741649628, "num_tokens": 1438777.0, "step": 1778 }, { "epoch": 0.4713983050847458, "grad_norm": 1.583121418952942, "learning_rate": 9.764433262711866e-06, "loss": 1.3628, "mean_token_accuracy": 0.6848710030317307, "num_tokens": 1440587.0, "step": 1780 }, { "epoch": 0.4719279661016949, "grad_norm": 1.8971039056777954, "learning_rate": 9.76416843220339e-06, "loss": 1.5475, "mean_token_accuracy": 0.6594791635870934, "num_tokens": 1441962.0, "step": 1782 }, { "epoch": 0.4724576271186441, "grad_norm": 1.589476466178894, "learning_rate": 9.763903601694916e-06, "loss": 0.9668, "mean_token_accuracy": 0.7517091557383537, "num_tokens": 1443330.0, "step": 1784 }, { "epoch": 0.4729872881355932, "grad_norm": 1.6800682544708252, "learning_rate": 9.76363877118644e-06, "loss": 1.5493, "mean_token_accuracy": 0.6444086730480194, "num_tokens": 1444944.0, "step": 1786 }, { "epoch": 0.4735169491525424, "grad_norm": 1.1117998361587524, "learning_rate": 9.763373940677967e-06, "loss": 1.1421, "mean_token_accuracy": 0.7510351985692978, "num_tokens": 1446667.0, "step": 1788 }, { "epoch": 0.4740466101694915, "grad_norm": 1.5494073629379272, "learning_rate": 9.763109110169492e-06, "loss": 1.5331, "mean_token_accuracy": 0.686118584126234, "num_tokens": 1448328.0, "step": 1790 }, { "epoch": 0.4745762711864407, "grad_norm": 1.8555963039398193, "learning_rate": 9.762844279661017e-06, "loss": 1.5007, "mean_token_accuracy": 0.6746472083032131, "num_tokens": 1450043.0, "step": 1792 }, { "epoch": 0.4751059322033898, "grad_norm": 1.908095359802246, "learning_rate": 9.762579449152542e-06, "loss": 1.125, "mean_token_accuracy": 0.7239080667495728, "num_tokens": 1451530.0, "step": 1794 }, { "epoch": 0.475635593220339, "grad_norm": 1.9456907510757446, "learning_rate": 9.762314618644069e-06, "loss": 1.7037, "mean_token_accuracy": 0.6453837156295776, "num_tokens": 1453151.0, "step": 1796 }, { "epoch": 0.4761652542372881, "grad_norm": 1.8172533512115479, "learning_rate": 9.762049788135594e-06, "loss": 1.5212, "mean_token_accuracy": 0.6727450489997864, "num_tokens": 1454610.0, "step": 1798 }, { "epoch": 0.4766949152542373, "grad_norm": 2.1655690670013428, "learning_rate": 9.76178495762712e-06, "loss": 1.3678, "mean_token_accuracy": 0.695250041782856, "num_tokens": 1456171.0, "step": 1800 }, { "epoch": 0.4772245762711864, "grad_norm": 1.9220002889633179, "learning_rate": 9.761520127118645e-06, "loss": 1.5303, "mean_token_accuracy": 0.6285325959324837, "num_tokens": 1458322.0, "step": 1802 }, { "epoch": 0.4777542372881356, "grad_norm": 1.9143348932266235, "learning_rate": 9.76125529661017e-06, "loss": 1.6083, "mean_token_accuracy": 0.68442003428936, "num_tokens": 1459768.0, "step": 1804 }, { "epoch": 0.4782838983050847, "grad_norm": 1.2432236671447754, "learning_rate": 9.760990466101695e-06, "loss": 1.202, "mean_token_accuracy": 0.6678652316331863, "num_tokens": 1461961.0, "step": 1806 }, { "epoch": 0.4788135593220339, "grad_norm": 1.3333191871643066, "learning_rate": 9.760725635593222e-06, "loss": 1.2644, "mean_token_accuracy": 0.7079744413495064, "num_tokens": 1463680.0, "step": 1808 }, { "epoch": 0.4793432203389831, "grad_norm": 1.4910027980804443, "learning_rate": 9.760460805084746e-06, "loss": 1.5228, "mean_token_accuracy": 0.67714549228549, "num_tokens": 1465281.0, "step": 1810 }, { "epoch": 0.4798728813559322, "grad_norm": 1.5059150457382202, "learning_rate": 9.760195974576271e-06, "loss": 1.0863, "mean_token_accuracy": 0.7291001304984093, "num_tokens": 1467277.0, "step": 1812 }, { "epoch": 0.4804025423728814, "grad_norm": 1.8311717510223389, "learning_rate": 9.759931144067798e-06, "loss": 1.3451, "mean_token_accuracy": 0.696875810623169, "num_tokens": 1468763.0, "step": 1814 }, { "epoch": 0.4809322033898305, "grad_norm": 1.4429653882980347, "learning_rate": 9.759666313559323e-06, "loss": 1.4772, "mean_token_accuracy": 0.6765041053295135, "num_tokens": 1470467.0, "step": 1816 }, { "epoch": 0.4814618644067797, "grad_norm": 1.7187200784683228, "learning_rate": 9.75940148305085e-06, "loss": 1.5353, "mean_token_accuracy": 0.6560563370585442, "num_tokens": 1472032.0, "step": 1818 }, { "epoch": 0.4819915254237288, "grad_norm": 2.0526323318481445, "learning_rate": 9.759136652542373e-06, "loss": 1.3067, "mean_token_accuracy": 0.7065559402108192, "num_tokens": 1473416.0, "step": 1820 }, { "epoch": 0.482521186440678, "grad_norm": 1.457580804824829, "learning_rate": 9.7588718220339e-06, "loss": 1.3184, "mean_token_accuracy": 0.6827302798628807, "num_tokens": 1475269.0, "step": 1822 }, { "epoch": 0.4830508474576271, "grad_norm": 1.901405930519104, "learning_rate": 9.758606991525424e-06, "loss": 1.6808, "mean_token_accuracy": 0.6390978693962097, "num_tokens": 1477516.0, "step": 1824 }, { "epoch": 0.4835805084745763, "grad_norm": 1.974132776260376, "learning_rate": 9.758342161016951e-06, "loss": 1.5675, "mean_token_accuracy": 0.6620647311210632, "num_tokens": 1478897.0, "step": 1826 }, { "epoch": 0.4841101694915254, "grad_norm": 1.5072150230407715, "learning_rate": 9.758077330508476e-06, "loss": 1.0095, "mean_token_accuracy": 0.7488392218947411, "num_tokens": 1480403.0, "step": 1828 }, { "epoch": 0.4846398305084746, "grad_norm": 1.7596622705459595, "learning_rate": 9.757812500000001e-06, "loss": 1.4283, "mean_token_accuracy": 0.6824403926730156, "num_tokens": 1481886.0, "step": 1830 }, { "epoch": 0.4851694915254237, "grad_norm": 1.512420415878296, "learning_rate": 9.757547669491526e-06, "loss": 1.2022, "mean_token_accuracy": 0.735924020409584, "num_tokens": 1483486.0, "step": 1832 }, { "epoch": 0.4856991525423729, "grad_norm": 1.7502774000167847, "learning_rate": 9.757282838983052e-06, "loss": 1.1909, "mean_token_accuracy": 0.7224372029304504, "num_tokens": 1485078.0, "step": 1834 }, { "epoch": 0.486228813559322, "grad_norm": 1.681650996208191, "learning_rate": 9.757018008474577e-06, "loss": 1.2987, "mean_token_accuracy": 0.6958970949053764, "num_tokens": 1486642.0, "step": 1836 }, { "epoch": 0.4867584745762712, "grad_norm": 1.8711559772491455, "learning_rate": 9.756753177966102e-06, "loss": 1.4673, "mean_token_accuracy": 0.6824847534298897, "num_tokens": 1488170.0, "step": 1838 }, { "epoch": 0.4872881355932203, "grad_norm": 1.2949111461639404, "learning_rate": 9.756488347457627e-06, "loss": 0.8119, "mean_token_accuracy": 0.7975327968597412, "num_tokens": 1489915.0, "step": 1840 }, { "epoch": 0.4878177966101695, "grad_norm": 1.6316587924957275, "learning_rate": 9.756223516949154e-06, "loss": 1.5662, "mean_token_accuracy": 0.6606340557336807, "num_tokens": 1491522.0, "step": 1842 }, { "epoch": 0.4883474576271186, "grad_norm": 2.264892101287842, "learning_rate": 9.755958686440679e-06, "loss": 1.3312, "mean_token_accuracy": 0.710834689438343, "num_tokens": 1492876.0, "step": 1844 }, { "epoch": 0.4888771186440678, "grad_norm": 2.0321452617645264, "learning_rate": 9.755693855932204e-06, "loss": 1.7248, "mean_token_accuracy": 0.625398188829422, "num_tokens": 1494413.0, "step": 1846 }, { "epoch": 0.4894067796610169, "grad_norm": 1.6952732801437378, "learning_rate": 9.755429025423729e-06, "loss": 1.0536, "mean_token_accuracy": 0.7247809767723083, "num_tokens": 1495916.0, "step": 1848 }, { "epoch": 0.4899364406779661, "grad_norm": 1.868277668952942, "learning_rate": 9.755164194915255e-06, "loss": 1.4006, "mean_token_accuracy": 0.6566600576043129, "num_tokens": 1497468.0, "step": 1850 }, { "epoch": 0.4904661016949153, "grad_norm": 2.0683844089508057, "learning_rate": 9.75489936440678e-06, "loss": 1.6135, "mean_token_accuracy": 0.6793633177876472, "num_tokens": 1498829.0, "step": 1852 }, { "epoch": 0.4909957627118644, "grad_norm": 1.347119688987732, "learning_rate": 9.754634533898307e-06, "loss": 1.1214, "mean_token_accuracy": 0.74251339584589, "num_tokens": 1500261.0, "step": 1854 }, { "epoch": 0.4915254237288136, "grad_norm": 2.2595162391662598, "learning_rate": 9.754369703389832e-06, "loss": 1.5266, "mean_token_accuracy": 0.678853128105402, "num_tokens": 1501741.0, "step": 1856 }, { "epoch": 0.4920550847457627, "grad_norm": 1.6617144346237183, "learning_rate": 9.754104872881357e-06, "loss": 1.6602, "mean_token_accuracy": 0.6344305351376534, "num_tokens": 1503595.0, "step": 1858 }, { "epoch": 0.4925847457627119, "grad_norm": 1.4582092761993408, "learning_rate": 9.753840042372882e-06, "loss": 1.169, "mean_token_accuracy": 0.7212079502642155, "num_tokens": 1505083.0, "step": 1860 }, { "epoch": 0.493114406779661, "grad_norm": 1.7776728868484497, "learning_rate": 9.753575211864408e-06, "loss": 1.8635, "mean_token_accuracy": 0.6140730381011963, "num_tokens": 1506585.0, "step": 1862 }, { "epoch": 0.4936440677966102, "grad_norm": 1.5301263332366943, "learning_rate": 9.753310381355933e-06, "loss": 1.1007, "mean_token_accuracy": 0.7327371761202812, "num_tokens": 1507997.0, "step": 1864 }, { "epoch": 0.4941737288135593, "grad_norm": 1.673374891281128, "learning_rate": 9.753045550847458e-06, "loss": 1.7917, "mean_token_accuracy": 0.619265791028738, "num_tokens": 1509546.0, "step": 1866 }, { "epoch": 0.4947033898305085, "grad_norm": 1.5854299068450928, "learning_rate": 9.752780720338983e-06, "loss": 1.2258, "mean_token_accuracy": 0.7307852357625961, "num_tokens": 1510897.0, "step": 1868 }, { "epoch": 0.4952330508474576, "grad_norm": 2.0224616527557373, "learning_rate": 9.75251588983051e-06, "loss": 1.3603, "mean_token_accuracy": 0.6998405307531357, "num_tokens": 1512310.0, "step": 1870 }, { "epoch": 0.4957627118644068, "grad_norm": 1.7163324356079102, "learning_rate": 9.752251059322035e-06, "loss": 1.0901, "mean_token_accuracy": 0.7330753356218338, "num_tokens": 1513848.0, "step": 1872 }, { "epoch": 0.4962923728813559, "grad_norm": 1.670668363571167, "learning_rate": 9.75198622881356e-06, "loss": 1.2833, "mean_token_accuracy": 0.722165547311306, "num_tokens": 1515657.0, "step": 1874 }, { "epoch": 0.4968220338983051, "grad_norm": 2.105088710784912, "learning_rate": 9.751721398305084e-06, "loss": 1.877, "mean_token_accuracy": 0.6108671203255653, "num_tokens": 1517301.0, "step": 1876 }, { "epoch": 0.4973516949152542, "grad_norm": 2.0531005859375, "learning_rate": 9.751456567796611e-06, "loss": 1.1793, "mean_token_accuracy": 0.7409718036651611, "num_tokens": 1518643.0, "step": 1878 }, { "epoch": 0.4978813559322034, "grad_norm": 1.50923752784729, "learning_rate": 9.751191737288136e-06, "loss": 1.6712, "mean_token_accuracy": 0.6614570878446102, "num_tokens": 1520476.0, "step": 1880 }, { "epoch": 0.4984110169491525, "grad_norm": 1.3566958904266357, "learning_rate": 9.750926906779663e-06, "loss": 0.8787, "mean_token_accuracy": 0.7929534763097763, "num_tokens": 1522210.0, "step": 1882 }, { "epoch": 0.4989406779661017, "grad_norm": 1.5555394887924194, "learning_rate": 9.750662076271188e-06, "loss": 1.2324, "mean_token_accuracy": 0.7136520892381668, "num_tokens": 1523619.0, "step": 1884 }, { "epoch": 0.4994703389830508, "grad_norm": 1.3185510635375977, "learning_rate": 9.750397245762712e-06, "loss": 1.4095, "mean_token_accuracy": 0.7007369883358479, "num_tokens": 1525197.0, "step": 1886 }, { "epoch": 0.5, "grad_norm": 1.7815288305282593, "learning_rate": 9.750132415254237e-06, "loss": 1.0274, "mean_token_accuracy": 0.7597938030958176, "num_tokens": 1526779.0, "step": 1888 }, { "epoch": 0.5005296610169492, "grad_norm": 1.9160972833633423, "learning_rate": 9.749867584745764e-06, "loss": 1.6354, "mean_token_accuracy": 0.651470772922039, "num_tokens": 1528471.0, "step": 1890 }, { "epoch": 0.5010593220338984, "grad_norm": 1.9172940254211426, "learning_rate": 9.749602754237289e-06, "loss": 1.618, "mean_token_accuracy": 0.6643747575581074, "num_tokens": 1530041.0, "step": 1892 }, { "epoch": 0.5015889830508474, "grad_norm": 2.0239369869232178, "learning_rate": 9.749337923728814e-06, "loss": 1.1992, "mean_token_accuracy": 0.7233951389789581, "num_tokens": 1531269.0, "step": 1894 }, { "epoch": 0.5021186440677966, "grad_norm": 1.2435338497161865, "learning_rate": 9.749073093220339e-06, "loss": 1.0472, "mean_token_accuracy": 0.7631554640829563, "num_tokens": 1532761.0, "step": 1896 }, { "epoch": 0.5026483050847458, "grad_norm": 1.6171574592590332, "learning_rate": 9.748808262711865e-06, "loss": 1.466, "mean_token_accuracy": 0.6771634221076965, "num_tokens": 1534209.0, "step": 1898 }, { "epoch": 0.503177966101695, "grad_norm": 1.5132508277893066, "learning_rate": 9.74854343220339e-06, "loss": 1.1379, "mean_token_accuracy": 0.7177082188427448, "num_tokens": 1536110.0, "step": 1900 }, { "epoch": 0.503707627118644, "grad_norm": 1.8165708780288696, "learning_rate": 9.748278601694915e-06, "loss": 1.2898, "mean_token_accuracy": 0.7096015065908432, "num_tokens": 1537543.0, "step": 1902 }, { "epoch": 0.5042372881355932, "grad_norm": 1.8222633600234985, "learning_rate": 9.748013771186442e-06, "loss": 1.4224, "mean_token_accuracy": 0.7031560614705086, "num_tokens": 1539282.0, "step": 1904 }, { "epoch": 0.5047669491525424, "grad_norm": 1.659479022026062, "learning_rate": 9.747748940677967e-06, "loss": 1.2636, "mean_token_accuracy": 0.6924474015831947, "num_tokens": 1540874.0, "step": 1906 }, { "epoch": 0.5052966101694916, "grad_norm": 1.8782382011413574, "learning_rate": 9.747484110169493e-06, "loss": 1.1129, "mean_token_accuracy": 0.7313853204250336, "num_tokens": 1542457.0, "step": 1908 }, { "epoch": 0.5058262711864406, "grad_norm": 1.6415693759918213, "learning_rate": 9.747219279661018e-06, "loss": 1.3167, "mean_token_accuracy": 0.7085385322570801, "num_tokens": 1543876.0, "step": 1910 }, { "epoch": 0.5063559322033898, "grad_norm": 1.446003794670105, "learning_rate": 9.746954449152543e-06, "loss": 1.0383, "mean_token_accuracy": 0.7695765867829323, "num_tokens": 1545687.0, "step": 1912 }, { "epoch": 0.506885593220339, "grad_norm": 1.6328458786010742, "learning_rate": 9.746689618644068e-06, "loss": 1.4707, "mean_token_accuracy": 0.6800980418920517, "num_tokens": 1547367.0, "step": 1914 }, { "epoch": 0.5074152542372882, "grad_norm": 1.8975054025650024, "learning_rate": 9.746424788135595e-06, "loss": 1.797, "mean_token_accuracy": 0.6037796214222908, "num_tokens": 1548979.0, "step": 1916 }, { "epoch": 0.5079449152542372, "grad_norm": 1.8772802352905273, "learning_rate": 9.74615995762712e-06, "loss": 1.5107, "mean_token_accuracy": 0.6605697274208069, "num_tokens": 1550472.0, "step": 1918 }, { "epoch": 0.5084745762711864, "grad_norm": 1.7299609184265137, "learning_rate": 9.745895127118645e-06, "loss": 1.2372, "mean_token_accuracy": 0.7082501575350761, "num_tokens": 1552262.0, "step": 1920 }, { "epoch": 0.5090042372881356, "grad_norm": 1.9918360710144043, "learning_rate": 9.74563029661017e-06, "loss": 1.5133, "mean_token_accuracy": 0.6745664775371552, "num_tokens": 1553795.0, "step": 1922 }, { "epoch": 0.5095338983050848, "grad_norm": 1.4541964530944824, "learning_rate": 9.745365466101696e-06, "loss": 1.4141, "mean_token_accuracy": 0.6951767727732658, "num_tokens": 1555470.0, "step": 1924 }, { "epoch": 0.5100635593220338, "grad_norm": 1.540474534034729, "learning_rate": 9.745100635593221e-06, "loss": 1.3642, "mean_token_accuracy": 0.6937609761953354, "num_tokens": 1557006.0, "step": 1926 }, { "epoch": 0.510593220338983, "grad_norm": 1.7772833108901978, "learning_rate": 9.744835805084746e-06, "loss": 1.169, "mean_token_accuracy": 0.7367835491895676, "num_tokens": 1558646.0, "step": 1928 }, { "epoch": 0.5111228813559322, "grad_norm": 1.4934529066085815, "learning_rate": 9.744570974576271e-06, "loss": 1.1043, "mean_token_accuracy": 0.7210254445672035, "num_tokens": 1560388.0, "step": 1930 }, { "epoch": 0.5116525423728814, "grad_norm": 1.7634834051132202, "learning_rate": 9.744306144067798e-06, "loss": 1.1053, "mean_token_accuracy": 0.7388005033135414, "num_tokens": 1561986.0, "step": 1932 }, { "epoch": 0.5121822033898306, "grad_norm": 1.2827516794204712, "learning_rate": 9.744041313559323e-06, "loss": 1.1853, "mean_token_accuracy": 0.7195433005690575, "num_tokens": 1563851.0, "step": 1934 }, { "epoch": 0.5127118644067796, "grad_norm": 1.357645869255066, "learning_rate": 9.74377648305085e-06, "loss": 1.4107, "mean_token_accuracy": 0.6835659667849541, "num_tokens": 1565507.0, "step": 1936 }, { "epoch": 0.5132415254237288, "grad_norm": 2.160515069961548, "learning_rate": 9.743511652542374e-06, "loss": 1.7003, "mean_token_accuracy": 0.6403629332780838, "num_tokens": 1567033.0, "step": 1938 }, { "epoch": 0.513771186440678, "grad_norm": 1.677495002746582, "learning_rate": 9.743246822033899e-06, "loss": 1.5934, "mean_token_accuracy": 0.6707498952746391, "num_tokens": 1568693.0, "step": 1940 }, { "epoch": 0.5143008474576272, "grad_norm": 1.4068154096603394, "learning_rate": 9.742981991525424e-06, "loss": 1.2392, "mean_token_accuracy": 0.7173566743731499, "num_tokens": 1570461.0, "step": 1942 }, { "epoch": 0.5148305084745762, "grad_norm": 1.5492899417877197, "learning_rate": 9.74271716101695e-06, "loss": 1.0464, "mean_token_accuracy": 0.7317660227417946, "num_tokens": 1572155.0, "step": 1944 }, { "epoch": 0.5153601694915254, "grad_norm": 1.6303863525390625, "learning_rate": 9.742452330508476e-06, "loss": 1.2882, "mean_token_accuracy": 0.7249630764126778, "num_tokens": 1573669.0, "step": 1946 }, { "epoch": 0.5158898305084746, "grad_norm": 1.5822417736053467, "learning_rate": 9.7421875e-06, "loss": 1.403, "mean_token_accuracy": 0.6995394825935364, "num_tokens": 1575232.0, "step": 1948 }, { "epoch": 0.5164194915254238, "grad_norm": 1.323549747467041, "learning_rate": 9.741922669491525e-06, "loss": 0.9126, "mean_token_accuracy": 0.7785408794879913, "num_tokens": 1576602.0, "step": 1950 }, { "epoch": 0.5169491525423728, "grad_norm": 2.0568292140960693, "learning_rate": 9.741657838983052e-06, "loss": 1.4354, "mean_token_accuracy": 0.6892575994133949, "num_tokens": 1578164.0, "step": 1952 }, { "epoch": 0.517478813559322, "grad_norm": 1.5494962930679321, "learning_rate": 9.741393008474577e-06, "loss": 1.3727, "mean_token_accuracy": 0.6935119330883026, "num_tokens": 1579933.0, "step": 1954 }, { "epoch": 0.5180084745762712, "grad_norm": 1.6052989959716797, "learning_rate": 9.741128177966102e-06, "loss": 1.4763, "mean_token_accuracy": 0.7014876529574394, "num_tokens": 1581340.0, "step": 1956 }, { "epoch": 0.5185381355932204, "grad_norm": 2.1648061275482178, "learning_rate": 9.740863347457627e-06, "loss": 1.6965, "mean_token_accuracy": 0.6179681867361069, "num_tokens": 1582785.0, "step": 1958 }, { "epoch": 0.5190677966101694, "grad_norm": 1.4109996557235718, "learning_rate": 9.740598516949153e-06, "loss": 1.0352, "mean_token_accuracy": 0.7664082944393158, "num_tokens": 1584428.0, "step": 1960 }, { "epoch": 0.5195974576271186, "grad_norm": 1.3088241815567017, "learning_rate": 9.740333686440678e-06, "loss": 1.0138, "mean_token_accuracy": 0.747891902923584, "num_tokens": 1586105.0, "step": 1962 }, { "epoch": 0.5201271186440678, "grad_norm": 1.4956676959991455, "learning_rate": 9.740068855932205e-06, "loss": 0.9413, "mean_token_accuracy": 0.7550933584570885, "num_tokens": 1587582.0, "step": 1964 }, { "epoch": 0.520656779661017, "grad_norm": 1.3371870517730713, "learning_rate": 9.73980402542373e-06, "loss": 1.2558, "mean_token_accuracy": 0.7119809985160828, "num_tokens": 1589209.0, "step": 1966 }, { "epoch": 0.5211864406779662, "grad_norm": 1.9008086919784546, "learning_rate": 9.739539194915255e-06, "loss": 1.3947, "mean_token_accuracy": 0.6892865151166916, "num_tokens": 1590689.0, "step": 1968 }, { "epoch": 0.5217161016949152, "grad_norm": 1.6091198921203613, "learning_rate": 9.73927436440678e-06, "loss": 1.2509, "mean_token_accuracy": 0.7482990697026253, "num_tokens": 1592230.0, "step": 1970 }, { "epoch": 0.5222457627118644, "grad_norm": 1.508324146270752, "learning_rate": 9.739009533898306e-06, "loss": 1.5469, "mean_token_accuracy": 0.641762413084507, "num_tokens": 1593858.0, "step": 1972 }, { "epoch": 0.5227754237288136, "grad_norm": 1.9664379358291626, "learning_rate": 9.738744703389831e-06, "loss": 0.86, "mean_token_accuracy": 0.78777876496315, "num_tokens": 1595123.0, "step": 1974 }, { "epoch": 0.5233050847457628, "grad_norm": 2.3702807426452637, "learning_rate": 9.738479872881356e-06, "loss": 1.5964, "mean_token_accuracy": 0.6839668899774551, "num_tokens": 1596571.0, "step": 1976 }, { "epoch": 0.5238347457627118, "grad_norm": 1.8521283864974976, "learning_rate": 9.738215042372881e-06, "loss": 1.562, "mean_token_accuracy": 0.6133120879530907, "num_tokens": 1598738.0, "step": 1978 }, { "epoch": 0.524364406779661, "grad_norm": 1.5341017246246338, "learning_rate": 9.737950211864408e-06, "loss": 1.2049, "mean_token_accuracy": 0.7184553667902946, "num_tokens": 1600183.0, "step": 1980 }, { "epoch": 0.5248940677966102, "grad_norm": 1.600361704826355, "learning_rate": 9.737685381355933e-06, "loss": 0.941, "mean_token_accuracy": 0.7714420035481453, "num_tokens": 1601683.0, "step": 1982 }, { "epoch": 0.5254237288135594, "grad_norm": 1.6894716024398804, "learning_rate": 9.737420550847458e-06, "loss": 1.1461, "mean_token_accuracy": 0.7312502339482307, "num_tokens": 1603150.0, "step": 1984 }, { "epoch": 0.5259533898305084, "grad_norm": 1.730326771736145, "learning_rate": 9.737155720338984e-06, "loss": 1.2448, "mean_token_accuracy": 0.7222347855567932, "num_tokens": 1604720.0, "step": 1986 }, { "epoch": 0.5264830508474576, "grad_norm": 1.776120901107788, "learning_rate": 9.73689088983051e-06, "loss": 1.0972, "mean_token_accuracy": 0.7431923896074295, "num_tokens": 1605948.0, "step": 1988 }, { "epoch": 0.5270127118644068, "grad_norm": 1.8741374015808105, "learning_rate": 9.736626059322036e-06, "loss": 1.5192, "mean_token_accuracy": 0.6777163743972778, "num_tokens": 1607342.0, "step": 1990 }, { "epoch": 0.527542372881356, "grad_norm": 1.6422855854034424, "learning_rate": 9.73636122881356e-06, "loss": 1.1881, "mean_token_accuracy": 0.7412798032164574, "num_tokens": 1609159.0, "step": 1992 }, { "epoch": 0.528072033898305, "grad_norm": 1.690943717956543, "learning_rate": 9.736096398305086e-06, "loss": 0.9756, "mean_token_accuracy": 0.7341416701674461, "num_tokens": 1610679.0, "step": 1994 }, { "epoch": 0.5286016949152542, "grad_norm": 1.549353837966919, "learning_rate": 9.73583156779661e-06, "loss": 1.0184, "mean_token_accuracy": 0.7482821643352509, "num_tokens": 1612458.0, "step": 1996 }, { "epoch": 0.5291313559322034, "grad_norm": 2.0687856674194336, "learning_rate": 9.735566737288137e-06, "loss": 1.6056, "mean_token_accuracy": 0.6501092724502087, "num_tokens": 1613975.0, "step": 1998 }, { "epoch": 0.5296610169491526, "grad_norm": 1.6786201000213623, "learning_rate": 9.735301906779662e-06, "loss": 1.4563, "step": 2000 }, { "epoch": 0.5296610169491526, "eval_loss": 1.3380028009414673, "eval_mean_token_accuracy": 0.6950056846072148, "eval_num_tokens": 1615755.0, "eval_runtime": 48.2871, "eval_samples_per_second": 6.379, "eval_steps_per_second": 6.379, "step": 2000 }, { "epoch": 0.5301906779661016, "grad_norm": 1.803446888923645, "learning_rate": 9.735037076271187e-06, "loss": 1.5328, "mean_token_accuracy": 0.6483834944665432, "num_tokens": 1617213.0, "step": 2002 }, { "epoch": 0.5307203389830508, "grad_norm": 1.3630051612854004, "learning_rate": 9.734772245762712e-06, "loss": 0.9654, "mean_token_accuracy": 0.7565262615680695, "num_tokens": 1618728.0, "step": 2004 }, { "epoch": 0.53125, "grad_norm": 1.4006280899047852, "learning_rate": 9.734507415254239e-06, "loss": 1.0474, "mean_token_accuracy": 0.7523226365447044, "num_tokens": 1620410.0, "step": 2006 }, { "epoch": 0.5317796610169492, "grad_norm": 1.8534959554672241, "learning_rate": 9.734242584745764e-06, "loss": 1.4087, "mean_token_accuracy": 0.6989208161830902, "num_tokens": 1622000.0, "step": 2008 }, { "epoch": 0.5323093220338984, "grad_norm": 1.7150321006774902, "learning_rate": 9.733977754237289e-06, "loss": 1.3515, "mean_token_accuracy": 0.6764537468552589, "num_tokens": 1623645.0, "step": 2010 }, { "epoch": 0.5328389830508474, "grad_norm": 1.3910512924194336, "learning_rate": 9.733712923728813e-06, "loss": 1.0918, "mean_token_accuracy": 0.7338629588484764, "num_tokens": 1625199.0, "step": 2012 }, { "epoch": 0.5333686440677966, "grad_norm": 1.9316178560256958, "learning_rate": 9.73344809322034e-06, "loss": 1.7814, "mean_token_accuracy": 0.6426534578204155, "num_tokens": 1626694.0, "step": 2014 }, { "epoch": 0.5338983050847458, "grad_norm": 2.0669538974761963, "learning_rate": 9.733183262711865e-06, "loss": 1.8433, "mean_token_accuracy": 0.6038735695183277, "num_tokens": 1628347.0, "step": 2016 }, { "epoch": 0.534427966101695, "grad_norm": 1.469369649887085, "learning_rate": 9.732918432203392e-06, "loss": 0.9715, "mean_token_accuracy": 0.7746047973632812, "num_tokens": 1629830.0, "step": 2018 }, { "epoch": 0.534957627118644, "grad_norm": 2.0266542434692383, "learning_rate": 9.732653601694917e-06, "loss": 1.4658, "mean_token_accuracy": 0.6687359400093555, "num_tokens": 1631393.0, "step": 2020 }, { "epoch": 0.5354872881355932, "grad_norm": 1.6418269872665405, "learning_rate": 9.732388771186441e-06, "loss": 1.6491, "mean_token_accuracy": 0.6583049446344376, "num_tokens": 1632958.0, "step": 2022 }, { "epoch": 0.5360169491525424, "grad_norm": 1.826748013496399, "learning_rate": 9.732123940677966e-06, "loss": 1.152, "mean_token_accuracy": 0.7241136878728867, "num_tokens": 1634800.0, "step": 2024 }, { "epoch": 0.5365466101694916, "grad_norm": 1.4656051397323608, "learning_rate": 9.731859110169493e-06, "loss": 1.2575, "mean_token_accuracy": 0.6980898752808571, "num_tokens": 1636393.0, "step": 2026 }, { "epoch": 0.5370762711864406, "grad_norm": 1.5499402284622192, "learning_rate": 9.731594279661018e-06, "loss": 1.5058, "mean_token_accuracy": 0.6674896702170372, "num_tokens": 1638150.0, "step": 2028 }, { "epoch": 0.5376059322033898, "grad_norm": 1.3952147960662842, "learning_rate": 9.731329449152543e-06, "loss": 1.1875, "mean_token_accuracy": 0.7044788897037506, "num_tokens": 1639964.0, "step": 2030 }, { "epoch": 0.538135593220339, "grad_norm": 1.9511994123458862, "learning_rate": 9.731064618644068e-06, "loss": 1.4726, "mean_token_accuracy": 0.6505935974419117, "num_tokens": 1641348.0, "step": 2032 }, { "epoch": 0.5386652542372882, "grad_norm": 1.7998766899108887, "learning_rate": 9.730799788135594e-06, "loss": 1.2521, "mean_token_accuracy": 0.7140576243400574, "num_tokens": 1642762.0, "step": 2034 }, { "epoch": 0.5391949152542372, "grad_norm": 1.4910862445831299, "learning_rate": 9.73053495762712e-06, "loss": 1.6187, "mean_token_accuracy": 0.6478224247694016, "num_tokens": 1644445.0, "step": 2036 }, { "epoch": 0.5397245762711864, "grad_norm": 1.6894186735153198, "learning_rate": 9.730270127118644e-06, "loss": 1.7627, "mean_token_accuracy": 0.6336371153593063, "num_tokens": 1646008.0, "step": 2038 }, { "epoch": 0.5402542372881356, "grad_norm": 1.847400426864624, "learning_rate": 9.73000529661017e-06, "loss": 1.4564, "mean_token_accuracy": 0.6799597963690758, "num_tokens": 1647562.0, "step": 2040 }, { "epoch": 0.5407838983050848, "grad_norm": 1.727852702140808, "learning_rate": 9.729740466101696e-06, "loss": 1.2051, "mean_token_accuracy": 0.7272579595446587, "num_tokens": 1648995.0, "step": 2042 }, { "epoch": 0.5413135593220338, "grad_norm": 1.4905411005020142, "learning_rate": 9.72947563559322e-06, "loss": 1.4344, "mean_token_accuracy": 0.6742021217942238, "num_tokens": 1650817.0, "step": 2044 }, { "epoch": 0.541843220338983, "grad_norm": 1.6928743124008179, "learning_rate": 9.729210805084747e-06, "loss": 1.3224, "mean_token_accuracy": 0.7196889109909534, "num_tokens": 1652442.0, "step": 2046 }, { "epoch": 0.5423728813559322, "grad_norm": 1.9153369665145874, "learning_rate": 9.728945974576272e-06, "loss": 1.6933, "mean_token_accuracy": 0.6329126320779324, "num_tokens": 1653978.0, "step": 2048 }, { "epoch": 0.5429025423728814, "grad_norm": 1.7648147344589233, "learning_rate": 9.728681144067797e-06, "loss": 1.4393, "mean_token_accuracy": 0.6737483888864517, "num_tokens": 1655424.0, "step": 2050 }, { "epoch": 0.5434322033898306, "grad_norm": 1.6894630193710327, "learning_rate": 9.728416313559322e-06, "loss": 1.579, "mean_token_accuracy": 0.6617598608136177, "num_tokens": 1657069.0, "step": 2052 }, { "epoch": 0.5439618644067796, "grad_norm": 1.685349702835083, "learning_rate": 9.728151483050849e-06, "loss": 1.2662, "mean_token_accuracy": 0.7162421271204948, "num_tokens": 1658600.0, "step": 2054 }, { "epoch": 0.5444915254237288, "grad_norm": 1.4804563522338867, "learning_rate": 9.727886652542374e-06, "loss": 1.3937, "mean_token_accuracy": 0.6596196070313454, "num_tokens": 1660276.0, "step": 2056 }, { "epoch": 0.545021186440678, "grad_norm": 2.355865955352783, "learning_rate": 9.727621822033899e-06, "loss": 1.7678, "mean_token_accuracy": 0.642203327268362, "num_tokens": 1661695.0, "step": 2058 }, { "epoch": 0.5455508474576272, "grad_norm": 1.581179141998291, "learning_rate": 9.727356991525424e-06, "loss": 1.3958, "mean_token_accuracy": 0.6732088774442673, "num_tokens": 1663400.0, "step": 2060 }, { "epoch": 0.5460805084745762, "grad_norm": 1.8189448118209839, "learning_rate": 9.72709216101695e-06, "loss": 1.4439, "mean_token_accuracy": 0.667088720947504, "num_tokens": 1664823.0, "step": 2062 }, { "epoch": 0.5466101694915254, "grad_norm": 1.7414922714233398, "learning_rate": 9.726827330508475e-06, "loss": 1.319, "mean_token_accuracy": 0.7112227454781532, "num_tokens": 1666380.0, "step": 2064 }, { "epoch": 0.5471398305084746, "grad_norm": 1.4803756475448608, "learning_rate": 9.7265625e-06, "loss": 1.3211, "mean_token_accuracy": 0.7046902701258659, "num_tokens": 1667949.0, "step": 2066 }, { "epoch": 0.5476694915254238, "grad_norm": 1.3671575784683228, "learning_rate": 9.726297669491527e-06, "loss": 1.0852, "mean_token_accuracy": 0.7228281050920486, "num_tokens": 1669542.0, "step": 2068 }, { "epoch": 0.5481991525423728, "grad_norm": 1.862953543663025, "learning_rate": 9.726032838983052e-06, "loss": 1.0718, "mean_token_accuracy": 0.7467397749423981, "num_tokens": 1671212.0, "step": 2070 }, { "epoch": 0.548728813559322, "grad_norm": 1.574859380722046, "learning_rate": 9.725768008474578e-06, "loss": 1.752, "mean_token_accuracy": 0.6012471467256546, "num_tokens": 1672869.0, "step": 2072 }, { "epoch": 0.5492584745762712, "grad_norm": 1.3835095167160034, "learning_rate": 9.725503177966103e-06, "loss": 1.1504, "mean_token_accuracy": 0.7252127081155777, "num_tokens": 1674688.0, "step": 2074 }, { "epoch": 0.5497881355932204, "grad_norm": 1.813191294670105, "learning_rate": 9.725238347457628e-06, "loss": 1.4342, "mean_token_accuracy": 0.6718426272273064, "num_tokens": 1676371.0, "step": 2076 }, { "epoch": 0.5503177966101694, "grad_norm": 1.5185033082962036, "learning_rate": 9.724973516949153e-06, "loss": 1.9164, "mean_token_accuracy": 0.6056056618690491, "num_tokens": 1678011.0, "step": 2078 }, { "epoch": 0.5508474576271186, "grad_norm": 1.5543395280838013, "learning_rate": 9.72470868644068e-06, "loss": 1.4569, "mean_token_accuracy": 0.6836750581860542, "num_tokens": 1679547.0, "step": 2080 }, { "epoch": 0.5513771186440678, "grad_norm": 1.6090961694717407, "learning_rate": 9.724443855932205e-06, "loss": 1.1361, "mean_token_accuracy": 0.7322148829698563, "num_tokens": 1681051.0, "step": 2082 }, { "epoch": 0.551906779661017, "grad_norm": 1.6773605346679688, "learning_rate": 9.72417902542373e-06, "loss": 1.6492, "mean_token_accuracy": 0.6497936695814133, "num_tokens": 1682539.0, "step": 2084 }, { "epoch": 0.5524364406779662, "grad_norm": 1.5883748531341553, "learning_rate": 9.723914194915254e-06, "loss": 1.4225, "mean_token_accuracy": 0.6690484136343002, "num_tokens": 1684287.0, "step": 2086 }, { "epoch": 0.5529661016949152, "grad_norm": 1.7309212684631348, "learning_rate": 9.723649364406781e-06, "loss": 1.4032, "mean_token_accuracy": 0.7043334990739822, "num_tokens": 1685706.0, "step": 2088 }, { "epoch": 0.5534957627118644, "grad_norm": 1.450629472732544, "learning_rate": 9.723384533898306e-06, "loss": 1.1826, "mean_token_accuracy": 0.7117701210081577, "num_tokens": 1687391.0, "step": 2090 }, { "epoch": 0.5540254237288136, "grad_norm": 2.1954386234283447, "learning_rate": 9.723119703389831e-06, "loss": 1.3068, "mean_token_accuracy": 0.7082920148968697, "num_tokens": 1688664.0, "step": 2092 }, { "epoch": 0.5545550847457628, "grad_norm": 1.8025346994400024, "learning_rate": 9.722854872881356e-06, "loss": 1.4545, "mean_token_accuracy": 0.6770236566662788, "num_tokens": 1690081.0, "step": 2094 }, { "epoch": 0.5550847457627118, "grad_norm": 2.321769952774048, "learning_rate": 9.722590042372883e-06, "loss": 1.2434, "mean_token_accuracy": 0.7362562343478203, "num_tokens": 1691546.0, "step": 2096 }, { "epoch": 0.555614406779661, "grad_norm": 2.034105062484741, "learning_rate": 9.722325211864407e-06, "loss": 1.5529, "mean_token_accuracy": 0.6754162311553955, "num_tokens": 1692804.0, "step": 2098 }, { "epoch": 0.5561440677966102, "grad_norm": 2.0022268295288086, "learning_rate": 9.722060381355934e-06, "loss": 1.5701, "mean_token_accuracy": 0.6618568003177643, "num_tokens": 1694226.0, "step": 2100 }, { "epoch": 0.5566737288135594, "grad_norm": 1.8316961526870728, "learning_rate": 9.721795550847459e-06, "loss": 1.4483, "mean_token_accuracy": 0.6858396753668785, "num_tokens": 1695719.0, "step": 2102 }, { "epoch": 0.5572033898305084, "grad_norm": 1.7939246892929077, "learning_rate": 9.721530720338984e-06, "loss": 1.7564, "mean_token_accuracy": 0.6723771505057812, "num_tokens": 1697505.0, "step": 2104 }, { "epoch": 0.5577330508474576, "grad_norm": 1.1868056058883667, "learning_rate": 9.721265889830509e-06, "loss": 1.1829, "mean_token_accuracy": 0.7182565182447433, "num_tokens": 1699249.0, "step": 2106 }, { "epoch": 0.5582627118644068, "grad_norm": 1.6636114120483398, "learning_rate": 9.721001059322035e-06, "loss": 1.4157, "mean_token_accuracy": 0.6953052021563053, "num_tokens": 1700765.0, "step": 2108 }, { "epoch": 0.558792372881356, "grad_norm": 1.812814474105835, "learning_rate": 9.72073622881356e-06, "loss": 1.4662, "mean_token_accuracy": 0.6788622587919235, "num_tokens": 1702441.0, "step": 2110 }, { "epoch": 0.559322033898305, "grad_norm": 1.8155345916748047, "learning_rate": 9.720471398305085e-06, "loss": 1.8377, "mean_token_accuracy": 0.622750036418438, "num_tokens": 1704286.0, "step": 2112 }, { "epoch": 0.5598516949152542, "grad_norm": 1.2771790027618408, "learning_rate": 9.72020656779661e-06, "loss": 1.1893, "mean_token_accuracy": 0.715017706155777, "num_tokens": 1705924.0, "step": 2114 }, { "epoch": 0.5603813559322034, "grad_norm": 1.254517912864685, "learning_rate": 9.719941737288137e-06, "loss": 0.97, "mean_token_accuracy": 0.7688115239143372, "num_tokens": 1707413.0, "step": 2116 }, { "epoch": 0.5609110169491526, "grad_norm": 1.9951578378677368, "learning_rate": 9.719676906779662e-06, "loss": 1.6476, "mean_token_accuracy": 0.6508770920336246, "num_tokens": 1708911.0, "step": 2118 }, { "epoch": 0.5614406779661016, "grad_norm": 1.379452109336853, "learning_rate": 9.719412076271187e-06, "loss": 1.4794, "mean_token_accuracy": 0.6714907437562943, "num_tokens": 1710727.0, "step": 2120 }, { "epoch": 0.5619703389830508, "grad_norm": 1.156790018081665, "learning_rate": 9.719147245762712e-06, "loss": 1.0768, "mean_token_accuracy": 0.7411370426416397, "num_tokens": 1712358.0, "step": 2122 }, { "epoch": 0.5625, "grad_norm": 1.1869542598724365, "learning_rate": 9.718882415254238e-06, "loss": 1.2457, "mean_token_accuracy": 0.71034986525774, "num_tokens": 1713945.0, "step": 2124 }, { "epoch": 0.5630296610169492, "grad_norm": 1.924541711807251, "learning_rate": 9.718617584745763e-06, "loss": 1.612, "mean_token_accuracy": 0.6547224968671799, "num_tokens": 1715323.0, "step": 2126 }, { "epoch": 0.5635593220338984, "grad_norm": 2.034205675125122, "learning_rate": 9.71835275423729e-06, "loss": 1.4467, "mean_token_accuracy": 0.6880942359566689, "num_tokens": 1716755.0, "step": 2128 }, { "epoch": 0.5640889830508474, "grad_norm": 1.3705986738204956, "learning_rate": 9.718087923728815e-06, "loss": 1.213, "mean_token_accuracy": 0.7089291661977768, "num_tokens": 1718446.0, "step": 2130 }, { "epoch": 0.5646186440677966, "grad_norm": 1.405678629875183, "learning_rate": 9.71782309322034e-06, "loss": 1.5238, "mean_token_accuracy": 0.6734248623251915, "num_tokens": 1720110.0, "step": 2132 }, { "epoch": 0.5651483050847458, "grad_norm": 1.4865165948867798, "learning_rate": 9.717558262711865e-06, "loss": 1.7958, "mean_token_accuracy": 0.6208979189395905, "num_tokens": 1722038.0, "step": 2134 }, { "epoch": 0.565677966101695, "grad_norm": 1.8308395147323608, "learning_rate": 9.717293432203391e-06, "loss": 1.4053, "mean_token_accuracy": 0.6849080845713615, "num_tokens": 1723436.0, "step": 2136 }, { "epoch": 0.566207627118644, "grad_norm": 1.695237636566162, "learning_rate": 9.717028601694916e-06, "loss": 1.4008, "mean_token_accuracy": 0.6719419211149216, "num_tokens": 1724998.0, "step": 2138 }, { "epoch": 0.5667372881355932, "grad_norm": 1.659246563911438, "learning_rate": 9.716763771186441e-06, "loss": 1.455, "mean_token_accuracy": 0.6718964800238609, "num_tokens": 1726659.0, "step": 2140 }, { "epoch": 0.5672669491525424, "grad_norm": 1.513053297996521, "learning_rate": 9.716498940677966e-06, "loss": 1.2604, "mean_token_accuracy": 0.7028441429138184, "num_tokens": 1728381.0, "step": 2142 }, { "epoch": 0.5677966101694916, "grad_norm": 1.3405859470367432, "learning_rate": 9.716234110169493e-06, "loss": 1.1279, "mean_token_accuracy": 0.7449511885643005, "num_tokens": 1729851.0, "step": 2144 }, { "epoch": 0.5683262711864406, "grad_norm": 1.4798280000686646, "learning_rate": 9.715969279661018e-06, "loss": 1.3489, "mean_token_accuracy": 0.6636105328798294, "num_tokens": 1731594.0, "step": 2146 }, { "epoch": 0.5688559322033898, "grad_norm": 1.8968931436538696, "learning_rate": 9.715704449152543e-06, "loss": 1.517, "mean_token_accuracy": 0.6544384062290192, "num_tokens": 1733283.0, "step": 2148 }, { "epoch": 0.569385593220339, "grad_norm": 1.4543992280960083, "learning_rate": 9.715439618644067e-06, "loss": 1.2906, "mean_token_accuracy": 0.6991326808929443, "num_tokens": 1734773.0, "step": 2150 }, { "epoch": 0.5699152542372882, "grad_norm": 2.1086714267730713, "learning_rate": 9.715174788135594e-06, "loss": 1.6778, "mean_token_accuracy": 0.6320537552237511, "num_tokens": 1736168.0, "step": 2152 }, { "epoch": 0.5704449152542372, "grad_norm": 1.6461230516433716, "learning_rate": 9.71490995762712e-06, "loss": 1.3013, "mean_token_accuracy": 0.6919222101569176, "num_tokens": 1737722.0, "step": 2154 }, { "epoch": 0.5709745762711864, "grad_norm": 1.407004714012146, "learning_rate": 9.714645127118646e-06, "loss": 1.5354, "mean_token_accuracy": 0.6550887748599052, "num_tokens": 1739372.0, "step": 2156 }, { "epoch": 0.5715042372881356, "grad_norm": 1.215889573097229, "learning_rate": 9.71438029661017e-06, "loss": 0.8184, "mean_token_accuracy": 0.7828953638672829, "num_tokens": 1740708.0, "step": 2158 }, { "epoch": 0.5720338983050848, "grad_norm": 1.6613695621490479, "learning_rate": 9.714115466101695e-06, "loss": 1.1362, "mean_token_accuracy": 0.7371480762958527, "num_tokens": 1742124.0, "step": 2160 }, { "epoch": 0.5725635593220338, "grad_norm": 1.721983790397644, "learning_rate": 9.713850635593222e-06, "loss": 1.356, "mean_token_accuracy": 0.7234439849853516, "num_tokens": 1743596.0, "step": 2162 }, { "epoch": 0.573093220338983, "grad_norm": 1.375092625617981, "learning_rate": 9.713585805084747e-06, "loss": 1.2859, "mean_token_accuracy": 0.707643385976553, "num_tokens": 1745291.0, "step": 2164 }, { "epoch": 0.5736228813559322, "grad_norm": 1.3689022064208984, "learning_rate": 9.713320974576272e-06, "loss": 1.1746, "mean_token_accuracy": 0.7344382107257843, "num_tokens": 1746979.0, "step": 2166 }, { "epoch": 0.5741525423728814, "grad_norm": 2.295504093170166, "learning_rate": 9.713056144067797e-06, "loss": 1.5091, "mean_token_accuracy": 0.6577697768807411, "num_tokens": 1748359.0, "step": 2168 }, { "epoch": 0.5746822033898306, "grad_norm": 1.6053709983825684, "learning_rate": 9.712791313559324e-06, "loss": 1.8379, "mean_token_accuracy": 0.6015245690941811, "num_tokens": 1750150.0, "step": 2170 }, { "epoch": 0.5752118644067796, "grad_norm": 2.2525815963745117, "learning_rate": 9.712526483050848e-06, "loss": 1.592, "mean_token_accuracy": 0.6666229739785194, "num_tokens": 1751576.0, "step": 2172 }, { "epoch": 0.5757415254237288, "grad_norm": 2.1662867069244385, "learning_rate": 9.712261652542373e-06, "loss": 1.7169, "mean_token_accuracy": 0.6381828784942627, "num_tokens": 1753279.0, "step": 2174 }, { "epoch": 0.576271186440678, "grad_norm": 1.6172462701797485, "learning_rate": 9.711996822033898e-06, "loss": 1.4738, "mean_token_accuracy": 0.6740087941288948, "num_tokens": 1754827.0, "step": 2176 }, { "epoch": 0.5768008474576272, "grad_norm": 1.914474606513977, "learning_rate": 9.711731991525425e-06, "loss": 1.3508, "mean_token_accuracy": 0.6947726011276245, "num_tokens": 1756701.0, "step": 2178 }, { "epoch": 0.5773305084745762, "grad_norm": 1.8453608751296997, "learning_rate": 9.71146716101695e-06, "loss": 1.6702, "mean_token_accuracy": 0.6608395650982857, "num_tokens": 1758188.0, "step": 2180 }, { "epoch": 0.5778601694915254, "grad_norm": 1.708807349205017, "learning_rate": 9.711202330508476e-06, "loss": 1.3126, "mean_token_accuracy": 0.6902076415717602, "num_tokens": 1759633.0, "step": 2182 }, { "epoch": 0.5783898305084746, "grad_norm": 1.9014272689819336, "learning_rate": 9.710937500000001e-06, "loss": 1.4466, "mean_token_accuracy": 0.7055534198880196, "num_tokens": 1760972.0, "step": 2184 }, { "epoch": 0.5789194915254238, "grad_norm": 1.5186940431594849, "learning_rate": 9.710672669491526e-06, "loss": 1.2893, "mean_token_accuracy": 0.7061970308423042, "num_tokens": 1762566.0, "step": 2186 }, { "epoch": 0.5794491525423728, "grad_norm": 1.7290945053100586, "learning_rate": 9.710407838983051e-06, "loss": 1.1883, "mean_token_accuracy": 0.7141411602497101, "num_tokens": 1764016.0, "step": 2188 }, { "epoch": 0.579978813559322, "grad_norm": 1.5263969898223877, "learning_rate": 9.710143008474578e-06, "loss": 1.3502, "mean_token_accuracy": 0.6828342154622078, "num_tokens": 1765799.0, "step": 2190 }, { "epoch": 0.5805084745762712, "grad_norm": 1.8269966840744019, "learning_rate": 9.709878177966103e-06, "loss": 1.4312, "mean_token_accuracy": 0.68209533020854, "num_tokens": 1767081.0, "step": 2192 }, { "epoch": 0.5810381355932204, "grad_norm": 1.2866827249526978, "learning_rate": 9.709613347457628e-06, "loss": 1.1577, "mean_token_accuracy": 0.7495226636528969, "num_tokens": 1768888.0, "step": 2194 }, { "epoch": 0.5815677966101694, "grad_norm": 1.9383875131607056, "learning_rate": 9.709348516949153e-06, "loss": 1.7344, "mean_token_accuracy": 0.6441267915070057, "num_tokens": 1770432.0, "step": 2196 }, { "epoch": 0.5820974576271186, "grad_norm": 1.636741042137146, "learning_rate": 9.70908368644068e-06, "loss": 1.4681, "mean_token_accuracy": 0.6742912344634533, "num_tokens": 1772005.0, "step": 2198 }, { "epoch": 0.5826271186440678, "grad_norm": 1.6301501989364624, "learning_rate": 9.708818855932204e-06, "loss": 1.3841, "mean_token_accuracy": 0.6999574266374111, "num_tokens": 1773789.0, "step": 2200 }, { "epoch": 0.583156779661017, "grad_norm": 1.927649974822998, "learning_rate": 9.70855402542373e-06, "loss": 1.3777, "mean_token_accuracy": 0.6808512955904007, "num_tokens": 1775292.0, "step": 2202 }, { "epoch": 0.5836864406779662, "grad_norm": 1.5467357635498047, "learning_rate": 9.708289194915254e-06, "loss": 0.8929, "mean_token_accuracy": 0.7811969220638275, "num_tokens": 1777034.0, "step": 2204 }, { "epoch": 0.5842161016949152, "grad_norm": 1.428007960319519, "learning_rate": 9.70802436440678e-06, "loss": 1.1628, "mean_token_accuracy": 0.716401468962431, "num_tokens": 1778570.0, "step": 2206 }, { "epoch": 0.5847457627118644, "grad_norm": 1.9222997426986694, "learning_rate": 9.707759533898306e-06, "loss": 1.2158, "mean_token_accuracy": 0.7113586962223053, "num_tokens": 1780172.0, "step": 2208 }, { "epoch": 0.5852754237288136, "grad_norm": 2.102008581161499, "learning_rate": 9.707494703389832e-06, "loss": 1.5671, "mean_token_accuracy": 0.6717322021722794, "num_tokens": 1781906.0, "step": 2210 }, { "epoch": 0.5858050847457628, "grad_norm": 1.5500807762145996, "learning_rate": 9.707229872881356e-06, "loss": 1.5574, "mean_token_accuracy": 0.6579554900527, "num_tokens": 1783651.0, "step": 2212 }, { "epoch": 0.5863347457627118, "grad_norm": 1.6783454418182373, "learning_rate": 9.706965042372882e-06, "loss": 1.4, "mean_token_accuracy": 0.6701654642820358, "num_tokens": 1785300.0, "step": 2214 }, { "epoch": 0.586864406779661, "grad_norm": 1.7738155126571655, "learning_rate": 9.706700211864407e-06, "loss": 1.8459, "mean_token_accuracy": 0.599560372531414, "num_tokens": 1786978.0, "step": 2216 }, { "epoch": 0.5873940677966102, "grad_norm": 1.3738319873809814, "learning_rate": 9.706435381355934e-06, "loss": 1.1829, "mean_token_accuracy": 0.708396814763546, "num_tokens": 1788590.0, "step": 2218 }, { "epoch": 0.5879237288135594, "grad_norm": 1.406063199043274, "learning_rate": 9.706170550847459e-06, "loss": 1.2166, "mean_token_accuracy": 0.7110121883451939, "num_tokens": 1790273.0, "step": 2220 }, { "epoch": 0.5884533898305084, "grad_norm": 1.6612316370010376, "learning_rate": 9.705905720338984e-06, "loss": 1.5973, "mean_token_accuracy": 0.6738273352384567, "num_tokens": 1791974.0, "step": 2222 }, { "epoch": 0.5889830508474576, "grad_norm": 1.5717909336090088, "learning_rate": 9.705640889830508e-06, "loss": 1.4003, "mean_token_accuracy": 0.6965897716581821, "num_tokens": 1793372.0, "step": 2224 }, { "epoch": 0.5895127118644068, "grad_norm": 1.8490023612976074, "learning_rate": 9.705376059322035e-06, "loss": 1.6103, "mean_token_accuracy": 0.6374319642782211, "num_tokens": 1795002.0, "step": 2226 }, { "epoch": 0.590042372881356, "grad_norm": 2.2487471103668213, "learning_rate": 9.70511122881356e-06, "loss": 1.3263, "mean_token_accuracy": 0.695892371237278, "num_tokens": 1796489.0, "step": 2228 }, { "epoch": 0.590572033898305, "grad_norm": 1.5511726140975952, "learning_rate": 9.704846398305085e-06, "loss": 1.5126, "mean_token_accuracy": 0.6721299290657043, "num_tokens": 1798254.0, "step": 2230 }, { "epoch": 0.5911016949152542, "grad_norm": 1.6242996454238892, "learning_rate": 9.70458156779661e-06, "loss": 1.2286, "mean_token_accuracy": 0.6904207840561867, "num_tokens": 1799967.0, "step": 2232 }, { "epoch": 0.5916313559322034, "grad_norm": 1.6666009426116943, "learning_rate": 9.704316737288137e-06, "loss": 1.4821, "mean_token_accuracy": 0.662010945379734, "num_tokens": 1801742.0, "step": 2234 }, { "epoch": 0.5921610169491526, "grad_norm": 1.7961345911026, "learning_rate": 9.704051906779663e-06, "loss": 1.5021, "mean_token_accuracy": 0.6639653965830803, "num_tokens": 1803264.0, "step": 2236 }, { "epoch": 0.5926906779661016, "grad_norm": 1.5386420488357544, "learning_rate": 9.703787076271188e-06, "loss": 1.1829, "mean_token_accuracy": 0.7202104926109314, "num_tokens": 1804877.0, "step": 2238 }, { "epoch": 0.5932203389830508, "grad_norm": 2.054304361343384, "learning_rate": 9.703522245762713e-06, "loss": 1.9111, "mean_token_accuracy": 0.5819600000977516, "num_tokens": 1806445.0, "step": 2240 }, { "epoch": 0.59375, "grad_norm": 1.3771647214889526, "learning_rate": 9.703257415254238e-06, "loss": 1.3028, "mean_token_accuracy": 0.7006783932447433, "num_tokens": 1807914.0, "step": 2242 }, { "epoch": 0.5942796610169492, "grad_norm": 1.9869074821472168, "learning_rate": 9.702992584745765e-06, "loss": 1.5105, "mean_token_accuracy": 0.6394255198538303, "num_tokens": 1809665.0, "step": 2244 }, { "epoch": 0.5948093220338984, "grad_norm": 1.4010671377182007, "learning_rate": 9.70272775423729e-06, "loss": 1.3521, "mean_token_accuracy": 0.7102522179484367, "num_tokens": 1811126.0, "step": 2246 }, { "epoch": 0.5953389830508474, "grad_norm": 1.6916388273239136, "learning_rate": 9.702462923728814e-06, "loss": 1.2752, "mean_token_accuracy": 0.709571585059166, "num_tokens": 1812617.0, "step": 2248 }, { "epoch": 0.5958686440677966, "grad_norm": 1.2341632843017578, "learning_rate": 9.70219809322034e-06, "loss": 0.9319, "step": 2250 }, { "epoch": 0.5958686440677966, "eval_loss": 1.3351248502731323, "eval_mean_token_accuracy": 0.6958780127105775, "eval_num_tokens": 1814330.0, "eval_runtime": 48.0262, "eval_samples_per_second": 6.413, "eval_steps_per_second": 6.413, "step": 2250 }, { "epoch": 0.5963983050847458, "grad_norm": 1.6734459400177002, "learning_rate": 9.701933262711866e-06, "loss": 1.5556, "mean_token_accuracy": 0.6982061620801687, "num_tokens": 1816278.0, "step": 2252 }, { "epoch": 0.596927966101695, "grad_norm": 1.8943653106689453, "learning_rate": 9.701668432203391e-06, "loss": 1.2572, "mean_token_accuracy": 0.7182703390717506, "num_tokens": 1817766.0, "step": 2254 }, { "epoch": 0.597457627118644, "grad_norm": 1.9313881397247314, "learning_rate": 9.701403601694916e-06, "loss": 1.6253, "mean_token_accuracy": 0.6598510965704918, "num_tokens": 1819421.0, "step": 2256 }, { "epoch": 0.5979872881355932, "grad_norm": 1.5320346355438232, "learning_rate": 9.70113877118644e-06, "loss": 1.3399, "mean_token_accuracy": 0.7036316692829132, "num_tokens": 1820872.0, "step": 2258 }, { "epoch": 0.5985169491525424, "grad_norm": 1.9343857765197754, "learning_rate": 9.700873940677967e-06, "loss": 1.2863, "mean_token_accuracy": 0.6961497515439987, "num_tokens": 1822374.0, "step": 2260 }, { "epoch": 0.5990466101694916, "grad_norm": 1.4437477588653564, "learning_rate": 9.700609110169492e-06, "loss": 1.1843, "mean_token_accuracy": 0.7418482750654221, "num_tokens": 1824018.0, "step": 2262 }, { "epoch": 0.5995762711864406, "grad_norm": 1.8929193019866943, "learning_rate": 9.700344279661019e-06, "loss": 1.5687, "mean_token_accuracy": 0.6591523215174675, "num_tokens": 1825603.0, "step": 2264 }, { "epoch": 0.6001059322033898, "grad_norm": 1.8140475749969482, "learning_rate": 9.700079449152542e-06, "loss": 1.3211, "mean_token_accuracy": 0.6907073296606541, "num_tokens": 1827142.0, "step": 2266 }, { "epoch": 0.600635593220339, "grad_norm": 1.5080877542495728, "learning_rate": 9.699814618644069e-06, "loss": 1.0869, "mean_token_accuracy": 0.7469917014241219, "num_tokens": 1828667.0, "step": 2268 }, { "epoch": 0.6011652542372882, "grad_norm": 1.4603172540664673, "learning_rate": 9.699549788135594e-06, "loss": 1.3815, "mean_token_accuracy": 0.6789924204349518, "num_tokens": 1830350.0, "step": 2270 }, { "epoch": 0.6016949152542372, "grad_norm": 1.7997899055480957, "learning_rate": 9.69928495762712e-06, "loss": 1.5087, "mean_token_accuracy": 0.6636911779642105, "num_tokens": 1831852.0, "step": 2272 }, { "epoch": 0.6022245762711864, "grad_norm": 1.5298187732696533, "learning_rate": 9.699020127118645e-06, "loss": 1.1774, "mean_token_accuracy": 0.71010472625494, "num_tokens": 1833362.0, "step": 2274 }, { "epoch": 0.6027542372881356, "grad_norm": 2.481649398803711, "learning_rate": 9.69875529661017e-06, "loss": 1.5845, "mean_token_accuracy": 0.6321504004299641, "num_tokens": 1834768.0, "step": 2276 }, { "epoch": 0.6032838983050848, "grad_norm": 1.6584196090698242, "learning_rate": 9.698490466101695e-06, "loss": 1.0939, "mean_token_accuracy": 0.7528806626796722, "num_tokens": 1836254.0, "step": 2278 }, { "epoch": 0.6038135593220338, "grad_norm": 2.1603500843048096, "learning_rate": 9.698225635593222e-06, "loss": 1.455, "mean_token_accuracy": 0.6768086925148964, "num_tokens": 1837499.0, "step": 2280 }, { "epoch": 0.604343220338983, "grad_norm": 1.3886525630950928, "learning_rate": 9.697960805084747e-06, "loss": 1.4037, "mean_token_accuracy": 0.6785335913300514, "num_tokens": 1839072.0, "step": 2282 }, { "epoch": 0.6048728813559322, "grad_norm": 1.7112398147583008, "learning_rate": 9.697695974576272e-06, "loss": 1.416, "mean_token_accuracy": 0.6962729245424271, "num_tokens": 1840690.0, "step": 2284 }, { "epoch": 0.6054025423728814, "grad_norm": 1.667656421661377, "learning_rate": 9.697431144067797e-06, "loss": 1.2426, "mean_token_accuracy": 0.7215655371546745, "num_tokens": 1842472.0, "step": 2286 }, { "epoch": 0.6059322033898306, "grad_norm": 2.087174654006958, "learning_rate": 9.697166313559323e-06, "loss": 1.1924, "mean_token_accuracy": 0.7283900901675224, "num_tokens": 1844444.0, "step": 2288 }, { "epoch": 0.6064618644067796, "grad_norm": 1.377856969833374, "learning_rate": 9.696901483050848e-06, "loss": 1.0974, "mean_token_accuracy": 0.7360668927431107, "num_tokens": 1846094.0, "step": 2290 }, { "epoch": 0.6069915254237288, "grad_norm": 2.000337839126587, "learning_rate": 9.696636652542375e-06, "loss": 1.662, "mean_token_accuracy": 0.6518987119197845, "num_tokens": 1847931.0, "step": 2292 }, { "epoch": 0.607521186440678, "grad_norm": 1.4618178606033325, "learning_rate": 9.696371822033898e-06, "loss": 1.3159, "mean_token_accuracy": 0.6823034361004829, "num_tokens": 1849710.0, "step": 2294 }, { "epoch": 0.6080508474576272, "grad_norm": 1.6765475273132324, "learning_rate": 9.696106991525425e-06, "loss": 1.5159, "mean_token_accuracy": 0.6444305405020714, "num_tokens": 1851465.0, "step": 2296 }, { "epoch": 0.6085805084745762, "grad_norm": 2.0793375968933105, "learning_rate": 9.69584216101695e-06, "loss": 1.1366, "mean_token_accuracy": 0.7264189124107361, "num_tokens": 1853060.0, "step": 2298 }, { "epoch": 0.6091101694915254, "grad_norm": 1.2715520858764648, "learning_rate": 9.695577330508476e-06, "loss": 1.3606, "mean_token_accuracy": 0.6936153694987297, "num_tokens": 1854473.0, "step": 2300 }, { "epoch": 0.6096398305084746, "grad_norm": 1.9665082693099976, "learning_rate": 9.695312500000001e-06, "loss": 1.4937, "mean_token_accuracy": 0.6998507156968117, "num_tokens": 1855930.0, "step": 2302 }, { "epoch": 0.6101694915254238, "grad_norm": 1.6035188436508179, "learning_rate": 9.695047669491526e-06, "loss": 1.3073, "mean_token_accuracy": 0.7064255550503731, "num_tokens": 1857387.0, "step": 2304 }, { "epoch": 0.6106991525423728, "grad_norm": 1.5422568321228027, "learning_rate": 9.694782838983051e-06, "loss": 1.163, "mean_token_accuracy": 0.7223349064588547, "num_tokens": 1858937.0, "step": 2306 }, { "epoch": 0.611228813559322, "grad_norm": 1.7228890657424927, "learning_rate": 9.694518008474578e-06, "loss": 1.3106, "mean_token_accuracy": 0.6996585130691528, "num_tokens": 1861003.0, "step": 2308 }, { "epoch": 0.6117584745762712, "grad_norm": 1.6721683740615845, "learning_rate": 9.694253177966102e-06, "loss": 1.0784, "mean_token_accuracy": 0.749823659658432, "num_tokens": 1862453.0, "step": 2310 }, { "epoch": 0.6122881355932204, "grad_norm": 1.8225154876708984, "learning_rate": 9.693988347457627e-06, "loss": 1.5305, "mean_token_accuracy": 0.6539452411234379, "num_tokens": 1863903.0, "step": 2312 }, { "epoch": 0.6128177966101694, "grad_norm": 1.0977017879486084, "learning_rate": 9.693723516949152e-06, "loss": 1.0432, "mean_token_accuracy": 0.7803827524185181, "num_tokens": 1865721.0, "step": 2314 }, { "epoch": 0.6133474576271186, "grad_norm": 1.8275130987167358, "learning_rate": 9.693458686440679e-06, "loss": 1.2777, "mean_token_accuracy": 0.709928885102272, "num_tokens": 1867165.0, "step": 2316 }, { "epoch": 0.6138771186440678, "grad_norm": 1.5603967905044556, "learning_rate": 9.693193855932204e-06, "loss": 1.1109, "mean_token_accuracy": 0.7469398155808449, "num_tokens": 1868640.0, "step": 2318 }, { "epoch": 0.614406779661017, "grad_norm": 1.4969664812088013, "learning_rate": 9.692929025423729e-06, "loss": 1.2812, "mean_token_accuracy": 0.7102150171995163, "num_tokens": 1870305.0, "step": 2320 }, { "epoch": 0.6149364406779662, "grad_norm": 1.5240553617477417, "learning_rate": 9.692664194915255e-06, "loss": 1.3934, "mean_token_accuracy": 0.6767150685191154, "num_tokens": 1871942.0, "step": 2322 }, { "epoch": 0.6154661016949152, "grad_norm": 1.8668211698532104, "learning_rate": 9.69239936440678e-06, "loss": 1.8045, "mean_token_accuracy": 0.6271221190690994, "num_tokens": 1873464.0, "step": 2324 }, { "epoch": 0.6159957627118644, "grad_norm": 1.793695092201233, "learning_rate": 9.692134533898307e-06, "loss": 1.4406, "mean_token_accuracy": 0.7075807973742485, "num_tokens": 1875037.0, "step": 2326 }, { "epoch": 0.6165254237288136, "grad_norm": 2.817629337310791, "learning_rate": 9.691869703389832e-06, "loss": 1.226, "mean_token_accuracy": 0.7341335341334343, "num_tokens": 1876522.0, "step": 2328 }, { "epoch": 0.6170550847457628, "grad_norm": 1.5935689210891724, "learning_rate": 9.691604872881357e-06, "loss": 1.2273, "mean_token_accuracy": 0.7153059393167496, "num_tokens": 1877986.0, "step": 2330 }, { "epoch": 0.6175847457627118, "grad_norm": 1.5589171648025513, "learning_rate": 9.691340042372882e-06, "loss": 1.3918, "mean_token_accuracy": 0.6985113620758057, "num_tokens": 1879757.0, "step": 2332 }, { "epoch": 0.618114406779661, "grad_norm": 1.6373465061187744, "learning_rate": 9.691075211864408e-06, "loss": 1.3571, "mean_token_accuracy": 0.7055788934230804, "num_tokens": 1881240.0, "step": 2334 }, { "epoch": 0.6186440677966102, "grad_norm": 2.1663777828216553, "learning_rate": 9.690810381355933e-06, "loss": 1.3305, "mean_token_accuracy": 0.6881780922412872, "num_tokens": 1883049.0, "step": 2336 }, { "epoch": 0.6191737288135594, "grad_norm": 1.8778471946716309, "learning_rate": 9.690545550847458e-06, "loss": 1.497, "mean_token_accuracy": 0.6603462919592857, "num_tokens": 1884467.0, "step": 2338 }, { "epoch": 0.6197033898305084, "grad_norm": 1.6699421405792236, "learning_rate": 9.690280720338983e-06, "loss": 1.0352, "mean_token_accuracy": 0.7829964980483055, "num_tokens": 1885901.0, "step": 2340 }, { "epoch": 0.6202330508474576, "grad_norm": 1.8439053297042847, "learning_rate": 9.69001588983051e-06, "loss": 1.6223, "mean_token_accuracy": 0.6427019163966179, "num_tokens": 1887502.0, "step": 2342 }, { "epoch": 0.6207627118644068, "grad_norm": 1.4654145240783691, "learning_rate": 9.689751059322035e-06, "loss": 1.2942, "mean_token_accuracy": 0.6926639005541801, "num_tokens": 1889070.0, "step": 2344 }, { "epoch": 0.621292372881356, "grad_norm": 1.7694525718688965, "learning_rate": 9.689486228813561e-06, "loss": 1.6674, "mean_token_accuracy": 0.6323520466685295, "num_tokens": 1890674.0, "step": 2346 }, { "epoch": 0.621822033898305, "grad_norm": 1.9896084070205688, "learning_rate": 9.689221398305085e-06, "loss": 1.173, "mean_token_accuracy": 0.7119884714484215, "num_tokens": 1892218.0, "step": 2348 }, { "epoch": 0.6223516949152542, "grad_norm": 1.6174070835113525, "learning_rate": 9.688956567796611e-06, "loss": 1.381, "mean_token_accuracy": 0.7011027708649635, "num_tokens": 1893847.0, "step": 2350 }, { "epoch": 0.6228813559322034, "grad_norm": 1.5064202547073364, "learning_rate": 9.688691737288136e-06, "loss": 1.2445, "mean_token_accuracy": 0.7109973952174187, "num_tokens": 1895426.0, "step": 2352 }, { "epoch": 0.6234110169491526, "grad_norm": 2.1290431022644043, "learning_rate": 9.688426906779663e-06, "loss": 1.5284, "mean_token_accuracy": 0.6634340360760689, "num_tokens": 1897286.0, "step": 2354 }, { "epoch": 0.6239406779661016, "grad_norm": 1.495470404624939, "learning_rate": 9.688162076271188e-06, "loss": 1.1458, "mean_token_accuracy": 0.7424758225679398, "num_tokens": 1898740.0, "step": 2356 }, { "epoch": 0.6244703389830508, "grad_norm": 1.728976845741272, "learning_rate": 9.687897245762713e-06, "loss": 1.2304, "mean_token_accuracy": 0.701297651976347, "num_tokens": 1900064.0, "step": 2358 }, { "epoch": 0.625, "grad_norm": 1.448072075843811, "learning_rate": 9.687632415254238e-06, "loss": 1.4181, "mean_token_accuracy": 0.6912899389863014, "num_tokens": 1901579.0, "step": 2360 }, { "epoch": 0.6255296610169492, "grad_norm": 1.6823879480361938, "learning_rate": 9.687367584745764e-06, "loss": 1.0111, "mean_token_accuracy": 0.7285708785057068, "num_tokens": 1903177.0, "step": 2362 }, { "epoch": 0.6260593220338984, "grad_norm": 1.560465931892395, "learning_rate": 9.687102754237289e-06, "loss": 1.2142, "mean_token_accuracy": 0.7145688161253929, "num_tokens": 1904789.0, "step": 2364 }, { "epoch": 0.6265889830508474, "grad_norm": 1.3849644660949707, "learning_rate": 9.686837923728814e-06, "loss": 1.3547, "mean_token_accuracy": 0.6909924671053886, "num_tokens": 1907406.0, "step": 2366 }, { "epoch": 0.6271186440677966, "grad_norm": 1.342113971710205, "learning_rate": 9.686573093220339e-06, "loss": 1.2411, "mean_token_accuracy": 0.7179146632552147, "num_tokens": 1909144.0, "step": 2368 }, { "epoch": 0.6276483050847458, "grad_norm": 1.5523910522460938, "learning_rate": 9.686308262711866e-06, "loss": 1.1637, "mean_token_accuracy": 0.7078383974730968, "num_tokens": 1910670.0, "step": 2370 }, { "epoch": 0.628177966101695, "grad_norm": 1.4602047204971313, "learning_rate": 9.68604343220339e-06, "loss": 1.3994, "mean_token_accuracy": 0.6869008727371693, "num_tokens": 1912406.0, "step": 2372 }, { "epoch": 0.628707627118644, "grad_norm": 1.6091536283493042, "learning_rate": 9.685778601694915e-06, "loss": 1.5793, "mean_token_accuracy": 0.6710401326417923, "num_tokens": 1914030.0, "step": 2374 }, { "epoch": 0.6292372881355932, "grad_norm": 1.9809423685073853, "learning_rate": 9.68551377118644e-06, "loss": 1.2114, "mean_token_accuracy": 0.7268080860376358, "num_tokens": 1915568.0, "step": 2376 }, { "epoch": 0.6297669491525424, "grad_norm": 1.4806934595108032, "learning_rate": 9.685248940677967e-06, "loss": 1.3205, "mean_token_accuracy": 0.7173827067017555, "num_tokens": 1917248.0, "step": 2378 }, { "epoch": 0.6302966101694916, "grad_norm": 1.5381455421447754, "learning_rate": 9.684984110169492e-06, "loss": 1.5702, "mean_token_accuracy": 0.6714353896677494, "num_tokens": 1918837.0, "step": 2380 }, { "epoch": 0.6308262711864406, "grad_norm": 1.6660664081573486, "learning_rate": 9.684719279661019e-06, "loss": 1.3419, "mean_token_accuracy": 0.7037245631217957, "num_tokens": 1920352.0, "step": 2382 }, { "epoch": 0.6313559322033898, "grad_norm": 1.7164207696914673, "learning_rate": 9.684454449152543e-06, "loss": 1.2911, "mean_token_accuracy": 0.7182725295424461, "num_tokens": 1922036.0, "step": 2384 }, { "epoch": 0.631885593220339, "grad_norm": 1.6483821868896484, "learning_rate": 9.684189618644068e-06, "loss": 1.5874, "mean_token_accuracy": 0.6739106252789497, "num_tokens": 1923720.0, "step": 2386 }, { "epoch": 0.6324152542372882, "grad_norm": 1.756303310394287, "learning_rate": 9.683924788135593e-06, "loss": 1.5378, "mean_token_accuracy": 0.6494290456175804, "num_tokens": 1925345.0, "step": 2388 }, { "epoch": 0.6329449152542372, "grad_norm": 1.3491055965423584, "learning_rate": 9.68365995762712e-06, "loss": 1.1638, "mean_token_accuracy": 0.7519441619515419, "num_tokens": 1926910.0, "step": 2390 }, { "epoch": 0.6334745762711864, "grad_norm": 1.546776533126831, "learning_rate": 9.683395127118645e-06, "loss": 1.301, "mean_token_accuracy": 0.7132455855607986, "num_tokens": 1928511.0, "step": 2392 }, { "epoch": 0.6340042372881356, "grad_norm": 2.239060163497925, "learning_rate": 9.68313029661017e-06, "loss": 1.7149, "mean_token_accuracy": 0.6512337625026703, "num_tokens": 1929797.0, "step": 2394 }, { "epoch": 0.6345338983050848, "grad_norm": 1.7564201354980469, "learning_rate": 9.682865466101695e-06, "loss": 1.4955, "mean_token_accuracy": 0.6616012677550316, "num_tokens": 1931526.0, "step": 2396 }, { "epoch": 0.6350635593220338, "grad_norm": 1.5695327520370483, "learning_rate": 9.682600635593221e-06, "loss": 1.4974, "mean_token_accuracy": 0.6755221113562584, "num_tokens": 1933033.0, "step": 2398 }, { "epoch": 0.635593220338983, "grad_norm": 2.101060152053833, "learning_rate": 9.682335805084746e-06, "loss": 1.3397, "mean_token_accuracy": 0.7205767445266247, "num_tokens": 1934486.0, "step": 2400 }, { "epoch": 0.6361228813559322, "grad_norm": 1.8184696435928345, "learning_rate": 9.682070974576271e-06, "loss": 1.7194, "mean_token_accuracy": 0.6365551352500916, "num_tokens": 1936076.0, "step": 2402 }, { "epoch": 0.6366525423728814, "grad_norm": 1.3706187009811401, "learning_rate": 9.681806144067798e-06, "loss": 1.3166, "mean_token_accuracy": 0.6845975667238235, "num_tokens": 1937974.0, "step": 2404 }, { "epoch": 0.6371822033898306, "grad_norm": 1.790781021118164, "learning_rate": 9.681541313559323e-06, "loss": 1.0992, "mean_token_accuracy": 0.7337211892008781, "num_tokens": 1939711.0, "step": 2406 }, { "epoch": 0.6377118644067796, "grad_norm": 1.410043716430664, "learning_rate": 9.68127648305085e-06, "loss": 0.9957, "mean_token_accuracy": 0.758410632610321, "num_tokens": 1941121.0, "step": 2408 }, { "epoch": 0.6382415254237288, "grad_norm": 1.6389435529708862, "learning_rate": 9.681011652542374e-06, "loss": 1.1363, "mean_token_accuracy": 0.7183384448289871, "num_tokens": 1942735.0, "step": 2410 }, { "epoch": 0.638771186440678, "grad_norm": 1.4245797395706177, "learning_rate": 9.6807468220339e-06, "loss": 1.6108, "mean_token_accuracy": 0.6718983687460423, "num_tokens": 1944510.0, "step": 2412 }, { "epoch": 0.6393008474576272, "grad_norm": 1.3144702911376953, "learning_rate": 9.680481991525424e-06, "loss": 1.2494, "mean_token_accuracy": 0.6971545144915581, "num_tokens": 1946235.0, "step": 2414 }, { "epoch": 0.6398305084745762, "grad_norm": 2.489927053451538, "learning_rate": 9.68021716101695e-06, "loss": 1.1472, "mean_token_accuracy": 0.736391544342041, "num_tokens": 1947491.0, "step": 2416 }, { "epoch": 0.6403601694915254, "grad_norm": 1.4721519947052002, "learning_rate": 9.679952330508476e-06, "loss": 1.5712, "mean_token_accuracy": 0.6461226865649223, "num_tokens": 1949165.0, "step": 2418 }, { "epoch": 0.6408898305084746, "grad_norm": 1.6239153146743774, "learning_rate": 9.6796875e-06, "loss": 1.2626, "mean_token_accuracy": 0.7013596370816231, "num_tokens": 1950793.0, "step": 2420 }, { "epoch": 0.6414194915254238, "grad_norm": 1.4396820068359375, "learning_rate": 9.679422669491526e-06, "loss": 1.2081, "mean_token_accuracy": 0.7055396735668182, "num_tokens": 1952272.0, "step": 2422 }, { "epoch": 0.6419491525423728, "grad_norm": 1.9915364980697632, "learning_rate": 9.679157838983052e-06, "loss": 1.7344, "mean_token_accuracy": 0.6220896169543266, "num_tokens": 1953634.0, "step": 2424 }, { "epoch": 0.642478813559322, "grad_norm": 1.639696478843689, "learning_rate": 9.678893008474577e-06, "loss": 1.5738, "mean_token_accuracy": 0.6500451192259789, "num_tokens": 1955363.0, "step": 2426 }, { "epoch": 0.6430084745762712, "grad_norm": 1.9303687810897827, "learning_rate": 9.678628177966102e-06, "loss": 1.8408, "mean_token_accuracy": 0.5813092961907387, "num_tokens": 1956823.0, "step": 2428 }, { "epoch": 0.6435381355932204, "grad_norm": 1.5722771883010864, "learning_rate": 9.678363347457627e-06, "loss": 1.3023, "mean_token_accuracy": 0.7024793401360512, "num_tokens": 1958291.0, "step": 2430 }, { "epoch": 0.6440677966101694, "grad_norm": 1.494184136390686, "learning_rate": 9.678098516949154e-06, "loss": 1.3924, "mean_token_accuracy": 0.6881704404950142, "num_tokens": 1960080.0, "step": 2432 }, { "epoch": 0.6445974576271186, "grad_norm": 1.5443035364151, "learning_rate": 9.677833686440679e-06, "loss": 1.3039, "mean_token_accuracy": 0.7161934599280357, "num_tokens": 1961545.0, "step": 2434 }, { "epoch": 0.6451271186440678, "grad_norm": 2.0824315547943115, "learning_rate": 9.677568855932205e-06, "loss": 1.4941, "mean_token_accuracy": 0.6681026741862297, "num_tokens": 1963054.0, "step": 2436 }, { "epoch": 0.645656779661017, "grad_norm": 1.8507604598999023, "learning_rate": 9.67730402542373e-06, "loss": 1.3991, "mean_token_accuracy": 0.6869510188698769, "num_tokens": 1964781.0, "step": 2438 }, { "epoch": 0.6461864406779662, "grad_norm": 1.6750472784042358, "learning_rate": 9.677039194915255e-06, "loss": 1.6512, "mean_token_accuracy": 0.6393877640366554, "num_tokens": 1966547.0, "step": 2440 }, { "epoch": 0.6467161016949152, "grad_norm": 2.0962984561920166, "learning_rate": 9.67677436440678e-06, "loss": 1.1462, "mean_token_accuracy": 0.7276422902941704, "num_tokens": 1967887.0, "step": 2442 }, { "epoch": 0.6472457627118644, "grad_norm": 1.4699180126190186, "learning_rate": 9.676509533898307e-06, "loss": 1.1376, "mean_token_accuracy": 0.7380979955196381, "num_tokens": 1969306.0, "step": 2444 }, { "epoch": 0.6477754237288136, "grad_norm": 1.7708054780960083, "learning_rate": 9.676244703389832e-06, "loss": 1.6458, "mean_token_accuracy": 0.6533284410834312, "num_tokens": 1970924.0, "step": 2446 }, { "epoch": 0.6483050847457628, "grad_norm": 1.6927727460861206, "learning_rate": 9.675979872881356e-06, "loss": 0.9749, "mean_token_accuracy": 0.7710631862282753, "num_tokens": 1972190.0, "step": 2448 }, { "epoch": 0.6488347457627118, "grad_norm": 1.8763681650161743, "learning_rate": 9.675715042372881e-06, "loss": 1.433, "mean_token_accuracy": 0.6718738526105881, "num_tokens": 1973612.0, "step": 2450 }, { "epoch": 0.649364406779661, "grad_norm": 1.2786797285079956, "learning_rate": 9.675450211864408e-06, "loss": 1.011, "mean_token_accuracy": 0.7640543431043625, "num_tokens": 1975228.0, "step": 2452 }, { "epoch": 0.6498940677966102, "grad_norm": 1.9202979803085327, "learning_rate": 9.675185381355933e-06, "loss": 1.3294, "mean_token_accuracy": 0.7123861014842987, "num_tokens": 1976613.0, "step": 2454 }, { "epoch": 0.6504237288135594, "grad_norm": 1.9714518785476685, "learning_rate": 9.674920550847458e-06, "loss": 1.2477, "mean_token_accuracy": 0.706626333296299, "num_tokens": 1978014.0, "step": 2456 }, { "epoch": 0.6509533898305084, "grad_norm": 2.027533531188965, "learning_rate": 9.674655720338983e-06, "loss": 1.3717, "mean_token_accuracy": 0.681680791079998, "num_tokens": 1979468.0, "step": 2458 }, { "epoch": 0.6514830508474576, "grad_norm": 1.3640114068984985, "learning_rate": 9.67439088983051e-06, "loss": 0.8559, "mean_token_accuracy": 0.8075682148337364, "num_tokens": 1980804.0, "step": 2460 }, { "epoch": 0.6520127118644068, "grad_norm": 1.4911576509475708, "learning_rate": 9.674126059322034e-06, "loss": 1.1987, "mean_token_accuracy": 0.7088237777352333, "num_tokens": 1982624.0, "step": 2462 }, { "epoch": 0.652542372881356, "grad_norm": 1.5671404600143433, "learning_rate": 9.673861228813561e-06, "loss": 1.5145, "mean_token_accuracy": 0.6412906125187874, "num_tokens": 1984164.0, "step": 2464 }, { "epoch": 0.653072033898305, "grad_norm": 1.6106507778167725, "learning_rate": 9.673596398305086e-06, "loss": 1.7277, "mean_token_accuracy": 0.6084743738174438, "num_tokens": 1985989.0, "step": 2466 }, { "epoch": 0.6536016949152542, "grad_norm": 1.7712230682373047, "learning_rate": 9.67333156779661e-06, "loss": 1.3981, "mean_token_accuracy": 0.7050968110561371, "num_tokens": 1987493.0, "step": 2468 }, { "epoch": 0.6541313559322034, "grad_norm": 1.4891653060913086, "learning_rate": 9.673066737288136e-06, "loss": 1.3963, "mean_token_accuracy": 0.6839020922780037, "num_tokens": 1989092.0, "step": 2470 }, { "epoch": 0.6546610169491526, "grad_norm": 1.8139883279800415, "learning_rate": 9.672801906779662e-06, "loss": 1.5514, "mean_token_accuracy": 0.6567831560969353, "num_tokens": 1990706.0, "step": 2472 }, { "epoch": 0.6551906779661016, "grad_norm": 1.5682170391082764, "learning_rate": 9.672537076271187e-06, "loss": 1.4509, "mean_token_accuracy": 0.7025499492883682, "num_tokens": 1992280.0, "step": 2474 }, { "epoch": 0.6557203389830508, "grad_norm": 1.5563991069793701, "learning_rate": 9.672272245762712e-06, "loss": 1.4677, "mean_token_accuracy": 0.6750174276530743, "num_tokens": 1993939.0, "step": 2476 }, { "epoch": 0.65625, "grad_norm": 2.316089391708374, "learning_rate": 9.672007415254237e-06, "loss": 1.6291, "mean_token_accuracy": 0.6525920629501343, "num_tokens": 1995586.0, "step": 2478 }, { "epoch": 0.6567796610169492, "grad_norm": 2.1642062664031982, "learning_rate": 9.671742584745764e-06, "loss": 1.3893, "mean_token_accuracy": 0.6761788725852966, "num_tokens": 1997123.0, "step": 2480 }, { "epoch": 0.6573093220338984, "grad_norm": 1.4830608367919922, "learning_rate": 9.671477754237289e-06, "loss": 1.2522, "mean_token_accuracy": 0.7265413478016853, "num_tokens": 1998844.0, "step": 2482 }, { "epoch": 0.6578389830508474, "grad_norm": 1.716334342956543, "learning_rate": 9.671212923728814e-06, "loss": 1.3377, "mean_token_accuracy": 0.7004310339689255, "num_tokens": 2000273.0, "step": 2484 }, { "epoch": 0.6583686440677966, "grad_norm": 1.5738413333892822, "learning_rate": 9.670948093220339e-06, "loss": 1.2278, "mean_token_accuracy": 0.7257298305630684, "num_tokens": 2001848.0, "step": 2486 }, { "epoch": 0.6588983050847458, "grad_norm": 1.6520535945892334, "learning_rate": 9.670683262711865e-06, "loss": 0.8245, "mean_token_accuracy": 0.7927552089095116, "num_tokens": 2003208.0, "step": 2488 }, { "epoch": 0.659427966101695, "grad_norm": 1.696926236152649, "learning_rate": 9.670418432203392e-06, "loss": 1.2812, "mean_token_accuracy": 0.7331323102116585, "num_tokens": 2004810.0, "step": 2490 }, { "epoch": 0.659957627118644, "grad_norm": 1.661909818649292, "learning_rate": 9.670153601694917e-06, "loss": 1.07, "mean_token_accuracy": 0.754407674074173, "num_tokens": 2006453.0, "step": 2492 }, { "epoch": 0.6604872881355932, "grad_norm": 1.4337689876556396, "learning_rate": 9.669888771186442e-06, "loss": 1.095, "mean_token_accuracy": 0.7530628964304924, "num_tokens": 2008077.0, "step": 2494 }, { "epoch": 0.6610169491525424, "grad_norm": 1.4395750761032104, "learning_rate": 9.669623940677967e-06, "loss": 1.4362, "mean_token_accuracy": 0.6468893140554428, "num_tokens": 2009655.0, "step": 2496 }, { "epoch": 0.6615466101694916, "grad_norm": 1.9778786897659302, "learning_rate": 9.669359110169493e-06, "loss": 1.2937, "mean_token_accuracy": 0.7277633994817734, "num_tokens": 2011093.0, "step": 2498 }, { "epoch": 0.6620762711864406, "grad_norm": 1.5140936374664307, "learning_rate": 9.669094279661018e-06, "loss": 1.3369, "step": 2500 }, { "epoch": 0.6620762711864406, "eval_loss": 1.3340051174163818, "eval_mean_token_accuracy": 0.6968666795012238, "eval_num_tokens": 2012773.0, "eval_runtime": 48.0996, "eval_samples_per_second": 6.403, "eval_steps_per_second": 6.403, "step": 2500 }, { "epoch": 0.6626059322033898, "grad_norm": 1.8812938928604126, "learning_rate": 9.668829449152543e-06, "loss": 1.6239, "mean_token_accuracy": 0.6763209030032158, "num_tokens": 2014400.0, "step": 2502 }, { "epoch": 0.663135593220339, "grad_norm": 2.026102066040039, "learning_rate": 9.668564618644068e-06, "loss": 2.1168, "mean_token_accuracy": 0.5631096735596657, "num_tokens": 2016099.0, "step": 2504 }, { "epoch": 0.6636652542372882, "grad_norm": 1.1786919832229614, "learning_rate": 9.668299788135595e-06, "loss": 1.1052, "mean_token_accuracy": 0.7414227202534676, "num_tokens": 2017981.0, "step": 2506 }, { "epoch": 0.6641949152542372, "grad_norm": 1.9934649467468262, "learning_rate": 9.66803495762712e-06, "loss": 1.8944, "mean_token_accuracy": 0.5863719135522842, "num_tokens": 2019812.0, "step": 2508 }, { "epoch": 0.6647245762711864, "grad_norm": 1.4689525365829468, "learning_rate": 9.667770127118644e-06, "loss": 0.9908, "mean_token_accuracy": 0.7647634372115135, "num_tokens": 2021584.0, "step": 2510 }, { "epoch": 0.6652542372881356, "grad_norm": 1.7455778121948242, "learning_rate": 9.66750529661017e-06, "loss": 1.5195, "mean_token_accuracy": 0.6505101025104523, "num_tokens": 2023392.0, "step": 2512 }, { "epoch": 0.6657838983050848, "grad_norm": 1.4606239795684814, "learning_rate": 9.667240466101696e-06, "loss": 1.4166, "mean_token_accuracy": 0.6875288709998131, "num_tokens": 2025207.0, "step": 2514 }, { "epoch": 0.6663135593220338, "grad_norm": 1.9216033220291138, "learning_rate": 9.666975635593221e-06, "loss": 1.6155, "mean_token_accuracy": 0.6451160833239555, "num_tokens": 2026775.0, "step": 2516 }, { "epoch": 0.666843220338983, "grad_norm": 1.5497770309448242, "learning_rate": 9.666710805084748e-06, "loss": 1.0832, "mean_token_accuracy": 0.7234875336289406, "num_tokens": 2028194.0, "step": 2518 }, { "epoch": 0.6673728813559322, "grad_norm": 1.4300220012664795, "learning_rate": 9.666445974576273e-06, "loss": 0.8759, "mean_token_accuracy": 0.7688228785991669, "num_tokens": 2029633.0, "step": 2520 }, { "epoch": 0.6679025423728814, "grad_norm": 2.182615041732788, "learning_rate": 9.666181144067797e-06, "loss": 1.1382, "mean_token_accuracy": 0.7327791973948479, "num_tokens": 2030961.0, "step": 2522 }, { "epoch": 0.6684322033898306, "grad_norm": 1.3667105436325073, "learning_rate": 9.665916313559322e-06, "loss": 1.0628, "mean_token_accuracy": 0.7339612618088722, "num_tokens": 2032453.0, "step": 2524 }, { "epoch": 0.6689618644067796, "grad_norm": 1.4302690029144287, "learning_rate": 9.665651483050849e-06, "loss": 1.2309, "mean_token_accuracy": 0.7256666198372841, "num_tokens": 2033978.0, "step": 2526 }, { "epoch": 0.6694915254237288, "grad_norm": 1.7355713844299316, "learning_rate": 9.665386652542374e-06, "loss": 1.2545, "mean_token_accuracy": 0.7198014184832573, "num_tokens": 2035692.0, "step": 2528 }, { "epoch": 0.670021186440678, "grad_norm": 2.4305076599121094, "learning_rate": 9.665121822033899e-06, "loss": 1.5715, "mean_token_accuracy": 0.6561804115772247, "num_tokens": 2037169.0, "step": 2530 }, { "epoch": 0.6705508474576272, "grad_norm": 1.629249930381775, "learning_rate": 9.664856991525424e-06, "loss": 1.0299, "mean_token_accuracy": 0.7587056159973145, "num_tokens": 2038503.0, "step": 2532 }, { "epoch": 0.6710805084745762, "grad_norm": 1.6243242025375366, "learning_rate": 9.66459216101695e-06, "loss": 1.1864, "mean_token_accuracy": 0.7295751757919788, "num_tokens": 2040019.0, "step": 2534 }, { "epoch": 0.6716101694915254, "grad_norm": 1.5958876609802246, "learning_rate": 9.664327330508475e-06, "loss": 1.631, "mean_token_accuracy": 0.6424602456390858, "num_tokens": 2041858.0, "step": 2536 }, { "epoch": 0.6721398305084746, "grad_norm": 1.645591378211975, "learning_rate": 9.6640625e-06, "loss": 1.6203, "mean_token_accuracy": 0.6660308241844177, "num_tokens": 2043319.0, "step": 2538 }, { "epoch": 0.6726694915254238, "grad_norm": 1.4386117458343506, "learning_rate": 9.663797669491525e-06, "loss": 1.4761, "mean_token_accuracy": 0.7002231478691101, "num_tokens": 2044864.0, "step": 2540 }, { "epoch": 0.6731991525423728, "grad_norm": 1.7869523763656616, "learning_rate": 9.663532838983052e-06, "loss": 1.6484, "mean_token_accuracy": 0.6362810507416725, "num_tokens": 2046899.0, "step": 2542 }, { "epoch": 0.673728813559322, "grad_norm": 1.3596826791763306, "learning_rate": 9.663268008474577e-06, "loss": 1.4277, "mean_token_accuracy": 0.6861588284373283, "num_tokens": 2048672.0, "step": 2544 }, { "epoch": 0.6742584745762712, "grad_norm": 1.6539207696914673, "learning_rate": 9.663003177966103e-06, "loss": 1.2761, "mean_token_accuracy": 0.7284206449985504, "num_tokens": 2050251.0, "step": 2546 }, { "epoch": 0.6747881355932204, "grad_norm": 1.6655611991882324, "learning_rate": 9.662738347457628e-06, "loss": 1.6397, "mean_token_accuracy": 0.6546197682619095, "num_tokens": 2051735.0, "step": 2548 }, { "epoch": 0.6753177966101694, "grad_norm": 1.8140441179275513, "learning_rate": 9.662473516949153e-06, "loss": 1.0818, "mean_token_accuracy": 0.7532130479812622, "num_tokens": 2052996.0, "step": 2550 }, { "epoch": 0.6758474576271186, "grad_norm": 1.8406426906585693, "learning_rate": 9.662208686440678e-06, "loss": 1.2614, "mean_token_accuracy": 0.700339786708355, "num_tokens": 2054447.0, "step": 2552 }, { "epoch": 0.6763771186440678, "grad_norm": 1.4906238317489624, "learning_rate": 9.661943855932205e-06, "loss": 0.9215, "mean_token_accuracy": 0.7740073949098587, "num_tokens": 2056036.0, "step": 2554 }, { "epoch": 0.676906779661017, "grad_norm": 2.475877046585083, "learning_rate": 9.66167902542373e-06, "loss": 1.8605, "mean_token_accuracy": 0.5900014415383339, "num_tokens": 2057352.0, "step": 2556 }, { "epoch": 0.6774364406779662, "grad_norm": 1.6798993349075317, "learning_rate": 9.661414194915255e-06, "loss": 1.1041, "mean_token_accuracy": 0.7395563274621964, "num_tokens": 2058692.0, "step": 2558 }, { "epoch": 0.6779661016949152, "grad_norm": 2.4775924682617188, "learning_rate": 9.66114936440678e-06, "loss": 1.2884, "mean_token_accuracy": 0.7098970636725426, "num_tokens": 2059982.0, "step": 2560 }, { "epoch": 0.6784957627118644, "grad_norm": 1.702219009399414, "learning_rate": 9.660884533898306e-06, "loss": 1.4826, "mean_token_accuracy": 0.6795610040426254, "num_tokens": 2061493.0, "step": 2562 }, { "epoch": 0.6790254237288136, "grad_norm": 1.2157262563705444, "learning_rate": 9.660619703389831e-06, "loss": 1.0695, "mean_token_accuracy": 0.7404784560203552, "num_tokens": 2063129.0, "step": 2564 }, { "epoch": 0.6795550847457628, "grad_norm": 1.659490704536438, "learning_rate": 9.660354872881356e-06, "loss": 1.3734, "mean_token_accuracy": 0.6732341274619102, "num_tokens": 2064755.0, "step": 2566 }, { "epoch": 0.6800847457627118, "grad_norm": 1.7112016677856445, "learning_rate": 9.660090042372881e-06, "loss": 1.4561, "mean_token_accuracy": 0.6759450361132622, "num_tokens": 2066071.0, "step": 2568 }, { "epoch": 0.680614406779661, "grad_norm": 1.4941717386245728, "learning_rate": 9.659825211864408e-06, "loss": 1.6865, "mean_token_accuracy": 0.6442316845059395, "num_tokens": 2067987.0, "step": 2570 }, { "epoch": 0.6811440677966102, "grad_norm": 1.8552547693252563, "learning_rate": 9.659560381355934e-06, "loss": 1.4428, "mean_token_accuracy": 0.6803976967930794, "num_tokens": 2069649.0, "step": 2572 }, { "epoch": 0.6816737288135594, "grad_norm": 1.7736319303512573, "learning_rate": 9.65929555084746e-06, "loss": 1.5207, "mean_token_accuracy": 0.6391555182635784, "num_tokens": 2071343.0, "step": 2574 }, { "epoch": 0.6822033898305084, "grad_norm": 1.8979564905166626, "learning_rate": 9.659030720338984e-06, "loss": 1.2522, "mean_token_accuracy": 0.7192215099930763, "num_tokens": 2073111.0, "step": 2576 }, { "epoch": 0.6827330508474576, "grad_norm": 1.7095023393630981, "learning_rate": 9.658765889830509e-06, "loss": 1.7084, "mean_token_accuracy": 0.6227592825889587, "num_tokens": 2074558.0, "step": 2578 }, { "epoch": 0.6832627118644068, "grad_norm": 1.5636916160583496, "learning_rate": 9.658501059322036e-06, "loss": 1.3013, "mean_token_accuracy": 0.701073557138443, "num_tokens": 2076212.0, "step": 2580 }, { "epoch": 0.683792372881356, "grad_norm": 1.3990896940231323, "learning_rate": 9.65823622881356e-06, "loss": 1.383, "mean_token_accuracy": 0.7064133808016777, "num_tokens": 2077609.0, "step": 2582 }, { "epoch": 0.684322033898305, "grad_norm": 1.3695024251937866, "learning_rate": 9.657971398305086e-06, "loss": 0.9995, "mean_token_accuracy": 0.7357519045472145, "num_tokens": 2078935.0, "step": 2584 }, { "epoch": 0.6848516949152542, "grad_norm": 1.786357045173645, "learning_rate": 9.65770656779661e-06, "loss": 1.89, "mean_token_accuracy": 0.6176961064338684, "num_tokens": 2080406.0, "step": 2586 }, { "epoch": 0.6853813559322034, "grad_norm": 1.8319904804229736, "learning_rate": 9.657441737288137e-06, "loss": 1.3902, "mean_token_accuracy": 0.6955637186765671, "num_tokens": 2081893.0, "step": 2588 }, { "epoch": 0.6859110169491526, "grad_norm": 2.2387752532958984, "learning_rate": 9.657176906779662e-06, "loss": 1.3665, "mean_token_accuracy": 0.6967783570289612, "num_tokens": 2083246.0, "step": 2590 }, { "epoch": 0.6864406779661016, "grad_norm": 1.671541452407837, "learning_rate": 9.656912076271187e-06, "loss": 1.6772, "mean_token_accuracy": 0.6549180634319782, "num_tokens": 2084975.0, "step": 2592 }, { "epoch": 0.6869703389830508, "grad_norm": 1.6997253894805908, "learning_rate": 9.656647245762712e-06, "loss": 1.819, "mean_token_accuracy": 0.6433225721120834, "num_tokens": 2086514.0, "step": 2594 }, { "epoch": 0.6875, "grad_norm": 1.4105710983276367, "learning_rate": 9.656382415254238e-06, "loss": 0.9984, "mean_token_accuracy": 0.7587636858224869, "num_tokens": 2087929.0, "step": 2596 }, { "epoch": 0.6880296610169492, "grad_norm": 1.4843697547912598, "learning_rate": 9.656117584745763e-06, "loss": 1.3616, "mean_token_accuracy": 0.7064943388104439, "num_tokens": 2089391.0, "step": 2598 }, { "epoch": 0.6885593220338984, "grad_norm": 1.9014841318130493, "learning_rate": 9.65585275423729e-06, "loss": 1.1678, "mean_token_accuracy": 0.7197810113430023, "num_tokens": 2091092.0, "step": 2600 }, { "epoch": 0.6890889830508474, "grad_norm": 1.5153415203094482, "learning_rate": 9.655587923728815e-06, "loss": 1.4496, "mean_token_accuracy": 0.6685321852564812, "num_tokens": 2092830.0, "step": 2602 }, { "epoch": 0.6896186440677966, "grad_norm": 1.5886390209197998, "learning_rate": 9.65532309322034e-06, "loss": 1.5198, "mean_token_accuracy": 0.6669864654541016, "num_tokens": 2094318.0, "step": 2604 }, { "epoch": 0.6901483050847458, "grad_norm": 2.0264663696289062, "learning_rate": 9.655058262711865e-06, "loss": 1.5753, "mean_token_accuracy": 0.6647222712635994, "num_tokens": 2095907.0, "step": 2606 }, { "epoch": 0.690677966101695, "grad_norm": 1.6604775190353394, "learning_rate": 9.654793432203391e-06, "loss": 1.333, "mean_token_accuracy": 0.6958604007959366, "num_tokens": 2097434.0, "step": 2608 }, { "epoch": 0.691207627118644, "grad_norm": 1.7717769145965576, "learning_rate": 9.654528601694916e-06, "loss": 1.3292, "mean_token_accuracy": 0.6856342405080795, "num_tokens": 2099050.0, "step": 2610 }, { "epoch": 0.6917372881355932, "grad_norm": 1.4924534559249878, "learning_rate": 9.654263771186441e-06, "loss": 0.8428, "mean_token_accuracy": 0.7820508703589439, "num_tokens": 2100535.0, "step": 2612 }, { "epoch": 0.6922669491525424, "grad_norm": 1.392536997795105, "learning_rate": 9.653998940677966e-06, "loss": 1.101, "mean_token_accuracy": 0.729944996535778, "num_tokens": 2102149.0, "step": 2614 }, { "epoch": 0.6927966101694916, "grad_norm": 1.3450201749801636, "learning_rate": 9.653734110169493e-06, "loss": 1.5195, "mean_token_accuracy": 0.6583444662392139, "num_tokens": 2103863.0, "step": 2616 }, { "epoch": 0.6933262711864406, "grad_norm": 1.4411829710006714, "learning_rate": 9.653469279661018e-06, "loss": 1.2494, "mean_token_accuracy": 0.7310865744948387, "num_tokens": 2105494.0, "step": 2618 }, { "epoch": 0.6938559322033898, "grad_norm": 1.8953670263290405, "learning_rate": 9.653204449152543e-06, "loss": 1.6882, "mean_token_accuracy": 0.6306904926896095, "num_tokens": 2107007.0, "step": 2620 }, { "epoch": 0.694385593220339, "grad_norm": 1.2818834781646729, "learning_rate": 9.652939618644068e-06, "loss": 1.5329, "mean_token_accuracy": 0.6979207992553711, "num_tokens": 2108791.0, "step": 2622 }, { "epoch": 0.6949152542372882, "grad_norm": 1.4837172031402588, "learning_rate": 9.652674788135594e-06, "loss": 1.2989, "mean_token_accuracy": 0.7091590911149979, "num_tokens": 2110471.0, "step": 2624 }, { "epoch": 0.6954449152542372, "grad_norm": 1.5567735433578491, "learning_rate": 9.65240995762712e-06, "loss": 1.2404, "mean_token_accuracy": 0.7311229407787323, "num_tokens": 2111972.0, "step": 2626 }, { "epoch": 0.6959745762711864, "grad_norm": 1.7283583879470825, "learning_rate": 9.652145127118646e-06, "loss": 1.1519, "mean_token_accuracy": 0.7420974597334862, "num_tokens": 2113610.0, "step": 2628 }, { "epoch": 0.6965042372881356, "grad_norm": 1.657502293586731, "learning_rate": 9.65188029661017e-06, "loss": 1.515, "mean_token_accuracy": 0.6804554760456085, "num_tokens": 2115164.0, "step": 2630 }, { "epoch": 0.6970338983050848, "grad_norm": 1.8910460472106934, "learning_rate": 9.651615466101696e-06, "loss": 1.6665, "mean_token_accuracy": 0.6057373099029064, "num_tokens": 2116809.0, "step": 2632 }, { "epoch": 0.6975635593220338, "grad_norm": 2.547733783721924, "learning_rate": 9.65135063559322e-06, "loss": 1.2928, "mean_token_accuracy": 0.7165030911564827, "num_tokens": 2118187.0, "step": 2634 }, { "epoch": 0.698093220338983, "grad_norm": 1.5207380056381226, "learning_rate": 9.651085805084747e-06, "loss": 1.177, "mean_token_accuracy": 0.703510470688343, "num_tokens": 2119959.0, "step": 2636 }, { "epoch": 0.6986228813559322, "grad_norm": 1.3547985553741455, "learning_rate": 9.650820974576272e-06, "loss": 1.393, "mean_token_accuracy": 0.680476538836956, "num_tokens": 2121565.0, "step": 2638 }, { "epoch": 0.6991525423728814, "grad_norm": 1.5646299123764038, "learning_rate": 9.650556144067797e-06, "loss": 1.203, "mean_token_accuracy": 0.7317710369825363, "num_tokens": 2123063.0, "step": 2640 }, { "epoch": 0.6996822033898306, "grad_norm": 1.6479467153549194, "learning_rate": 9.650291313559322e-06, "loss": 1.4291, "mean_token_accuracy": 0.6658804640173912, "num_tokens": 2124885.0, "step": 2642 }, { "epoch": 0.7002118644067796, "grad_norm": 1.8051400184631348, "learning_rate": 9.650026483050849e-06, "loss": 1.2007, "mean_token_accuracy": 0.727429024875164, "num_tokens": 2126207.0, "step": 2644 }, { "epoch": 0.7007415254237288, "grad_norm": 1.9331979751586914, "learning_rate": 9.649761652542374e-06, "loss": 1.4533, "mean_token_accuracy": 0.6704137325286865, "num_tokens": 2127798.0, "step": 2646 }, { "epoch": 0.701271186440678, "grad_norm": 1.5860614776611328, "learning_rate": 9.649496822033898e-06, "loss": 1.3276, "mean_token_accuracy": 0.6914941594004631, "num_tokens": 2129215.0, "step": 2648 }, { "epoch": 0.7018008474576272, "grad_norm": 1.5632611513137817, "learning_rate": 9.649231991525423e-06, "loss": 1.3684, "mean_token_accuracy": 0.7083518207073212, "num_tokens": 2130774.0, "step": 2650 }, { "epoch": 0.7023305084745762, "grad_norm": 1.2079696655273438, "learning_rate": 9.64896716101695e-06, "loss": 1.1199, "mean_token_accuracy": 0.7350472137331963, "num_tokens": 2132167.0, "step": 2652 }, { "epoch": 0.7028601694915254, "grad_norm": 1.8325191736221313, "learning_rate": 9.648702330508475e-06, "loss": 1.6835, "mean_token_accuracy": 0.6329574584960938, "num_tokens": 2133604.0, "step": 2654 }, { "epoch": 0.7033898305084746, "grad_norm": 1.7805300951004028, "learning_rate": 9.648437500000002e-06, "loss": 1.1415, "mean_token_accuracy": 0.7385456338524818, "num_tokens": 2135136.0, "step": 2656 }, { "epoch": 0.7039194915254238, "grad_norm": 1.3923133611679077, "learning_rate": 9.648172669491527e-06, "loss": 1.4448, "mean_token_accuracy": 0.6780997402966022, "num_tokens": 2136979.0, "step": 2658 }, { "epoch": 0.7044491525423728, "grad_norm": 1.6516410112380981, "learning_rate": 9.647907838983051e-06, "loss": 1.3566, "mean_token_accuracy": 0.6830848008394241, "num_tokens": 2138446.0, "step": 2660 }, { "epoch": 0.704978813559322, "grad_norm": 1.5855085849761963, "learning_rate": 9.647643008474578e-06, "loss": 1.3298, "mean_token_accuracy": 0.7056766822934151, "num_tokens": 2140273.0, "step": 2662 }, { "epoch": 0.7055084745762712, "grad_norm": 2.0398917198181152, "learning_rate": 9.647378177966103e-06, "loss": 1.5005, "mean_token_accuracy": 0.6684864424169064, "num_tokens": 2141741.0, "step": 2664 }, { "epoch": 0.7060381355932204, "grad_norm": 1.6137220859527588, "learning_rate": 9.647113347457628e-06, "loss": 1.3125, "mean_token_accuracy": 0.721223633736372, "num_tokens": 2143397.0, "step": 2666 }, { "epoch": 0.7065677966101694, "grad_norm": 2.156627655029297, "learning_rate": 9.646848516949153e-06, "loss": 1.5464, "mean_token_accuracy": 0.6565579995512962, "num_tokens": 2145056.0, "step": 2668 }, { "epoch": 0.7070974576271186, "grad_norm": 1.3730039596557617, "learning_rate": 9.64658368644068e-06, "loss": 0.9034, "mean_token_accuracy": 0.7839610502123833, "num_tokens": 2146395.0, "step": 2670 }, { "epoch": 0.7076271186440678, "grad_norm": 1.7032735347747803, "learning_rate": 9.646318855932204e-06, "loss": 1.2404, "mean_token_accuracy": 0.7182218953967094, "num_tokens": 2147911.0, "step": 2672 }, { "epoch": 0.708156779661017, "grad_norm": 1.6491787433624268, "learning_rate": 9.64605402542373e-06, "loss": 1.8477, "mean_token_accuracy": 0.6657077595591545, "num_tokens": 2149517.0, "step": 2674 }, { "epoch": 0.7086864406779662, "grad_norm": 2.146695852279663, "learning_rate": 9.645789194915254e-06, "loss": 1.2352, "mean_token_accuracy": 0.7178514152765274, "num_tokens": 2151062.0, "step": 2676 }, { "epoch": 0.7092161016949152, "grad_norm": 1.5463027954101562, "learning_rate": 9.645524364406781e-06, "loss": 1.5926, "mean_token_accuracy": 0.6726401820778847, "num_tokens": 2152759.0, "step": 2678 }, { "epoch": 0.7097457627118644, "grad_norm": 2.0338408946990967, "learning_rate": 9.645259533898306e-06, "loss": 1.1675, "mean_token_accuracy": 0.7304162308573723, "num_tokens": 2154386.0, "step": 2680 }, { "epoch": 0.7102754237288136, "grad_norm": 1.741869330406189, "learning_rate": 9.644994703389832e-06, "loss": 1.3732, "mean_token_accuracy": 0.6789335161447525, "num_tokens": 2155766.0, "step": 2682 }, { "epoch": 0.7108050847457628, "grad_norm": 1.7564997673034668, "learning_rate": 9.644729872881357e-06, "loss": 1.0942, "mean_token_accuracy": 0.7266440242528915, "num_tokens": 2157334.0, "step": 2684 }, { "epoch": 0.7113347457627118, "grad_norm": 1.6154998540878296, "learning_rate": 9.644465042372882e-06, "loss": 1.4221, "mean_token_accuracy": 0.6767430379986763, "num_tokens": 2159228.0, "step": 2686 }, { "epoch": 0.711864406779661, "grad_norm": 1.8080114126205444, "learning_rate": 9.644200211864407e-06, "loss": 1.4542, "mean_token_accuracy": 0.6705066114664078, "num_tokens": 2160697.0, "step": 2688 }, { "epoch": 0.7123940677966102, "grad_norm": 1.6344014406204224, "learning_rate": 9.643935381355934e-06, "loss": 1.3461, "mean_token_accuracy": 0.7047756090760231, "num_tokens": 2162393.0, "step": 2690 }, { "epoch": 0.7129237288135594, "grad_norm": 1.9453299045562744, "learning_rate": 9.643670550847459e-06, "loss": 1.4215, "mean_token_accuracy": 0.6543978154659271, "num_tokens": 2163853.0, "step": 2692 }, { "epoch": 0.7134533898305084, "grad_norm": 1.7728937864303589, "learning_rate": 9.643405720338984e-06, "loss": 1.4358, "mean_token_accuracy": 0.6835781261324883, "num_tokens": 2165295.0, "step": 2694 }, { "epoch": 0.7139830508474576, "grad_norm": 1.8722206354141235, "learning_rate": 9.643140889830509e-06, "loss": 1.3933, "mean_token_accuracy": 0.6858386322855949, "num_tokens": 2166896.0, "step": 2696 }, { "epoch": 0.7145127118644068, "grad_norm": 1.6794649362564087, "learning_rate": 9.642876059322035e-06, "loss": 1.8273, "mean_token_accuracy": 0.6171870529651642, "num_tokens": 2168369.0, "step": 2698 }, { "epoch": 0.715042372881356, "grad_norm": 1.145750880241394, "learning_rate": 9.64261122881356e-06, "loss": 1.1289, "mean_token_accuracy": 0.7305222898721695, "num_tokens": 2170122.0, "step": 2700 }, { "epoch": 0.715572033898305, "grad_norm": 1.532545804977417, "learning_rate": 9.642346398305085e-06, "loss": 1.2884, "mean_token_accuracy": 0.7156480178236961, "num_tokens": 2171817.0, "step": 2702 }, { "epoch": 0.7161016949152542, "grad_norm": 2.000028133392334, "learning_rate": 9.64208156779661e-06, "loss": 1.0765, "mean_token_accuracy": 0.7506330087780952, "num_tokens": 2173225.0, "step": 2704 }, { "epoch": 0.7166313559322034, "grad_norm": 1.5830508470535278, "learning_rate": 9.641816737288137e-06, "loss": 1.5081, "mean_token_accuracy": 0.6505457237362862, "num_tokens": 2175734.0, "step": 2706 }, { "epoch": 0.7171610169491526, "grad_norm": 1.7134852409362793, "learning_rate": 9.641551906779662e-06, "loss": 1.4131, "mean_token_accuracy": 0.664156299084425, "num_tokens": 2177432.0, "step": 2708 }, { "epoch": 0.7176906779661016, "grad_norm": 1.2645453214645386, "learning_rate": 9.641287076271188e-06, "loss": 0.8757, "mean_token_accuracy": 0.7720454633235931, "num_tokens": 2178939.0, "step": 2710 }, { "epoch": 0.7182203389830508, "grad_norm": 1.270507574081421, "learning_rate": 9.641022245762711e-06, "loss": 1.2816, "mean_token_accuracy": 0.6680939570069313, "num_tokens": 2181670.0, "step": 2712 }, { "epoch": 0.71875, "grad_norm": 1.862899661064148, "learning_rate": 9.640757415254238e-06, "loss": 1.2488, "mean_token_accuracy": 0.7155813947319984, "num_tokens": 2183185.0, "step": 2714 }, { "epoch": 0.7192796610169492, "grad_norm": 1.3738220930099487, "learning_rate": 9.640492584745763e-06, "loss": 1.0361, "mean_token_accuracy": 0.7660066559910774, "num_tokens": 2184499.0, "step": 2716 }, { "epoch": 0.7198093220338984, "grad_norm": 1.668320655822754, "learning_rate": 9.64022775423729e-06, "loss": 1.5339, "mean_token_accuracy": 0.6787368729710579, "num_tokens": 2186071.0, "step": 2718 }, { "epoch": 0.7203389830508474, "grad_norm": 1.6950536966323853, "learning_rate": 9.639962923728815e-06, "loss": 1.4399, "mean_token_accuracy": 0.6882024481892586, "num_tokens": 2187517.0, "step": 2720 }, { "epoch": 0.7208686440677966, "grad_norm": 1.6029303073883057, "learning_rate": 9.63969809322034e-06, "loss": 1.2766, "mean_token_accuracy": 0.7092558145523071, "num_tokens": 2189047.0, "step": 2722 }, { "epoch": 0.7213983050847458, "grad_norm": 1.5908948183059692, "learning_rate": 9.639433262711864e-06, "loss": 1.0735, "mean_token_accuracy": 0.7316601276397705, "num_tokens": 2190591.0, "step": 2724 }, { "epoch": 0.721927966101695, "grad_norm": 1.6569455862045288, "learning_rate": 9.639168432203391e-06, "loss": 1.2771, "mean_token_accuracy": 0.7062565311789513, "num_tokens": 2192521.0, "step": 2726 }, { "epoch": 0.722457627118644, "grad_norm": 1.7944854497909546, "learning_rate": 9.638903601694916e-06, "loss": 1.7726, "mean_token_accuracy": 0.6088992729783058, "num_tokens": 2194117.0, "step": 2728 }, { "epoch": 0.7229872881355932, "grad_norm": 2.0074212551116943, "learning_rate": 9.638638771186441e-06, "loss": 1.0489, "mean_token_accuracy": 0.7789015546441078, "num_tokens": 2195281.0, "step": 2730 }, { "epoch": 0.7235169491525424, "grad_norm": 2.561920642852783, "learning_rate": 9.638373940677966e-06, "loss": 1.232, "mean_token_accuracy": 0.7152373865246773, "num_tokens": 2196743.0, "step": 2732 }, { "epoch": 0.7240466101694916, "grad_norm": 1.5623219013214111, "learning_rate": 9.638109110169492e-06, "loss": 1.0345, "mean_token_accuracy": 0.7403800562024117, "num_tokens": 2198193.0, "step": 2734 }, { "epoch": 0.7245762711864406, "grad_norm": 1.8348853588104248, "learning_rate": 9.637844279661017e-06, "loss": 1.3136, "mean_token_accuracy": 0.7017422914505005, "num_tokens": 2199814.0, "step": 2736 }, { "epoch": 0.7251059322033898, "grad_norm": 1.5010111331939697, "learning_rate": 9.637579449152544e-06, "loss": 1.4542, "mean_token_accuracy": 0.6672917008399963, "num_tokens": 2201330.0, "step": 2738 }, { "epoch": 0.725635593220339, "grad_norm": 1.4965620040893555, "learning_rate": 9.637314618644067e-06, "loss": 1.264, "mean_token_accuracy": 0.7067728564143181, "num_tokens": 2202971.0, "step": 2740 }, { "epoch": 0.7261652542372882, "grad_norm": 1.2332062721252441, "learning_rate": 9.637049788135594e-06, "loss": 1.5333, "mean_token_accuracy": 0.684628501534462, "num_tokens": 2204655.0, "step": 2742 }, { "epoch": 0.7266949152542372, "grad_norm": 1.8782347440719604, "learning_rate": 9.63678495762712e-06, "loss": 1.1689, "mean_token_accuracy": 0.7140536606311798, "num_tokens": 2206352.0, "step": 2744 }, { "epoch": 0.7272245762711864, "grad_norm": 1.4159573316574097, "learning_rate": 9.636520127118645e-06, "loss": 1.335, "mean_token_accuracy": 0.6897217929363251, "num_tokens": 2208095.0, "step": 2746 }, { "epoch": 0.7277542372881356, "grad_norm": 1.9225715398788452, "learning_rate": 9.63625529661017e-06, "loss": 1.2858, "mean_token_accuracy": 0.7137264758348465, "num_tokens": 2209577.0, "step": 2748 }, { "epoch": 0.7282838983050848, "grad_norm": 1.645754337310791, "learning_rate": 9.635990466101695e-06, "loss": 1.3875, "step": 2750 }, { "epoch": 0.7282838983050848, "eval_loss": 1.3305306434631348, "eval_mean_token_accuracy": 0.6972409530312984, "eval_num_tokens": 2211377.0, "eval_runtime": 48.2432, "eval_samples_per_second": 6.384, "eval_steps_per_second": 6.384, "step": 2750 }, { "epoch": 0.7288135593220338, "grad_norm": 1.4424374103546143, "learning_rate": 9.635725635593222e-06, "loss": 1.8288, "mean_token_accuracy": 0.6496424209326506, "num_tokens": 2213195.0, "step": 2752 }, { "epoch": 0.729343220338983, "grad_norm": 1.6821802854537964, "learning_rate": 9.635460805084747e-06, "loss": 1.3202, "mean_token_accuracy": 0.6852321699261665, "num_tokens": 2214800.0, "step": 2754 }, { "epoch": 0.7298728813559322, "grad_norm": 1.3180286884307861, "learning_rate": 9.635195974576272e-06, "loss": 1.0607, "mean_token_accuracy": 0.743266761302948, "num_tokens": 2216173.0, "step": 2756 }, { "epoch": 0.7304025423728814, "grad_norm": 1.5427415370941162, "learning_rate": 9.634931144067797e-06, "loss": 1.0145, "mean_token_accuracy": 0.7618267685174942, "num_tokens": 2217596.0, "step": 2758 }, { "epoch": 0.7309322033898306, "grad_norm": 1.8910021781921387, "learning_rate": 9.634666313559323e-06, "loss": 1.47, "mean_token_accuracy": 0.6821527257561684, "num_tokens": 2219154.0, "step": 2760 }, { "epoch": 0.7314618644067796, "grad_norm": 1.666080117225647, "learning_rate": 9.634401483050848e-06, "loss": 1.3958, "mean_token_accuracy": 0.7012612968683243, "num_tokens": 2220647.0, "step": 2762 }, { "epoch": 0.7319915254237288, "grad_norm": 1.6626050472259521, "learning_rate": 9.634136652542375e-06, "loss": 1.2904, "mean_token_accuracy": 0.7097525596618652, "num_tokens": 2222196.0, "step": 2764 }, { "epoch": 0.732521186440678, "grad_norm": 1.5212032794952393, "learning_rate": 9.633871822033898e-06, "loss": 0.9644, "mean_token_accuracy": 0.760301761329174, "num_tokens": 2223690.0, "step": 2766 }, { "epoch": 0.7330508474576272, "grad_norm": 1.1722311973571777, "learning_rate": 9.633606991525425e-06, "loss": 0.9884, "mean_token_accuracy": 0.7584671154618263, "num_tokens": 2226215.0, "step": 2768 }, { "epoch": 0.7335805084745762, "grad_norm": 1.6863337755203247, "learning_rate": 9.63334216101695e-06, "loss": 1.5145, "mean_token_accuracy": 0.6703843474388123, "num_tokens": 2228250.0, "step": 2770 }, { "epoch": 0.7341101694915254, "grad_norm": 2.061173915863037, "learning_rate": 9.633077330508476e-06, "loss": 1.6808, "mean_token_accuracy": 0.6166793890297413, "num_tokens": 2229846.0, "step": 2772 }, { "epoch": 0.7346398305084746, "grad_norm": 1.5401413440704346, "learning_rate": 9.632812500000001e-06, "loss": 1.5833, "mean_token_accuracy": 0.6302652582526207, "num_tokens": 2231559.0, "step": 2774 }, { "epoch": 0.7351694915254238, "grad_norm": 1.8753479719161987, "learning_rate": 9.632547669491526e-06, "loss": 1.0594, "mean_token_accuracy": 0.756597712635994, "num_tokens": 2233023.0, "step": 2776 }, { "epoch": 0.7356991525423728, "grad_norm": 1.6262047290802002, "learning_rate": 9.632282838983051e-06, "loss": 1.0952, "mean_token_accuracy": 0.750958152115345, "num_tokens": 2234524.0, "step": 2778 }, { "epoch": 0.736228813559322, "grad_norm": 1.739205002784729, "learning_rate": 9.632018008474578e-06, "loss": 1.5017, "mean_token_accuracy": 0.6779549941420555, "num_tokens": 2236183.0, "step": 2780 }, { "epoch": 0.7367584745762712, "grad_norm": 1.482099175453186, "learning_rate": 9.631753177966103e-06, "loss": 1.4694, "mean_token_accuracy": 0.6808006390929222, "num_tokens": 2237597.0, "step": 2782 }, { "epoch": 0.7372881355932204, "grad_norm": 1.2941269874572754, "learning_rate": 9.631488347457628e-06, "loss": 1.2132, "mean_token_accuracy": 0.7047975435853004, "num_tokens": 2239401.0, "step": 2784 }, { "epoch": 0.7378177966101694, "grad_norm": 1.714943528175354, "learning_rate": 9.631223516949152e-06, "loss": 1.5558, "mean_token_accuracy": 0.6730372384190559, "num_tokens": 2240892.0, "step": 2786 }, { "epoch": 0.7383474576271186, "grad_norm": 1.6856919527053833, "learning_rate": 9.630958686440679e-06, "loss": 1.1341, "mean_token_accuracy": 0.7248996794223785, "num_tokens": 2242408.0, "step": 2788 }, { "epoch": 0.7388771186440678, "grad_norm": 1.7109317779541016, "learning_rate": 9.630693855932204e-06, "loss": 1.8816, "mean_token_accuracy": 0.6129221394658089, "num_tokens": 2243917.0, "step": 2790 }, { "epoch": 0.739406779661017, "grad_norm": 1.7272230386734009, "learning_rate": 9.63042902542373e-06, "loss": 1.5248, "mean_token_accuracy": 0.6581543534994125, "num_tokens": 2245498.0, "step": 2792 }, { "epoch": 0.7399364406779662, "grad_norm": 1.6011420488357544, "learning_rate": 9.630164194915254e-06, "loss": 1.4518, "mean_token_accuracy": 0.6717067584395409, "num_tokens": 2246982.0, "step": 2794 }, { "epoch": 0.7404661016949152, "grad_norm": 1.3473302125930786, "learning_rate": 9.62989936440678e-06, "loss": 1.4064, "mean_token_accuracy": 0.6828244179487228, "num_tokens": 2248622.0, "step": 2796 }, { "epoch": 0.7409957627118644, "grad_norm": 1.347360372543335, "learning_rate": 9.629634533898305e-06, "loss": 1.0935, "mean_token_accuracy": 0.7306854724884033, "num_tokens": 2250432.0, "step": 2798 }, { "epoch": 0.7415254237288136, "grad_norm": 1.8850752115249634, "learning_rate": 9.629369703389832e-06, "loss": 1.7152, "mean_token_accuracy": 0.6555771604180336, "num_tokens": 2252016.0, "step": 2800 }, { "epoch": 0.7420550847457628, "grad_norm": 1.8471553325653076, "learning_rate": 9.629104872881357e-06, "loss": 1.4549, "mean_token_accuracy": 0.6876996085047722, "num_tokens": 2253538.0, "step": 2802 }, { "epoch": 0.7425847457627118, "grad_norm": 1.5284452438354492, "learning_rate": 9.628840042372882e-06, "loss": 1.1334, "mean_token_accuracy": 0.7306521236896515, "num_tokens": 2255061.0, "step": 2804 }, { "epoch": 0.743114406779661, "grad_norm": 1.9663840532302856, "learning_rate": 9.628575211864407e-06, "loss": 1.4179, "mean_token_accuracy": 0.6802778318524361, "num_tokens": 2256476.0, "step": 2806 }, { "epoch": 0.7436440677966102, "grad_norm": 1.689620852470398, "learning_rate": 9.628310381355933e-06, "loss": 1.2591, "mean_token_accuracy": 0.7038333564996719, "num_tokens": 2258093.0, "step": 2808 }, { "epoch": 0.7441737288135594, "grad_norm": 1.8602590560913086, "learning_rate": 9.628045550847458e-06, "loss": 1.4385, "mean_token_accuracy": 0.6925476565957069, "num_tokens": 2259472.0, "step": 2810 }, { "epoch": 0.7447033898305084, "grad_norm": 1.6417500972747803, "learning_rate": 9.627780720338983e-06, "loss": 1.1888, "mean_token_accuracy": 0.7250412255525589, "num_tokens": 2260957.0, "step": 2812 }, { "epoch": 0.7452330508474576, "grad_norm": 1.5664328336715698, "learning_rate": 9.627515889830508e-06, "loss": 0.9993, "mean_token_accuracy": 0.7843310832977295, "num_tokens": 2262496.0, "step": 2814 }, { "epoch": 0.7457627118644068, "grad_norm": 1.7659692764282227, "learning_rate": 9.627251059322035e-06, "loss": 1.4784, "mean_token_accuracy": 0.660520363599062, "num_tokens": 2264203.0, "step": 2816 }, { "epoch": 0.746292372881356, "grad_norm": 1.507620096206665, "learning_rate": 9.62698622881356e-06, "loss": 1.5103, "mean_token_accuracy": 0.6787868514657021, "num_tokens": 2265956.0, "step": 2818 }, { "epoch": 0.746822033898305, "grad_norm": 1.7065244913101196, "learning_rate": 9.626721398305085e-06, "loss": 1.0915, "mean_token_accuracy": 0.738062895834446, "num_tokens": 2267287.0, "step": 2820 }, { "epoch": 0.7473516949152542, "grad_norm": 1.6172796487808228, "learning_rate": 9.62645656779661e-06, "loss": 1.5559, "mean_token_accuracy": 0.6641648523509502, "num_tokens": 2268929.0, "step": 2822 }, { "epoch": 0.7478813559322034, "grad_norm": 1.442603349685669, "learning_rate": 9.626191737288136e-06, "loss": 1.6255, "mean_token_accuracy": 0.6634219214320183, "num_tokens": 2270537.0, "step": 2824 }, { "epoch": 0.7484110169491526, "grad_norm": 1.4224005937576294, "learning_rate": 9.625926906779663e-06, "loss": 1.3839, "mean_token_accuracy": 0.67787005007267, "num_tokens": 2272024.0, "step": 2826 }, { "epoch": 0.7489406779661016, "grad_norm": 1.627558708190918, "learning_rate": 9.625662076271188e-06, "loss": 1.2524, "mean_token_accuracy": 0.7175000384449959, "num_tokens": 2273601.0, "step": 2828 }, { "epoch": 0.7494703389830508, "grad_norm": 1.2465218305587769, "learning_rate": 9.625397245762713e-06, "loss": 1.0322, "mean_token_accuracy": 0.7556178569793701, "num_tokens": 2275358.0, "step": 2830 }, { "epoch": 0.75, "grad_norm": 1.6475467681884766, "learning_rate": 9.625132415254238e-06, "loss": 1.5462, "mean_token_accuracy": 0.663266085088253, "num_tokens": 2276699.0, "step": 2832 }, { "epoch": 0.7505296610169492, "grad_norm": 1.4573241472244263, "learning_rate": 9.624867584745764e-06, "loss": 1.2914, "mean_token_accuracy": 0.7131555937230587, "num_tokens": 2278516.0, "step": 2834 }, { "epoch": 0.7510593220338984, "grad_norm": 1.6765793561935425, "learning_rate": 9.62460275423729e-06, "loss": 1.3605, "mean_token_accuracy": 0.7022193670272827, "num_tokens": 2279950.0, "step": 2836 }, { "epoch": 0.7515889830508474, "grad_norm": 1.739195466041565, "learning_rate": 9.624337923728814e-06, "loss": 1.2824, "mean_token_accuracy": 0.7020125612616539, "num_tokens": 2281321.0, "step": 2838 }, { "epoch": 0.7521186440677966, "grad_norm": 1.5071691274642944, "learning_rate": 9.624073093220339e-06, "loss": 0.8957, "mean_token_accuracy": 0.754631794989109, "num_tokens": 2283124.0, "step": 2840 }, { "epoch": 0.7526483050847458, "grad_norm": 1.4203208684921265, "learning_rate": 9.623808262711866e-06, "loss": 1.3781, "mean_token_accuracy": 0.6896863132715225, "num_tokens": 2284856.0, "step": 2842 }, { "epoch": 0.753177966101695, "grad_norm": 1.7264633178710938, "learning_rate": 9.62354343220339e-06, "loss": 1.2048, "mean_token_accuracy": 0.7116772755980492, "num_tokens": 2286304.0, "step": 2844 }, { "epoch": 0.753707627118644, "grad_norm": 1.6461049318313599, "learning_rate": 9.623278601694917e-06, "loss": 1.4705, "mean_token_accuracy": 0.6576054841279984, "num_tokens": 2288096.0, "step": 2846 }, { "epoch": 0.7542372881355932, "grad_norm": 1.561180830001831, "learning_rate": 9.62301377118644e-06, "loss": 1.4069, "mean_token_accuracy": 0.6758722364902496, "num_tokens": 2289593.0, "step": 2848 }, { "epoch": 0.7547669491525424, "grad_norm": 1.5976595878601074, "learning_rate": 9.622748940677967e-06, "loss": 1.2295, "mean_token_accuracy": 0.7206134125590324, "num_tokens": 2291207.0, "step": 2850 }, { "epoch": 0.7552966101694916, "grad_norm": 1.7588415145874023, "learning_rate": 9.622484110169492e-06, "loss": 1.3868, "mean_token_accuracy": 0.7000627443194389, "num_tokens": 2292744.0, "step": 2852 }, { "epoch": 0.7558262711864406, "grad_norm": 1.5565381050109863, "learning_rate": 9.622219279661019e-06, "loss": 1.5453, "mean_token_accuracy": 0.6637611910700798, "num_tokens": 2294359.0, "step": 2854 }, { "epoch": 0.7563559322033898, "grad_norm": 1.6683601140975952, "learning_rate": 9.621954449152544e-06, "loss": 1.5326, "mean_token_accuracy": 0.7059554234147072, "num_tokens": 2295673.0, "step": 2856 }, { "epoch": 0.756885593220339, "grad_norm": 1.8723161220550537, "learning_rate": 9.621689618644069e-06, "loss": 1.4911, "mean_token_accuracy": 0.6630816459655762, "num_tokens": 2297317.0, "step": 2858 }, { "epoch": 0.7574152542372882, "grad_norm": 1.6038388013839722, "learning_rate": 9.621424788135593e-06, "loss": 1.1282, "mean_token_accuracy": 0.7219580337405205, "num_tokens": 2298903.0, "step": 2860 }, { "epoch": 0.7579449152542372, "grad_norm": 2.181410312652588, "learning_rate": 9.62115995762712e-06, "loss": 1.664, "mean_token_accuracy": 0.6182642951607704, "num_tokens": 2300326.0, "step": 2862 }, { "epoch": 0.7584745762711864, "grad_norm": 1.5305410623550415, "learning_rate": 9.620895127118645e-06, "loss": 1.5373, "mean_token_accuracy": 0.6553952395915985, "num_tokens": 2301959.0, "step": 2864 }, { "epoch": 0.7590042372881356, "grad_norm": 1.4494653940200806, "learning_rate": 9.62063029661017e-06, "loss": 1.4035, "mean_token_accuracy": 0.6916749402880669, "num_tokens": 2303682.0, "step": 2866 }, { "epoch": 0.7595338983050848, "grad_norm": 2.1261138916015625, "learning_rate": 9.620365466101695e-06, "loss": 1.0394, "mean_token_accuracy": 0.7352311164140701, "num_tokens": 2305148.0, "step": 2868 }, { "epoch": 0.7600635593220338, "grad_norm": 1.6345703601837158, "learning_rate": 9.620100635593222e-06, "loss": 1.2081, "mean_token_accuracy": 0.7025217562913895, "num_tokens": 2306658.0, "step": 2870 }, { "epoch": 0.760593220338983, "grad_norm": 1.7088980674743652, "learning_rate": 9.619835805084746e-06, "loss": 1.0865, "mean_token_accuracy": 0.730363704264164, "num_tokens": 2308375.0, "step": 2872 }, { "epoch": 0.7611228813559322, "grad_norm": 1.8765829801559448, "learning_rate": 9.619570974576271e-06, "loss": 1.8059, "mean_token_accuracy": 0.6123056598007679, "num_tokens": 2310038.0, "step": 2874 }, { "epoch": 0.7616525423728814, "grad_norm": 1.6945061683654785, "learning_rate": 9.619306144067796e-06, "loss": 1.2339, "mean_token_accuracy": 0.6797119826078415, "num_tokens": 2311697.0, "step": 2876 }, { "epoch": 0.7621822033898306, "grad_norm": 1.6316003799438477, "learning_rate": 9.619041313559323e-06, "loss": 1.2234, "mean_token_accuracy": 0.7103658467531204, "num_tokens": 2313075.0, "step": 2878 }, { "epoch": 0.7627118644067796, "grad_norm": 1.6423603296279907, "learning_rate": 9.618776483050848e-06, "loss": 1.1042, "mean_token_accuracy": 0.7431692555546761, "num_tokens": 2314653.0, "step": 2880 }, { "epoch": 0.7632415254237288, "grad_norm": 1.9506070613861084, "learning_rate": 9.618511652542374e-06, "loss": 1.5633, "mean_token_accuracy": 0.6663630865514278, "num_tokens": 2316132.0, "step": 2882 }, { "epoch": 0.763771186440678, "grad_norm": 1.4473520517349243, "learning_rate": 9.6182468220339e-06, "loss": 1.7687, "mean_token_accuracy": 0.608544185757637, "num_tokens": 2317902.0, "step": 2884 }, { "epoch": 0.7643008474576272, "grad_norm": 1.539085865020752, "learning_rate": 9.617981991525424e-06, "loss": 1.5663, "mean_token_accuracy": 0.675036296248436, "num_tokens": 2319436.0, "step": 2886 }, { "epoch": 0.7648305084745762, "grad_norm": 1.5347509384155273, "learning_rate": 9.61771716101695e-06, "loss": 1.4231, "mean_token_accuracy": 0.6862152367830276, "num_tokens": 2320973.0, "step": 2888 }, { "epoch": 0.7653601694915254, "grad_norm": 1.692524790763855, "learning_rate": 9.617452330508476e-06, "loss": 1.4935, "mean_token_accuracy": 0.6746900528669357, "num_tokens": 2322438.0, "step": 2890 }, { "epoch": 0.7658898305084746, "grad_norm": 1.4801422357559204, "learning_rate": 9.6171875e-06, "loss": 1.1216, "mean_token_accuracy": 0.7115805000066757, "num_tokens": 2324215.0, "step": 2892 }, { "epoch": 0.7664194915254238, "grad_norm": 1.6027146577835083, "learning_rate": 9.616922669491526e-06, "loss": 1.37, "mean_token_accuracy": 0.6736144945025444, "num_tokens": 2325797.0, "step": 2894 }, { "epoch": 0.7669491525423728, "grad_norm": 1.4635567665100098, "learning_rate": 9.61665783898305e-06, "loss": 0.835, "mean_token_accuracy": 0.789809986948967, "num_tokens": 2327460.0, "step": 2896 }, { "epoch": 0.767478813559322, "grad_norm": 1.8449289798736572, "learning_rate": 9.616393008474577e-06, "loss": 1.5884, "mean_token_accuracy": 0.6545255333185196, "num_tokens": 2329070.0, "step": 2898 }, { "epoch": 0.7680084745762712, "grad_norm": 1.7521207332611084, "learning_rate": 9.616128177966102e-06, "loss": 1.2607, "mean_token_accuracy": 0.7291818857192993, "num_tokens": 2330567.0, "step": 2900 }, { "epoch": 0.7685381355932204, "grad_norm": 1.6752533912658691, "learning_rate": 9.615863347457627e-06, "loss": 1.5451, "mean_token_accuracy": 0.6538046263158321, "num_tokens": 2332224.0, "step": 2902 }, { "epoch": 0.7690677966101694, "grad_norm": 1.6872564554214478, "learning_rate": 9.615598516949152e-06, "loss": 1.4756, "mean_token_accuracy": 0.6734939888119698, "num_tokens": 2333705.0, "step": 2904 }, { "epoch": 0.7695974576271186, "grad_norm": 1.5083262920379639, "learning_rate": 9.615333686440679e-06, "loss": 1.2155, "mean_token_accuracy": 0.7130254209041595, "num_tokens": 2335118.0, "step": 2906 }, { "epoch": 0.7701271186440678, "grad_norm": 1.2819240093231201, "learning_rate": 9.615068855932204e-06, "loss": 0.9331, "mean_token_accuracy": 0.784653827548027, "num_tokens": 2336547.0, "step": 2908 }, { "epoch": 0.770656779661017, "grad_norm": 1.5515236854553223, "learning_rate": 9.61480402542373e-06, "loss": 0.9998, "mean_token_accuracy": 0.7521337196230888, "num_tokens": 2338461.0, "step": 2910 }, { "epoch": 0.7711864406779662, "grad_norm": 1.5058645009994507, "learning_rate": 9.614539194915255e-06, "loss": 1.7636, "mean_token_accuracy": 0.6450755409896374, "num_tokens": 2340370.0, "step": 2912 }, { "epoch": 0.7717161016949152, "grad_norm": 1.5670005083084106, "learning_rate": 9.61427436440678e-06, "loss": 1.2677, "mean_token_accuracy": 0.7123004272580147, "num_tokens": 2341968.0, "step": 2914 }, { "epoch": 0.7722457627118644, "grad_norm": 1.514159083366394, "learning_rate": 9.614009533898307e-06, "loss": 1.1589, "mean_token_accuracy": 0.7343071103096008, "num_tokens": 2343520.0, "step": 2916 }, { "epoch": 0.7727754237288136, "grad_norm": 1.292349934577942, "learning_rate": 9.613744703389832e-06, "loss": 1.2397, "mean_token_accuracy": 0.7334643490612507, "num_tokens": 2345355.0, "step": 2918 }, { "epoch": 0.7733050847457628, "grad_norm": 1.5886714458465576, "learning_rate": 9.613479872881357e-06, "loss": 1.4447, "mean_token_accuracy": 0.6602598056197166, "num_tokens": 2347174.0, "step": 2920 }, { "epoch": 0.7738347457627118, "grad_norm": 1.5357393026351929, "learning_rate": 9.613215042372882e-06, "loss": 1.0545, "mean_token_accuracy": 0.7433006539940834, "num_tokens": 2348808.0, "step": 2922 }, { "epoch": 0.774364406779661, "grad_norm": 1.5712766647338867, "learning_rate": 9.612950211864408e-06, "loss": 1.4403, "mean_token_accuracy": 0.6641444750130177, "num_tokens": 2350547.0, "step": 2924 }, { "epoch": 0.7748940677966102, "grad_norm": 1.5764282941818237, "learning_rate": 9.612685381355933e-06, "loss": 1.3411, "mean_token_accuracy": 0.6819232106208801, "num_tokens": 2352207.0, "step": 2926 }, { "epoch": 0.7754237288135594, "grad_norm": 1.623403787612915, "learning_rate": 9.612420550847458e-06, "loss": 1.5042, "mean_token_accuracy": 0.6799005940556526, "num_tokens": 2353821.0, "step": 2928 }, { "epoch": 0.7759533898305084, "grad_norm": 1.543782353401184, "learning_rate": 9.612155720338983e-06, "loss": 1.4814, "mean_token_accuracy": 0.6702755466103554, "num_tokens": 2355471.0, "step": 2930 }, { "epoch": 0.7764830508474576, "grad_norm": 1.7434744834899902, "learning_rate": 9.61189088983051e-06, "loss": 1.7201, "mean_token_accuracy": 0.641256719827652, "num_tokens": 2356928.0, "step": 2932 }, { "epoch": 0.7770127118644068, "grad_norm": 1.6261625289916992, "learning_rate": 9.611626059322035e-06, "loss": 1.0291, "mean_token_accuracy": 0.7362386360764503, "num_tokens": 2358261.0, "step": 2934 }, { "epoch": 0.777542372881356, "grad_norm": 2.074873924255371, "learning_rate": 9.611361228813561e-06, "loss": 1.3473, "mean_token_accuracy": 0.6934834718704224, "num_tokens": 2359606.0, "step": 2936 }, { "epoch": 0.778072033898305, "grad_norm": 1.744102120399475, "learning_rate": 9.611096398305086e-06, "loss": 1.3408, "mean_token_accuracy": 0.6973954662680626, "num_tokens": 2361238.0, "step": 2938 }, { "epoch": 0.7786016949152542, "grad_norm": 2.5185136795043945, "learning_rate": 9.610831567796611e-06, "loss": 1.141, "mean_token_accuracy": 0.6994594037532806, "num_tokens": 2362912.0, "step": 2940 }, { "epoch": 0.7791313559322034, "grad_norm": 1.6684916019439697, "learning_rate": 9.610566737288136e-06, "loss": 1.2304, "mean_token_accuracy": 0.7290317490696907, "num_tokens": 2364438.0, "step": 2942 }, { "epoch": 0.7796610169491526, "grad_norm": 1.3521913290023804, "learning_rate": 9.610301906779663e-06, "loss": 1.2505, "mean_token_accuracy": 0.6990716382861137, "num_tokens": 2365927.0, "step": 2944 }, { "epoch": 0.7801906779661016, "grad_norm": 1.447617769241333, "learning_rate": 9.610037076271187e-06, "loss": 1.3241, "mean_token_accuracy": 0.6966891437768936, "num_tokens": 2367481.0, "step": 2946 }, { "epoch": 0.7807203389830508, "grad_norm": 1.6800650358200073, "learning_rate": 9.609772245762712e-06, "loss": 1.7155, "mean_token_accuracy": 0.6180528029799461, "num_tokens": 2369516.0, "step": 2948 }, { "epoch": 0.78125, "grad_norm": 1.4783802032470703, "learning_rate": 9.609507415254237e-06, "loss": 0.9461, "mean_token_accuracy": 0.7639515697956085, "num_tokens": 2371074.0, "step": 2950 }, { "epoch": 0.7817796610169492, "grad_norm": 1.9952894449234009, "learning_rate": 9.609242584745764e-06, "loss": 1.9783, "mean_token_accuracy": 0.5788172408938408, "num_tokens": 2372664.0, "step": 2952 }, { "epoch": 0.7823093220338984, "grad_norm": 1.5108788013458252, "learning_rate": 9.608977754237289e-06, "loss": 1.3429, "mean_token_accuracy": 0.6716051697731018, "num_tokens": 2375115.0, "step": 2954 }, { "epoch": 0.7828389830508474, "grad_norm": 1.4677541255950928, "learning_rate": 9.608712923728814e-06, "loss": 1.6399, "mean_token_accuracy": 0.6541184410452843, "num_tokens": 2376950.0, "step": 2956 }, { "epoch": 0.7833686440677966, "grad_norm": 1.81862211227417, "learning_rate": 9.608448093220339e-06, "loss": 1.2866, "mean_token_accuracy": 0.7020881399512291, "num_tokens": 2378848.0, "step": 2958 }, { "epoch": 0.7838983050847458, "grad_norm": 1.7926270961761475, "learning_rate": 9.608183262711865e-06, "loss": 1.366, "mean_token_accuracy": 0.6989993825554848, "num_tokens": 2380473.0, "step": 2960 }, { "epoch": 0.784427966101695, "grad_norm": 1.326154112815857, "learning_rate": 9.60791843220339e-06, "loss": 1.251, "mean_token_accuracy": 0.7205435633659363, "num_tokens": 2382012.0, "step": 2962 }, { "epoch": 0.784957627118644, "grad_norm": 1.4186195135116577, "learning_rate": 9.607653601694917e-06, "loss": 1.1612, "mean_token_accuracy": 0.7209725379943848, "num_tokens": 2383490.0, "step": 2964 }, { "epoch": 0.7854872881355932, "grad_norm": 1.8243741989135742, "learning_rate": 9.607388771186442e-06, "loss": 1.8653, "mean_token_accuracy": 0.6326042450964451, "num_tokens": 2384981.0, "step": 2966 }, { "epoch": 0.7860169491525424, "grad_norm": 1.8560895919799805, "learning_rate": 9.607123940677967e-06, "loss": 1.335, "mean_token_accuracy": 0.7116682603955269, "num_tokens": 2386786.0, "step": 2968 }, { "epoch": 0.7865466101694916, "grad_norm": 1.653696894645691, "learning_rate": 9.606859110169492e-06, "loss": 1.1806, "mean_token_accuracy": 0.7322837188839912, "num_tokens": 2388175.0, "step": 2970 }, { "epoch": 0.7870762711864406, "grad_norm": 1.427079677581787, "learning_rate": 9.606594279661018e-06, "loss": 0.9002, "mean_token_accuracy": 0.7661270499229431, "num_tokens": 2389464.0, "step": 2972 }, { "epoch": 0.7876059322033898, "grad_norm": 1.5083290338516235, "learning_rate": 9.606329449152543e-06, "loss": 1.6242, "mean_token_accuracy": 0.6397632881999016, "num_tokens": 2391168.0, "step": 2974 }, { "epoch": 0.788135593220339, "grad_norm": 1.6539819240570068, "learning_rate": 9.606064618644068e-06, "loss": 1.4292, "mean_token_accuracy": 0.694688655436039, "num_tokens": 2392900.0, "step": 2976 }, { "epoch": 0.7886652542372882, "grad_norm": 1.6402136087417603, "learning_rate": 9.605799788135593e-06, "loss": 1.4572, "mean_token_accuracy": 0.6690758913755417, "num_tokens": 2394471.0, "step": 2978 }, { "epoch": 0.7891949152542372, "grad_norm": 1.9773995876312256, "learning_rate": 9.60553495762712e-06, "loss": 1.5773, "mean_token_accuracy": 0.6588483974337578, "num_tokens": 2395990.0, "step": 2980 }, { "epoch": 0.7897245762711864, "grad_norm": 1.5998717546463013, "learning_rate": 9.605270127118645e-06, "loss": 1.3258, "mean_token_accuracy": 0.6971612647175789, "num_tokens": 2397769.0, "step": 2982 }, { "epoch": 0.7902542372881356, "grad_norm": 1.60707688331604, "learning_rate": 9.60500529661017e-06, "loss": 1.5895, "mean_token_accuracy": 0.6514024585485458, "num_tokens": 2399442.0, "step": 2984 }, { "epoch": 0.7907838983050848, "grad_norm": 2.0295751094818115, "learning_rate": 9.604740466101695e-06, "loss": 1.4137, "mean_token_accuracy": 0.6760813742876053, "num_tokens": 2400774.0, "step": 2986 }, { "epoch": 0.7913135593220338, "grad_norm": 1.9450911283493042, "learning_rate": 9.604475635593221e-06, "loss": 1.1512, "mean_token_accuracy": 0.7009934782981873, "num_tokens": 2402974.0, "step": 2988 }, { "epoch": 0.791843220338983, "grad_norm": 1.5878732204437256, "learning_rate": 9.604210805084746e-06, "loss": 1.0819, "mean_token_accuracy": 0.7348921447992325, "num_tokens": 2404741.0, "step": 2990 }, { "epoch": 0.7923728813559322, "grad_norm": 1.5826390981674194, "learning_rate": 9.603945974576273e-06, "loss": 1.4235, "mean_token_accuracy": 0.6849502809345722, "num_tokens": 2406220.0, "step": 2992 }, { "epoch": 0.7929025423728814, "grad_norm": 2.1744399070739746, "learning_rate": 9.603681144067798e-06, "loss": 1.6974, "mean_token_accuracy": 0.6330899447202682, "num_tokens": 2407569.0, "step": 2994 }, { "epoch": 0.7934322033898306, "grad_norm": 1.5042365789413452, "learning_rate": 9.603416313559323e-06, "loss": 1.5484, "mean_token_accuracy": 0.6596753969788551, "num_tokens": 2409569.0, "step": 2996 }, { "epoch": 0.7939618644067796, "grad_norm": 1.2636537551879883, "learning_rate": 9.60315148305085e-06, "loss": 0.7344, "mean_token_accuracy": 0.8100912421941757, "num_tokens": 2411249.0, "step": 2998 }, { "epoch": 0.7944915254237288, "grad_norm": 1.242209792137146, "learning_rate": 9.602886652542374e-06, "loss": 1.0881, "step": 3000 }, { "epoch": 0.7944915254237288, "eval_loss": 1.3289058208465576, "eval_mean_token_accuracy": 0.6974571878453354, "eval_num_tokens": 2413059.0, "eval_runtime": 48.1105, "eval_samples_per_second": 6.402, "eval_steps_per_second": 6.402, "step": 3000 }, { "epoch": 0.795021186440678, "grad_norm": 2.115696907043457, "learning_rate": 9.602621822033899e-06, "loss": 2.0076, "mean_token_accuracy": 0.6671444475650787, "num_tokens": 2414905.0, "step": 3002 }, { "epoch": 0.7955508474576272, "grad_norm": 1.5977823734283447, "learning_rate": 9.602356991525424e-06, "loss": 1.3901, "mean_token_accuracy": 0.6841437071561813, "num_tokens": 2416541.0, "step": 3004 }, { "epoch": 0.7960805084745762, "grad_norm": 1.788252830505371, "learning_rate": 9.60209216101695e-06, "loss": 1.3957, "mean_token_accuracy": 0.6951913312077522, "num_tokens": 2418123.0, "step": 3006 }, { "epoch": 0.7966101694915254, "grad_norm": 1.881927490234375, "learning_rate": 9.601827330508476e-06, "loss": 1.0834, "mean_token_accuracy": 0.7405016273260117, "num_tokens": 2419729.0, "step": 3008 }, { "epoch": 0.7971398305084746, "grad_norm": 1.5061407089233398, "learning_rate": 9.6015625e-06, "loss": 1.3359, "mean_token_accuracy": 0.7049429304897785, "num_tokens": 2421317.0, "step": 3010 }, { "epoch": 0.7976694915254238, "grad_norm": 1.4903825521469116, "learning_rate": 9.601297669491525e-06, "loss": 1.4029, "mean_token_accuracy": 0.6907465755939484, "num_tokens": 2422840.0, "step": 3012 }, { "epoch": 0.7981991525423728, "grad_norm": 2.100757122039795, "learning_rate": 9.601032838983052e-06, "loss": 1.1139, "mean_token_accuracy": 0.7295105829834938, "num_tokens": 2424486.0, "step": 3014 }, { "epoch": 0.798728813559322, "grad_norm": 1.1659289598464966, "learning_rate": 9.600768008474577e-06, "loss": 1.2143, "mean_token_accuracy": 0.7303084470331669, "num_tokens": 2426502.0, "step": 3016 }, { "epoch": 0.7992584745762712, "grad_norm": 1.4258371591567993, "learning_rate": 9.600503177966104e-06, "loss": 1.2395, "mean_token_accuracy": 0.727342963218689, "num_tokens": 2428029.0, "step": 3018 }, { "epoch": 0.7997881355932204, "grad_norm": 1.5223448276519775, "learning_rate": 9.600238347457628e-06, "loss": 1.6239, "mean_token_accuracy": 0.641804076731205, "num_tokens": 2429580.0, "step": 3020 }, { "epoch": 0.8003177966101694, "grad_norm": 1.248587965965271, "learning_rate": 9.599973516949153e-06, "loss": 1.4159, "mean_token_accuracy": 0.6884188205003738, "num_tokens": 2431429.0, "step": 3022 }, { "epoch": 0.8008474576271186, "grad_norm": 1.7445571422576904, "learning_rate": 9.599708686440678e-06, "loss": 1.1451, "mean_token_accuracy": 0.7421068102121353, "num_tokens": 2432697.0, "step": 3024 }, { "epoch": 0.8013771186440678, "grad_norm": 1.878798007965088, "learning_rate": 9.599443855932205e-06, "loss": 1.6575, "mean_token_accuracy": 0.6467843428254128, "num_tokens": 2434319.0, "step": 3026 }, { "epoch": 0.801906779661017, "grad_norm": 1.5975204706192017, "learning_rate": 9.59917902542373e-06, "loss": 1.5049, "mean_token_accuracy": 0.6756861619651318, "num_tokens": 2435915.0, "step": 3028 }, { "epoch": 0.8024364406779662, "grad_norm": 1.4792985916137695, "learning_rate": 9.598914194915255e-06, "loss": 1.1307, "mean_token_accuracy": 0.7461883574724197, "num_tokens": 2437378.0, "step": 3030 }, { "epoch": 0.8029661016949152, "grad_norm": 1.6996612548828125, "learning_rate": 9.59864936440678e-06, "loss": 1.1042, "mean_token_accuracy": 0.7274369448423386, "num_tokens": 2439021.0, "step": 3032 }, { "epoch": 0.8034957627118644, "grad_norm": 1.7471297979354858, "learning_rate": 9.598384533898306e-06, "loss": 0.9881, "mean_token_accuracy": 0.7474326938390732, "num_tokens": 2440591.0, "step": 3034 }, { "epoch": 0.8040254237288136, "grad_norm": 1.7404063940048218, "learning_rate": 9.598119703389831e-06, "loss": 1.4425, "mean_token_accuracy": 0.6851020604372025, "num_tokens": 2442309.0, "step": 3036 }, { "epoch": 0.8045550847457628, "grad_norm": 1.8490056991577148, "learning_rate": 9.597854872881356e-06, "loss": 1.2583, "mean_token_accuracy": 0.7042011618614197, "num_tokens": 2443878.0, "step": 3038 }, { "epoch": 0.8050847457627118, "grad_norm": 1.5573145151138306, "learning_rate": 9.597590042372881e-06, "loss": 1.5385, "mean_token_accuracy": 0.6683918759226799, "num_tokens": 2445448.0, "step": 3040 }, { "epoch": 0.805614406779661, "grad_norm": 1.362335443496704, "learning_rate": 9.597325211864408e-06, "loss": 1.1967, "mean_token_accuracy": 0.7153814062476158, "num_tokens": 2447134.0, "step": 3042 }, { "epoch": 0.8061440677966102, "grad_norm": 1.3666601181030273, "learning_rate": 9.597060381355933e-06, "loss": 0.8327, "mean_token_accuracy": 0.7927452474832535, "num_tokens": 2448593.0, "step": 3044 }, { "epoch": 0.8066737288135594, "grad_norm": 1.888364553451538, "learning_rate": 9.59679555084746e-06, "loss": 1.3896, "mean_token_accuracy": 0.677795983850956, "num_tokens": 2450195.0, "step": 3046 }, { "epoch": 0.8072033898305084, "grad_norm": 1.5758211612701416, "learning_rate": 9.596530720338984e-06, "loss": 1.6255, "mean_token_accuracy": 0.6680543497204781, "num_tokens": 2451841.0, "step": 3048 }, { "epoch": 0.8077330508474576, "grad_norm": 1.6150996685028076, "learning_rate": 9.59626588983051e-06, "loss": 0.9852, "mean_token_accuracy": 0.7536737620830536, "num_tokens": 2453472.0, "step": 3050 }, { "epoch": 0.8082627118644068, "grad_norm": 1.6943681240081787, "learning_rate": 9.596001059322034e-06, "loss": 1.5375, "mean_token_accuracy": 0.6656649857759476, "num_tokens": 2455140.0, "step": 3052 }, { "epoch": 0.808792372881356, "grad_norm": 1.7575572729110718, "learning_rate": 9.59573622881356e-06, "loss": 1.1133, "mean_token_accuracy": 0.716696709394455, "num_tokens": 2456595.0, "step": 3054 }, { "epoch": 0.809322033898305, "grad_norm": 1.8626477718353271, "learning_rate": 9.595471398305086e-06, "loss": 1.1478, "mean_token_accuracy": 0.7233337461948395, "num_tokens": 2458263.0, "step": 3056 }, { "epoch": 0.8098516949152542, "grad_norm": 1.567286729812622, "learning_rate": 9.59520656779661e-06, "loss": 1.7535, "mean_token_accuracy": 0.6608448326587677, "num_tokens": 2460009.0, "step": 3058 }, { "epoch": 0.8103813559322034, "grad_norm": 1.8234634399414062, "learning_rate": 9.594941737288136e-06, "loss": 1.9915, "mean_token_accuracy": 0.584881380200386, "num_tokens": 2461615.0, "step": 3060 }, { "epoch": 0.8109110169491526, "grad_norm": 1.3898372650146484, "learning_rate": 9.594676906779662e-06, "loss": 1.321, "mean_token_accuracy": 0.7076054364442825, "num_tokens": 2463555.0, "step": 3062 }, { "epoch": 0.8114406779661016, "grad_norm": 1.7198090553283691, "learning_rate": 9.594412076271187e-06, "loss": 1.1553, "mean_token_accuracy": 0.7573767453432083, "num_tokens": 2465026.0, "step": 3064 }, { "epoch": 0.8119703389830508, "grad_norm": 1.3246885538101196, "learning_rate": 9.594147245762712e-06, "loss": 1.7132, "mean_token_accuracy": 0.6308722384274006, "num_tokens": 2466929.0, "step": 3066 }, { "epoch": 0.8125, "grad_norm": 1.6124740839004517, "learning_rate": 9.593882415254237e-06, "loss": 1.2412, "mean_token_accuracy": 0.7159765921533108, "num_tokens": 2468346.0, "step": 3068 }, { "epoch": 0.8130296610169492, "grad_norm": 1.565346121788025, "learning_rate": 9.593617584745764e-06, "loss": 1.3422, "mean_token_accuracy": 0.6903858482837677, "num_tokens": 2469871.0, "step": 3070 }, { "epoch": 0.8135593220338984, "grad_norm": 1.463068962097168, "learning_rate": 9.593352754237289e-06, "loss": 1.106, "mean_token_accuracy": 0.7530204057693481, "num_tokens": 2471552.0, "step": 3072 }, { "epoch": 0.8140889830508474, "grad_norm": 1.4188357591629028, "learning_rate": 9.593087923728815e-06, "loss": 1.0953, "mean_token_accuracy": 0.7505328133702278, "num_tokens": 2473468.0, "step": 3074 }, { "epoch": 0.8146186440677966, "grad_norm": 1.4557914733886719, "learning_rate": 9.59282309322034e-06, "loss": 1.4532, "mean_token_accuracy": 0.6652631312608719, "num_tokens": 2475360.0, "step": 3076 }, { "epoch": 0.8151483050847458, "grad_norm": 1.4055378437042236, "learning_rate": 9.592558262711865e-06, "loss": 1.255, "mean_token_accuracy": 0.6966661140322685, "num_tokens": 2477153.0, "step": 3078 }, { "epoch": 0.815677966101695, "grad_norm": 1.7262248992919922, "learning_rate": 9.592293432203392e-06, "loss": 1.3131, "mean_token_accuracy": 0.7133179157972336, "num_tokens": 2478869.0, "step": 3080 }, { "epoch": 0.816207627118644, "grad_norm": 1.4727849960327148, "learning_rate": 9.592028601694917e-06, "loss": 0.9922, "mean_token_accuracy": 0.7706418558955193, "num_tokens": 2480256.0, "step": 3082 }, { "epoch": 0.8167372881355932, "grad_norm": 1.690941333770752, "learning_rate": 9.591763771186441e-06, "loss": 1.5621, "mean_token_accuracy": 0.6321833059191704, "num_tokens": 2481878.0, "step": 3084 }, { "epoch": 0.8172669491525424, "grad_norm": 1.8224161863327026, "learning_rate": 9.591498940677966e-06, "loss": 1.6238, "mean_token_accuracy": 0.6477147862315178, "num_tokens": 2483357.0, "step": 3086 }, { "epoch": 0.8177966101694916, "grad_norm": 1.9136857986450195, "learning_rate": 9.591234110169493e-06, "loss": 1.5264, "mean_token_accuracy": 0.6531047895550728, "num_tokens": 2485062.0, "step": 3088 }, { "epoch": 0.8183262711864406, "grad_norm": 1.3537628650665283, "learning_rate": 9.590969279661018e-06, "loss": 1.13, "mean_token_accuracy": 0.7538178935647011, "num_tokens": 2486582.0, "step": 3090 }, { "epoch": 0.8188559322033898, "grad_norm": 1.970312237739563, "learning_rate": 9.590704449152543e-06, "loss": 1.6476, "mean_token_accuracy": 0.6437510140240192, "num_tokens": 2488114.0, "step": 3092 }, { "epoch": 0.819385593220339, "grad_norm": 1.3660240173339844, "learning_rate": 9.590439618644068e-06, "loss": 1.209, "mean_token_accuracy": 0.7325213551521301, "num_tokens": 2489566.0, "step": 3094 }, { "epoch": 0.8199152542372882, "grad_norm": 1.6631264686584473, "learning_rate": 9.590174788135594e-06, "loss": 1.4798, "mean_token_accuracy": 0.6672581657767296, "num_tokens": 2491145.0, "step": 3096 }, { "epoch": 0.8204449152542372, "grad_norm": 1.597815752029419, "learning_rate": 9.58990995762712e-06, "loss": 1.0562, "mean_token_accuracy": 0.7542276158928871, "num_tokens": 2492766.0, "step": 3098 }, { "epoch": 0.8209745762711864, "grad_norm": 1.6362125873565674, "learning_rate": 9.589645127118646e-06, "loss": 1.3433, "mean_token_accuracy": 0.6991322934627533, "num_tokens": 2494316.0, "step": 3100 }, { "epoch": 0.8215042372881356, "grad_norm": 1.5850642919540405, "learning_rate": 9.589380296610171e-06, "loss": 1.4915, "mean_token_accuracy": 0.6884984374046326, "num_tokens": 2495663.0, "step": 3102 }, { "epoch": 0.8220338983050848, "grad_norm": 1.4563305377960205, "learning_rate": 9.589115466101696e-06, "loss": 1.1809, "mean_token_accuracy": 0.7356502935290337, "num_tokens": 2497087.0, "step": 3104 }, { "epoch": 0.8225635593220338, "grad_norm": 1.7484550476074219, "learning_rate": 9.58885063559322e-06, "loss": 1.0304, "mean_token_accuracy": 0.7491883486509323, "num_tokens": 2498310.0, "step": 3106 }, { "epoch": 0.823093220338983, "grad_norm": 1.4284539222717285, "learning_rate": 9.588585805084747e-06, "loss": 1.3506, "mean_token_accuracy": 0.6969245746731758, "num_tokens": 2500148.0, "step": 3108 }, { "epoch": 0.8236228813559322, "grad_norm": 1.9001376628875732, "learning_rate": 9.588320974576272e-06, "loss": 1.4258, "mean_token_accuracy": 0.6913331151008606, "num_tokens": 2501651.0, "step": 3110 }, { "epoch": 0.8241525423728814, "grad_norm": 1.6819791793823242, "learning_rate": 9.588056144067797e-06, "loss": 1.5397, "mean_token_accuracy": 0.6594640389084816, "num_tokens": 2503073.0, "step": 3112 }, { "epoch": 0.8246822033898306, "grad_norm": 1.6677193641662598, "learning_rate": 9.587791313559322e-06, "loss": 1.2952, "mean_token_accuracy": 0.7052221074700356, "num_tokens": 2504776.0, "step": 3114 }, { "epoch": 0.8252118644067796, "grad_norm": 1.7601819038391113, "learning_rate": 9.587526483050849e-06, "loss": 1.4865, "mean_token_accuracy": 0.6763586103916168, "num_tokens": 2506425.0, "step": 3116 }, { "epoch": 0.8257415254237288, "grad_norm": 1.4753879308700562, "learning_rate": 9.587261652542374e-06, "loss": 1.126, "mean_token_accuracy": 0.7199391908943653, "num_tokens": 2507913.0, "step": 3118 }, { "epoch": 0.826271186440678, "grad_norm": 1.2984638214111328, "learning_rate": 9.586996822033899e-06, "loss": 1.2657, "mean_token_accuracy": 0.6997589208185673, "num_tokens": 2509729.0, "step": 3120 }, { "epoch": 0.8268008474576272, "grad_norm": 1.5642119646072388, "learning_rate": 9.586731991525424e-06, "loss": 1.3722, "mean_token_accuracy": 0.6926563009619713, "num_tokens": 2511077.0, "step": 3122 }, { "epoch": 0.8273305084745762, "grad_norm": 1.988486647605896, "learning_rate": 9.58646716101695e-06, "loss": 1.1025, "mean_token_accuracy": 0.7135385498404503, "num_tokens": 2512495.0, "step": 3124 }, { "epoch": 0.8278601694915254, "grad_norm": 1.4482513666152954, "learning_rate": 9.586202330508475e-06, "loss": 1.3569, "mean_token_accuracy": 0.6883535757660866, "num_tokens": 2514243.0, "step": 3126 }, { "epoch": 0.8283898305084746, "grad_norm": 1.4381201267242432, "learning_rate": 9.585937500000002e-06, "loss": 0.9878, "mean_token_accuracy": 0.7485922873020172, "num_tokens": 2515732.0, "step": 3128 }, { "epoch": 0.8289194915254238, "grad_norm": 1.4828078746795654, "learning_rate": 9.585672669491527e-06, "loss": 1.2597, "mean_token_accuracy": 0.700862467288971, "num_tokens": 2517222.0, "step": 3130 }, { "epoch": 0.8294491525423728, "grad_norm": 2.224590301513672, "learning_rate": 9.585407838983052e-06, "loss": 1.3629, "mean_token_accuracy": 0.6848488673567772, "num_tokens": 2518706.0, "step": 3132 }, { "epoch": 0.829978813559322, "grad_norm": 1.671259880065918, "learning_rate": 9.585143008474577e-06, "loss": 1.4114, "mean_token_accuracy": 0.6660236045718193, "num_tokens": 2520381.0, "step": 3134 }, { "epoch": 0.8305084745762712, "grad_norm": 1.5347793102264404, "learning_rate": 9.584878177966103e-06, "loss": 1.3365, "mean_token_accuracy": 0.7300995364785194, "num_tokens": 2522195.0, "step": 3136 }, { "epoch": 0.8310381355932204, "grad_norm": 1.4765655994415283, "learning_rate": 9.584613347457628e-06, "loss": 0.8907, "mean_token_accuracy": 0.7812126725912094, "num_tokens": 2523924.0, "step": 3138 }, { "epoch": 0.8315677966101694, "grad_norm": 1.668418288230896, "learning_rate": 9.584348516949153e-06, "loss": 1.5292, "mean_token_accuracy": 0.6683048233389854, "num_tokens": 2525606.0, "step": 3140 }, { "epoch": 0.8320974576271186, "grad_norm": 1.529348611831665, "learning_rate": 9.584083686440678e-06, "loss": 1.2979, "mean_token_accuracy": 0.694231703877449, "num_tokens": 2527225.0, "step": 3142 }, { "epoch": 0.8326271186440678, "grad_norm": 1.2555561065673828, "learning_rate": 9.583818855932205e-06, "loss": 1.3574, "mean_token_accuracy": 0.6759310364723206, "num_tokens": 2529337.0, "step": 3144 }, { "epoch": 0.833156779661017, "grad_norm": 1.4319432973861694, "learning_rate": 9.58355402542373e-06, "loss": 0.9784, "mean_token_accuracy": 0.7554926872253418, "num_tokens": 2530860.0, "step": 3146 }, { "epoch": 0.8336864406779662, "grad_norm": 1.3059427738189697, "learning_rate": 9.583289194915254e-06, "loss": 1.0212, "mean_token_accuracy": 0.7485346570611, "num_tokens": 2532597.0, "step": 3148 }, { "epoch": 0.8342161016949152, "grad_norm": 1.4864614009857178, "learning_rate": 9.58302436440678e-06, "loss": 1.3627, "mean_token_accuracy": 0.694304883480072, "num_tokens": 2534065.0, "step": 3150 }, { "epoch": 0.8347457627118644, "grad_norm": 1.5490533113479614, "learning_rate": 9.582759533898306e-06, "loss": 1.1687, "mean_token_accuracy": 0.7245537862181664, "num_tokens": 2535510.0, "step": 3152 }, { "epoch": 0.8352754237288136, "grad_norm": 1.5746276378631592, "learning_rate": 9.582494703389831e-06, "loss": 0.952, "mean_token_accuracy": 0.773366890847683, "num_tokens": 2536854.0, "step": 3154 }, { "epoch": 0.8358050847457628, "grad_norm": 1.863966941833496, "learning_rate": 9.582229872881358e-06, "loss": 1.0841, "mean_token_accuracy": 0.7301120012998581, "num_tokens": 2538491.0, "step": 3156 }, { "epoch": 0.8363347457627118, "grad_norm": 2.1740458011627197, "learning_rate": 9.58196504237288e-06, "loss": 1.5521, "mean_token_accuracy": 0.6688883230090141, "num_tokens": 2539908.0, "step": 3158 }, { "epoch": 0.836864406779661, "grad_norm": 1.679445505142212, "learning_rate": 9.581700211864407e-06, "loss": 1.2274, "mean_token_accuracy": 0.7183366343379021, "num_tokens": 2541517.0, "step": 3160 }, { "epoch": 0.8373940677966102, "grad_norm": 1.698386788368225, "learning_rate": 9.581435381355934e-06, "loss": 1.7769, "mean_token_accuracy": 0.6175147816538811, "num_tokens": 2543102.0, "step": 3162 }, { "epoch": 0.8379237288135594, "grad_norm": 1.6286768913269043, "learning_rate": 9.581170550847459e-06, "loss": 1.345, "mean_token_accuracy": 0.6574063450098038, "num_tokens": 2545267.0, "step": 3164 }, { "epoch": 0.8384533898305084, "grad_norm": 1.4214085340499878, "learning_rate": 9.580905720338984e-06, "loss": 1.1726, "mean_token_accuracy": 0.7189093604683876, "num_tokens": 2546470.0, "step": 3166 }, { "epoch": 0.8389830508474576, "grad_norm": 1.9481914043426514, "learning_rate": 9.580640889830509e-06, "loss": 1.3727, "mean_token_accuracy": 0.7121414318680763, "num_tokens": 2547862.0, "step": 3168 }, { "epoch": 0.8395127118644068, "grad_norm": 1.766833782196045, "learning_rate": 9.580376059322035e-06, "loss": 1.4765, "mean_token_accuracy": 0.6579109355807304, "num_tokens": 2549403.0, "step": 3170 }, { "epoch": 0.840042372881356, "grad_norm": 1.6323044300079346, "learning_rate": 9.58011122881356e-06, "loss": 1.1428, "mean_token_accuracy": 0.7228894084692001, "num_tokens": 2551021.0, "step": 3172 }, { "epoch": 0.840572033898305, "grad_norm": 1.6036245822906494, "learning_rate": 9.579846398305085e-06, "loss": 1.2535, "mean_token_accuracy": 0.7176508679986, "num_tokens": 2552760.0, "step": 3174 }, { "epoch": 0.8411016949152542, "grad_norm": 1.5471162796020508, "learning_rate": 9.57958156779661e-06, "loss": 1.0427, "mean_token_accuracy": 0.7550771832466125, "num_tokens": 2554165.0, "step": 3176 }, { "epoch": 0.8416313559322034, "grad_norm": 1.2998286485671997, "learning_rate": 9.579316737288137e-06, "loss": 0.8923, "mean_token_accuracy": 0.7931646257638931, "num_tokens": 2555780.0, "step": 3178 }, { "epoch": 0.8421610169491526, "grad_norm": 1.6044684648513794, "learning_rate": 9.579051906779662e-06, "loss": 1.12, "mean_token_accuracy": 0.7556261122226715, "num_tokens": 2557012.0, "step": 3180 }, { "epoch": 0.8426906779661016, "grad_norm": 1.4599008560180664, "learning_rate": 9.578787076271188e-06, "loss": 1.1823, "mean_token_accuracy": 0.7235941663384438, "num_tokens": 2558581.0, "step": 3182 }, { "epoch": 0.8432203389830508, "grad_norm": 1.4082330465316772, "learning_rate": 9.578522245762713e-06, "loss": 1.3678, "mean_token_accuracy": 0.700145810842514, "num_tokens": 2560282.0, "step": 3184 }, { "epoch": 0.84375, "grad_norm": 1.8541096448898315, "learning_rate": 9.578257415254238e-06, "loss": 1.3364, "mean_token_accuracy": 0.6945990398526192, "num_tokens": 2561862.0, "step": 3186 }, { "epoch": 0.8442796610169492, "grad_norm": 1.8183056116104126, "learning_rate": 9.577992584745763e-06, "loss": 1.0233, "mean_token_accuracy": 0.7638964578509331, "num_tokens": 2563381.0, "step": 3188 }, { "epoch": 0.8448093220338984, "grad_norm": 1.6599663496017456, "learning_rate": 9.57772775423729e-06, "loss": 1.4633, "mean_token_accuracy": 0.6888062730431557, "num_tokens": 2564976.0, "step": 3190 }, { "epoch": 0.8453389830508474, "grad_norm": 1.5352877378463745, "learning_rate": 9.577462923728815e-06, "loss": 1.4589, "mean_token_accuracy": 0.6582476869225502, "num_tokens": 2566601.0, "step": 3192 }, { "epoch": 0.8458686440677966, "grad_norm": 1.672249674797058, "learning_rate": 9.57719809322034e-06, "loss": 1.868, "mean_token_accuracy": 0.6115191169083118, "num_tokens": 2568408.0, "step": 3194 }, { "epoch": 0.8463983050847458, "grad_norm": 1.94462251663208, "learning_rate": 9.576933262711865e-06, "loss": 1.3647, "mean_token_accuracy": 0.6636285781860352, "num_tokens": 2570845.0, "step": 3196 }, { "epoch": 0.846927966101695, "grad_norm": 1.5507718324661255, "learning_rate": 9.576668432203391e-06, "loss": 1.4527, "mean_token_accuracy": 0.6578695401549339, "num_tokens": 2572615.0, "step": 3198 }, { "epoch": 0.847457627118644, "grad_norm": 1.42585027217865, "learning_rate": 9.576403601694916e-06, "loss": 1.0152, "mean_token_accuracy": 0.7649744376540184, "num_tokens": 2574244.0, "step": 3200 }, { "epoch": 0.8479872881355932, "grad_norm": 1.5493268966674805, "learning_rate": 9.576138771186441e-06, "loss": 1.1606, "mean_token_accuracy": 0.7168730050325394, "num_tokens": 2575893.0, "step": 3202 }, { "epoch": 0.8485169491525424, "grad_norm": 1.6079336404800415, "learning_rate": 9.575873940677966e-06, "loss": 1.0078, "mean_token_accuracy": 0.7601807788014412, "num_tokens": 2577557.0, "step": 3204 }, { "epoch": 0.8490466101694916, "grad_norm": 1.3953015804290771, "learning_rate": 9.575609110169493e-06, "loss": 1.4202, "mean_token_accuracy": 0.6763884425163269, "num_tokens": 2579337.0, "step": 3206 }, { "epoch": 0.8495762711864406, "grad_norm": 1.6506205797195435, "learning_rate": 9.575344279661018e-06, "loss": 1.1809, "mean_token_accuracy": 0.7511658668518066, "num_tokens": 2580963.0, "step": 3208 }, { "epoch": 0.8501059322033898, "grad_norm": 1.900162696838379, "learning_rate": 9.575079449152544e-06, "loss": 1.2401, "mean_token_accuracy": 0.7579985782504082, "num_tokens": 2582441.0, "step": 3210 }, { "epoch": 0.850635593220339, "grad_norm": 1.8528059720993042, "learning_rate": 9.574814618644067e-06, "loss": 1.6983, "mean_token_accuracy": 0.6338490769267082, "num_tokens": 2583746.0, "step": 3212 }, { "epoch": 0.8511652542372882, "grad_norm": 1.2739440202713013, "learning_rate": 9.574549788135594e-06, "loss": 1.4537, "mean_token_accuracy": 0.6759971901774406, "num_tokens": 2586607.0, "step": 3214 }, { "epoch": 0.8516949152542372, "grad_norm": 1.5556622743606567, "learning_rate": 9.574284957627119e-06, "loss": 1.7212, "mean_token_accuracy": 0.6385447010397911, "num_tokens": 2588271.0, "step": 3216 }, { "epoch": 0.8522245762711864, "grad_norm": 1.3701708316802979, "learning_rate": 9.574020127118646e-06, "loss": 1.3657, "mean_token_accuracy": 0.6777315177023411, "num_tokens": 2589884.0, "step": 3218 }, { "epoch": 0.8527542372881356, "grad_norm": 1.261915683746338, "learning_rate": 9.57375529661017e-06, "loss": 0.7218, "mean_token_accuracy": 0.8008831813931465, "num_tokens": 2591578.0, "step": 3220 }, { "epoch": 0.8532838983050848, "grad_norm": 1.7423044443130493, "learning_rate": 9.573490466101695e-06, "loss": 1.4885, "mean_token_accuracy": 0.6696495115756989, "num_tokens": 2592910.0, "step": 3222 }, { "epoch": 0.8538135593220338, "grad_norm": 2.8263063430786133, "learning_rate": 9.57322563559322e-06, "loss": 1.1702, "mean_token_accuracy": 0.7244082242250443, "num_tokens": 2594480.0, "step": 3224 }, { "epoch": 0.854343220338983, "grad_norm": 1.7858799695968628, "learning_rate": 9.572960805084747e-06, "loss": 1.7855, "mean_token_accuracy": 0.6206600815057755, "num_tokens": 2596275.0, "step": 3226 }, { "epoch": 0.8548728813559322, "grad_norm": 1.3830502033233643, "learning_rate": 9.572695974576272e-06, "loss": 1.0963, "mean_token_accuracy": 0.7313225865364075, "num_tokens": 2597842.0, "step": 3228 }, { "epoch": 0.8554025423728814, "grad_norm": 1.8755868673324585, "learning_rate": 9.572431144067797e-06, "loss": 1.2178, "mean_token_accuracy": 0.7184169068932533, "num_tokens": 2599195.0, "step": 3230 }, { "epoch": 0.8559322033898306, "grad_norm": 1.7957994937896729, "learning_rate": 9.572166313559322e-06, "loss": 1.2562, "mean_token_accuracy": 0.7117991670966148, "num_tokens": 2600412.0, "step": 3232 }, { "epoch": 0.8564618644067796, "grad_norm": 1.3573065996170044, "learning_rate": 9.571901483050848e-06, "loss": 1.0696, "mean_token_accuracy": 0.7255676984786987, "num_tokens": 2601936.0, "step": 3234 }, { "epoch": 0.8569915254237288, "grad_norm": 1.2154194116592407, "learning_rate": 9.571636652542373e-06, "loss": 1.2372, "mean_token_accuracy": 0.7376085966825485, "num_tokens": 2603671.0, "step": 3236 }, { "epoch": 0.857521186440678, "grad_norm": 1.3683981895446777, "learning_rate": 9.5713718220339e-06, "loss": 1.3784, "mean_token_accuracy": 0.6986477337777615, "num_tokens": 2605486.0, "step": 3238 }, { "epoch": 0.8580508474576272, "grad_norm": 1.3968288898468018, "learning_rate": 9.571106991525423e-06, "loss": 1.4046, "mean_token_accuracy": 0.6741332784295082, "num_tokens": 2607371.0, "step": 3240 }, { "epoch": 0.8585805084745762, "grad_norm": 1.8310956954956055, "learning_rate": 9.57084216101695e-06, "loss": 1.8173, "mean_token_accuracy": 0.5994112230837345, "num_tokens": 2609139.0, "step": 3242 }, { "epoch": 0.8591101694915254, "grad_norm": 1.6312624216079712, "learning_rate": 9.570577330508475e-06, "loss": 1.3395, "mean_token_accuracy": 0.6894189864397049, "num_tokens": 2610617.0, "step": 3244 }, { "epoch": 0.8596398305084746, "grad_norm": 1.5482122898101807, "learning_rate": 9.570312500000001e-06, "loss": 1.0778, "mean_token_accuracy": 0.7285861819982529, "num_tokens": 2612078.0, "step": 3246 }, { "epoch": 0.8601694915254238, "grad_norm": 1.469797968864441, "learning_rate": 9.570047669491526e-06, "loss": 1.4774, "mean_token_accuracy": 0.6935736574232578, "num_tokens": 2613625.0, "step": 3248 }, { "epoch": 0.8606991525423728, "grad_norm": 1.6517046689987183, "learning_rate": 9.569782838983051e-06, "loss": 1.4324, "step": 3250 }, { "epoch": 0.8606991525423728, "eval_loss": 1.328011393547058, "eval_mean_token_accuracy": 0.697213012870256, "eval_num_tokens": 2615537.0, "eval_runtime": 48.3037, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 3250 }, { "epoch": 0.861228813559322, "grad_norm": 1.9502676725387573, "learning_rate": 9.569518008474578e-06, "loss": 1.5483, "mean_token_accuracy": 0.6781592071056366, "num_tokens": 2616927.0, "step": 3252 }, { "epoch": 0.8617584745762712, "grad_norm": 1.769891381263733, "learning_rate": 9.569253177966103e-06, "loss": 1.2579, "mean_token_accuracy": 0.7342043817043304, "num_tokens": 2618495.0, "step": 3254 }, { "epoch": 0.8622881355932204, "grad_norm": 1.4239153861999512, "learning_rate": 9.568988347457628e-06, "loss": 1.4858, "mean_token_accuracy": 0.6829873360693455, "num_tokens": 2619948.0, "step": 3256 }, { "epoch": 0.8628177966101694, "grad_norm": 1.6926257610321045, "learning_rate": 9.568723516949153e-06, "loss": 1.3146, "mean_token_accuracy": 0.6994315683841705, "num_tokens": 2621371.0, "step": 3258 }, { "epoch": 0.8633474576271186, "grad_norm": 1.3792309761047363, "learning_rate": 9.56845868644068e-06, "loss": 1.2962, "mean_token_accuracy": 0.7115645222365856, "num_tokens": 2623230.0, "step": 3260 }, { "epoch": 0.8638771186440678, "grad_norm": 1.8649072647094727, "learning_rate": 9.568193855932204e-06, "loss": 1.5341, "mean_token_accuracy": 0.6440626084804535, "num_tokens": 2624796.0, "step": 3262 }, { "epoch": 0.864406779661017, "grad_norm": 1.6184678077697754, "learning_rate": 9.56792902542373e-06, "loss": 1.0101, "mean_token_accuracy": 0.752899095416069, "num_tokens": 2626209.0, "step": 3264 }, { "epoch": 0.8649364406779662, "grad_norm": 1.9109504222869873, "learning_rate": 9.567664194915254e-06, "loss": 1.2783, "mean_token_accuracy": 0.7037730365991592, "num_tokens": 2627444.0, "step": 3266 }, { "epoch": 0.8654661016949152, "grad_norm": 1.82469642162323, "learning_rate": 9.56739936440678e-06, "loss": 1.451, "mean_token_accuracy": 0.668417178094387, "num_tokens": 2629107.0, "step": 3268 }, { "epoch": 0.8659957627118644, "grad_norm": 1.5692976713180542, "learning_rate": 9.567134533898306e-06, "loss": 1.4959, "mean_token_accuracy": 0.6611413806676865, "num_tokens": 2630696.0, "step": 3270 }, { "epoch": 0.8665254237288136, "grad_norm": 1.5370901823043823, "learning_rate": 9.566869703389832e-06, "loss": 1.4717, "mean_token_accuracy": 0.6949712336063385, "num_tokens": 2632291.0, "step": 3272 }, { "epoch": 0.8670550847457628, "grad_norm": 1.4417169094085693, "learning_rate": 9.566604872881357e-06, "loss": 1.1374, "mean_token_accuracy": 0.7133355960249901, "num_tokens": 2633770.0, "step": 3274 }, { "epoch": 0.8675847457627118, "grad_norm": 2.1669270992279053, "learning_rate": 9.566340042372882e-06, "loss": 1.2705, "mean_token_accuracy": 0.7329373136162758, "num_tokens": 2635136.0, "step": 3276 }, { "epoch": 0.868114406779661, "grad_norm": 1.3597266674041748, "learning_rate": 9.566075211864407e-06, "loss": 1.1053, "mean_token_accuracy": 0.7409014850854874, "num_tokens": 2636627.0, "step": 3278 }, { "epoch": 0.8686440677966102, "grad_norm": 1.5516622066497803, "learning_rate": 9.565810381355934e-06, "loss": 1.2185, "mean_token_accuracy": 0.6924097612500191, "num_tokens": 2638074.0, "step": 3280 }, { "epoch": 0.8691737288135594, "grad_norm": 1.179207682609558, "learning_rate": 9.565545550847459e-06, "loss": 1.2273, "mean_token_accuracy": 0.6959324702620506, "num_tokens": 2640214.0, "step": 3282 }, { "epoch": 0.8697033898305084, "grad_norm": 1.6279813051223755, "learning_rate": 9.565280720338984e-06, "loss": 1.5966, "mean_token_accuracy": 0.6411900743842125, "num_tokens": 2641795.0, "step": 3284 }, { "epoch": 0.8702330508474576, "grad_norm": 1.7119216918945312, "learning_rate": 9.565015889830508e-06, "loss": 1.6345, "mean_token_accuracy": 0.6429794728755951, "num_tokens": 2643564.0, "step": 3286 }, { "epoch": 0.8707627118644068, "grad_norm": 1.2818711996078491, "learning_rate": 9.564751059322035e-06, "loss": 1.2758, "mean_token_accuracy": 0.6891579031944275, "num_tokens": 2645301.0, "step": 3288 }, { "epoch": 0.871292372881356, "grad_norm": 1.8126875162124634, "learning_rate": 9.56448622881356e-06, "loss": 1.2588, "mean_token_accuracy": 0.6976844742894173, "num_tokens": 2646721.0, "step": 3290 }, { "epoch": 0.871822033898305, "grad_norm": 1.5241203308105469, "learning_rate": 9.564221398305087e-06, "loss": 1.5012, "mean_token_accuracy": 0.6639905050396919, "num_tokens": 2648206.0, "step": 3292 }, { "epoch": 0.8723516949152542, "grad_norm": 1.3853946924209595, "learning_rate": 9.56395656779661e-06, "loss": 1.1613, "mean_token_accuracy": 0.7215872406959534, "num_tokens": 2650093.0, "step": 3294 }, { "epoch": 0.8728813559322034, "grad_norm": 1.364391565322876, "learning_rate": 9.563691737288136e-06, "loss": 1.0536, "mean_token_accuracy": 0.7394764572381973, "num_tokens": 2651672.0, "step": 3296 }, { "epoch": 0.8734110169491526, "grad_norm": 1.8405486345291138, "learning_rate": 9.563426906779661e-06, "loss": 1.8774, "mean_token_accuracy": 0.6153829321265221, "num_tokens": 2653605.0, "step": 3298 }, { "epoch": 0.8739406779661016, "grad_norm": 1.585245132446289, "learning_rate": 9.563162076271188e-06, "loss": 1.1027, "mean_token_accuracy": 0.7457442358136177, "num_tokens": 2654814.0, "step": 3300 }, { "epoch": 0.8744703389830508, "grad_norm": 1.3120359182357788, "learning_rate": 9.562897245762713e-06, "loss": 1.3916, "mean_token_accuracy": 0.6878783330321312, "num_tokens": 2656813.0, "step": 3302 }, { "epoch": 0.875, "grad_norm": 1.3384206295013428, "learning_rate": 9.562632415254238e-06, "loss": 1.333, "mean_token_accuracy": 0.7139920368790627, "num_tokens": 2658276.0, "step": 3304 }, { "epoch": 0.8755296610169492, "grad_norm": 1.98054838180542, "learning_rate": 9.562367584745763e-06, "loss": 1.4949, "mean_token_accuracy": 0.6556197926402092, "num_tokens": 2659661.0, "step": 3306 }, { "epoch": 0.8760593220338984, "grad_norm": 2.0461480617523193, "learning_rate": 9.56210275423729e-06, "loss": 2.0496, "mean_token_accuracy": 0.5787624269723892, "num_tokens": 2661322.0, "step": 3308 }, { "epoch": 0.8765889830508474, "grad_norm": 1.2147667407989502, "learning_rate": 9.561837923728814e-06, "loss": 1.3689, "mean_token_accuracy": 0.7098994627594948, "num_tokens": 2663425.0, "step": 3310 }, { "epoch": 0.8771186440677966, "grad_norm": 1.8126635551452637, "learning_rate": 9.56157309322034e-06, "loss": 1.2968, "mean_token_accuracy": 0.7168691456317902, "num_tokens": 2664784.0, "step": 3312 }, { "epoch": 0.8776483050847458, "grad_norm": 1.831099271774292, "learning_rate": 9.561308262711864e-06, "loss": 1.2481, "mean_token_accuracy": 0.7202161848545074, "num_tokens": 2666274.0, "step": 3314 }, { "epoch": 0.878177966101695, "grad_norm": 1.2910308837890625, "learning_rate": 9.56104343220339e-06, "loss": 0.8154, "mean_token_accuracy": 0.7895848304033279, "num_tokens": 2667778.0, "step": 3316 }, { "epoch": 0.878707627118644, "grad_norm": 1.8107024431228638, "learning_rate": 9.560778601694916e-06, "loss": 1.7885, "mean_token_accuracy": 0.6494912542402744, "num_tokens": 2669272.0, "step": 3318 }, { "epoch": 0.8792372881355932, "grad_norm": 1.627544641494751, "learning_rate": 9.56051377118644e-06, "loss": 1.4132, "mean_token_accuracy": 0.6743301562964916, "num_tokens": 2671201.0, "step": 3320 }, { "epoch": 0.8797669491525424, "grad_norm": 1.7586870193481445, "learning_rate": 9.560248940677966e-06, "loss": 1.8419, "mean_token_accuracy": 0.6117073632776737, "num_tokens": 2672835.0, "step": 3322 }, { "epoch": 0.8802966101694916, "grad_norm": 1.5796265602111816, "learning_rate": 9.559984110169492e-06, "loss": 1.0398, "mean_token_accuracy": 0.7608628310263157, "num_tokens": 2674196.0, "step": 3324 }, { "epoch": 0.8808262711864406, "grad_norm": 1.9170600175857544, "learning_rate": 9.559719279661017e-06, "loss": 1.0831, "mean_token_accuracy": 0.7598159089684486, "num_tokens": 2675614.0, "step": 3326 }, { "epoch": 0.8813559322033898, "grad_norm": 1.4346927404403687, "learning_rate": 9.559454449152544e-06, "loss": 1.2714, "mean_token_accuracy": 0.7117888629436493, "num_tokens": 2677049.0, "step": 3328 }, { "epoch": 0.881885593220339, "grad_norm": 1.587651014328003, "learning_rate": 9.559189618644069e-06, "loss": 1.4503, "mean_token_accuracy": 0.6849271059036255, "num_tokens": 2678840.0, "step": 3330 }, { "epoch": 0.8824152542372882, "grad_norm": 1.5421488285064697, "learning_rate": 9.558924788135594e-06, "loss": 1.4003, "mean_token_accuracy": 0.681832417845726, "num_tokens": 2680780.0, "step": 3332 }, { "epoch": 0.8829449152542372, "grad_norm": 1.538063645362854, "learning_rate": 9.55865995762712e-06, "loss": 1.7304, "mean_token_accuracy": 0.6523048803210258, "num_tokens": 2682337.0, "step": 3334 }, { "epoch": 0.8834745762711864, "grad_norm": 1.6558656692504883, "learning_rate": 9.558395127118645e-06, "loss": 1.4536, "mean_token_accuracy": 0.6717663630843163, "num_tokens": 2683939.0, "step": 3336 }, { "epoch": 0.8840042372881356, "grad_norm": 1.5309616327285767, "learning_rate": 9.55813029661017e-06, "loss": 1.5566, "mean_token_accuracy": 0.6465427204966545, "num_tokens": 2685696.0, "step": 3338 }, { "epoch": 0.8845338983050848, "grad_norm": 1.7279539108276367, "learning_rate": 9.557865466101695e-06, "loss": 1.1312, "mean_token_accuracy": 0.7307984456419945, "num_tokens": 2687363.0, "step": 3340 }, { "epoch": 0.8850635593220338, "grad_norm": 1.7535760402679443, "learning_rate": 9.557600635593222e-06, "loss": 0.9331, "mean_token_accuracy": 0.7560064643621445, "num_tokens": 2688841.0, "step": 3342 }, { "epoch": 0.885593220338983, "grad_norm": 1.7015831470489502, "learning_rate": 9.557335805084747e-06, "loss": 1.3413, "mean_token_accuracy": 0.7022869735956192, "num_tokens": 2690394.0, "step": 3344 }, { "epoch": 0.8861228813559322, "grad_norm": 1.6472452878952026, "learning_rate": 9.557070974576273e-06, "loss": 1.27, "mean_token_accuracy": 0.6913130655884743, "num_tokens": 2692303.0, "step": 3346 }, { "epoch": 0.8866525423728814, "grad_norm": 1.706638216972351, "learning_rate": 9.556806144067796e-06, "loss": 1.3384, "mean_token_accuracy": 0.7152146399021149, "num_tokens": 2693806.0, "step": 3348 }, { "epoch": 0.8871822033898306, "grad_norm": 1.244397759437561, "learning_rate": 9.556541313559323e-06, "loss": 1.2483, "mean_token_accuracy": 0.6927303485572338, "num_tokens": 2695607.0, "step": 3350 }, { "epoch": 0.8877118644067796, "grad_norm": 1.5496824979782104, "learning_rate": 9.556276483050848e-06, "loss": 1.2147, "mean_token_accuracy": 0.72090695053339, "num_tokens": 2697164.0, "step": 3352 }, { "epoch": 0.8882415254237288, "grad_norm": 1.50038743019104, "learning_rate": 9.556011652542375e-06, "loss": 1.3938, "mean_token_accuracy": 0.6684185042977333, "num_tokens": 2699398.0, "step": 3354 }, { "epoch": 0.888771186440678, "grad_norm": 1.8330506086349487, "learning_rate": 9.5557468220339e-06, "loss": 1.7236, "mean_token_accuracy": 0.6321107596158981, "num_tokens": 2701077.0, "step": 3356 }, { "epoch": 0.8893008474576272, "grad_norm": 1.69051194190979, "learning_rate": 9.555481991525425e-06, "loss": 1.5481, "mean_token_accuracy": 0.6702527329325676, "num_tokens": 2702620.0, "step": 3358 }, { "epoch": 0.8898305084745762, "grad_norm": 1.4311484098434448, "learning_rate": 9.55521716101695e-06, "loss": 0.9458, "mean_token_accuracy": 0.7649907395243645, "num_tokens": 2704359.0, "step": 3360 }, { "epoch": 0.8903601694915254, "grad_norm": 1.7363201379776, "learning_rate": 9.554952330508476e-06, "loss": 1.5062, "mean_token_accuracy": 0.6731546856462955, "num_tokens": 2705901.0, "step": 3362 }, { "epoch": 0.8908898305084746, "grad_norm": 1.9462976455688477, "learning_rate": 9.554687500000001e-06, "loss": 1.1912, "mean_token_accuracy": 0.7197122499346733, "num_tokens": 2707429.0, "step": 3364 }, { "epoch": 0.8914194915254238, "grad_norm": 1.615322470664978, "learning_rate": 9.554422669491526e-06, "loss": 1.4106, "mean_token_accuracy": 0.6691372357308865, "num_tokens": 2709021.0, "step": 3366 }, { "epoch": 0.8919491525423728, "grad_norm": 1.7801110744476318, "learning_rate": 9.554157838983051e-06, "loss": 1.5862, "mean_token_accuracy": 0.6576265394687653, "num_tokens": 2710668.0, "step": 3368 }, { "epoch": 0.892478813559322, "grad_norm": 1.3273861408233643, "learning_rate": 9.553893008474577e-06, "loss": 1.1861, "mean_token_accuracy": 0.71761304885149, "num_tokens": 2712381.0, "step": 3370 }, { "epoch": 0.8930084745762712, "grad_norm": 1.2858200073242188, "learning_rate": 9.553628177966102e-06, "loss": 1.1066, "mean_token_accuracy": 0.7350544817745686, "num_tokens": 2714295.0, "step": 3372 }, { "epoch": 0.8935381355932204, "grad_norm": 1.4437144994735718, "learning_rate": 9.553363347457627e-06, "loss": 1.4134, "mean_token_accuracy": 0.6686553135514259, "num_tokens": 2715846.0, "step": 3374 }, { "epoch": 0.8940677966101694, "grad_norm": 2.355597734451294, "learning_rate": 9.553098516949152e-06, "loss": 1.8703, "mean_token_accuracy": 0.6387880146503448, "num_tokens": 2717417.0, "step": 3376 }, { "epoch": 0.8945974576271186, "grad_norm": 2.150740385055542, "learning_rate": 9.552833686440679e-06, "loss": 1.6843, "mean_token_accuracy": 0.6642080321907997, "num_tokens": 2718847.0, "step": 3378 }, { "epoch": 0.8951271186440678, "grad_norm": 1.9333081245422363, "learning_rate": 9.552568855932204e-06, "loss": 1.1428, "mean_token_accuracy": 0.7366986274719238, "num_tokens": 2720239.0, "step": 3380 }, { "epoch": 0.895656779661017, "grad_norm": 1.7974990606307983, "learning_rate": 9.55230402542373e-06, "loss": 1.3884, "mean_token_accuracy": 0.6861933991312981, "num_tokens": 2721680.0, "step": 3382 }, { "epoch": 0.8961864406779662, "grad_norm": 1.8328425884246826, "learning_rate": 9.552039194915255e-06, "loss": 1.1059, "mean_token_accuracy": 0.7444720342755318, "num_tokens": 2723366.0, "step": 3384 }, { "epoch": 0.8967161016949152, "grad_norm": 1.2815073728561401, "learning_rate": 9.55177436440678e-06, "loss": 1.0814, "mean_token_accuracy": 0.7355170026421547, "num_tokens": 2725356.0, "step": 3386 }, { "epoch": 0.8972457627118644, "grad_norm": 1.6962774991989136, "learning_rate": 9.551509533898305e-06, "loss": 1.5974, "mean_token_accuracy": 0.652843177318573, "num_tokens": 2727039.0, "step": 3388 }, { "epoch": 0.8977754237288136, "grad_norm": 1.5066652297973633, "learning_rate": 9.551244703389832e-06, "loss": 1.2418, "mean_token_accuracy": 0.69996327906847, "num_tokens": 2728654.0, "step": 3390 }, { "epoch": 0.8983050847457628, "grad_norm": 1.4371848106384277, "learning_rate": 9.550979872881357e-06, "loss": 1.5112, "mean_token_accuracy": 0.6399385370314121, "num_tokens": 2730306.0, "step": 3392 }, { "epoch": 0.8988347457627118, "grad_norm": 1.4931561946868896, "learning_rate": 9.550715042372882e-06, "loss": 1.7304, "mean_token_accuracy": 0.6413604170084, "num_tokens": 2732045.0, "step": 3394 }, { "epoch": 0.899364406779661, "grad_norm": 1.9134941101074219, "learning_rate": 9.550450211864407e-06, "loss": 1.6335, "mean_token_accuracy": 0.6558073908090591, "num_tokens": 2733512.0, "step": 3396 }, { "epoch": 0.8998940677966102, "grad_norm": 1.6374924182891846, "learning_rate": 9.550185381355933e-06, "loss": 1.3278, "mean_token_accuracy": 0.7211490571498871, "num_tokens": 2735027.0, "step": 3398 }, { "epoch": 0.9004237288135594, "grad_norm": 1.7182742357254028, "learning_rate": 9.549920550847458e-06, "loss": 1.4029, "mean_token_accuracy": 0.672432191669941, "num_tokens": 2736644.0, "step": 3400 }, { "epoch": 0.9009533898305084, "grad_norm": 1.5005128383636475, "learning_rate": 9.549655720338983e-06, "loss": 1.2278, "mean_token_accuracy": 0.7048955783247948, "num_tokens": 2738336.0, "step": 3402 }, { "epoch": 0.9014830508474576, "grad_norm": 1.2513401508331299, "learning_rate": 9.549390889830508e-06, "loss": 1.01, "mean_token_accuracy": 0.7515593841671944, "num_tokens": 2739972.0, "step": 3404 }, { "epoch": 0.9020127118644068, "grad_norm": 1.666323184967041, "learning_rate": 9.549126059322035e-06, "loss": 1.3624, "mean_token_accuracy": 0.6831006295979023, "num_tokens": 2741467.0, "step": 3406 }, { "epoch": 0.902542372881356, "grad_norm": 1.5778802633285522, "learning_rate": 9.54886122881356e-06, "loss": 1.35, "mean_token_accuracy": 0.6854526251554489, "num_tokens": 2742956.0, "step": 3408 }, { "epoch": 0.903072033898305, "grad_norm": 1.8498594760894775, "learning_rate": 9.548596398305086e-06, "loss": 1.3985, "mean_token_accuracy": 0.6871817111968994, "num_tokens": 2744390.0, "step": 3410 }, { "epoch": 0.9036016949152542, "grad_norm": 1.548817753791809, "learning_rate": 9.548331567796611e-06, "loss": 1.3687, "mean_token_accuracy": 0.6878243535757065, "num_tokens": 2745839.0, "step": 3412 }, { "epoch": 0.9041313559322034, "grad_norm": 1.584370732307434, "learning_rate": 9.548066737288136e-06, "loss": 1.6253, "mean_token_accuracy": 0.6525730192661285, "num_tokens": 2747489.0, "step": 3414 }, { "epoch": 0.9046610169491526, "grad_norm": 2.037295341491699, "learning_rate": 9.547801906779663e-06, "loss": 1.5783, "mean_token_accuracy": 0.6639766469597816, "num_tokens": 2749056.0, "step": 3416 }, { "epoch": 0.9051906779661016, "grad_norm": 1.742250919342041, "learning_rate": 9.547537076271188e-06, "loss": 1.5818, "mean_token_accuracy": 0.6411214470863342, "num_tokens": 2750786.0, "step": 3418 }, { "epoch": 0.9057203389830508, "grad_norm": 1.5172840356826782, "learning_rate": 9.547272245762713e-06, "loss": 1.2048, "mean_token_accuracy": 0.7008979991078377, "num_tokens": 2752353.0, "step": 3420 }, { "epoch": 0.90625, "grad_norm": 1.4364404678344727, "learning_rate": 9.547007415254238e-06, "loss": 1.2406, "mean_token_accuracy": 0.7158918604254723, "num_tokens": 2754067.0, "step": 3422 }, { "epoch": 0.9067796610169492, "grad_norm": 1.846413016319275, "learning_rate": 9.546742584745764e-06, "loss": 1.4594, "mean_token_accuracy": 0.6648857742547989, "num_tokens": 2755663.0, "step": 3424 }, { "epoch": 0.9073093220338984, "grad_norm": 1.3385158777236938, "learning_rate": 9.546477754237289e-06, "loss": 1.0556, "mean_token_accuracy": 0.7535948902368546, "num_tokens": 2757025.0, "step": 3426 }, { "epoch": 0.9078389830508474, "grad_norm": 1.671907663345337, "learning_rate": 9.546212923728814e-06, "loss": 1.676, "mean_token_accuracy": 0.6545000001788139, "num_tokens": 2758616.0, "step": 3428 }, { "epoch": 0.9083686440677966, "grad_norm": 1.546064853668213, "learning_rate": 9.545948093220339e-06, "loss": 1.3099, "mean_token_accuracy": 0.6959495395421982, "num_tokens": 2759927.0, "step": 3430 }, { "epoch": 0.9088983050847458, "grad_norm": 1.6101605892181396, "learning_rate": 9.545683262711866e-06, "loss": 1.6686, "mean_token_accuracy": 0.6475066766142845, "num_tokens": 2761268.0, "step": 3432 }, { "epoch": 0.909427966101695, "grad_norm": 1.8551125526428223, "learning_rate": 9.54541843220339e-06, "loss": 1.6151, "mean_token_accuracy": 0.6428816914558411, "num_tokens": 2762853.0, "step": 3434 }, { "epoch": 0.909957627118644, "grad_norm": 1.634961724281311, "learning_rate": 9.545153601694917e-06, "loss": 1.0982, "mean_token_accuracy": 0.7486774697899818, "num_tokens": 2764662.0, "step": 3436 }, { "epoch": 0.9104872881355932, "grad_norm": 1.9022672176361084, "learning_rate": 9.544888771186442e-06, "loss": 1.3415, "mean_token_accuracy": 0.6862611994147301, "num_tokens": 2766233.0, "step": 3438 }, { "epoch": 0.9110169491525424, "grad_norm": 1.7207163572311401, "learning_rate": 9.544623940677967e-06, "loss": 1.2722, "mean_token_accuracy": 0.7011071071028709, "num_tokens": 2767970.0, "step": 3440 }, { "epoch": 0.9115466101694916, "grad_norm": 1.6090997457504272, "learning_rate": 9.544359110169492e-06, "loss": 1.5479, "mean_token_accuracy": 0.6578491181135178, "num_tokens": 2769582.0, "step": 3442 }, { "epoch": 0.9120762711864406, "grad_norm": 1.6684272289276123, "learning_rate": 9.544094279661019e-06, "loss": 0.9583, "mean_token_accuracy": 0.7683761715888977, "num_tokens": 2770998.0, "step": 3444 }, { "epoch": 0.9126059322033898, "grad_norm": 2.061568260192871, "learning_rate": 9.543829449152543e-06, "loss": 1.6906, "mean_token_accuracy": 0.6436370760202408, "num_tokens": 2772871.0, "step": 3446 }, { "epoch": 0.913135593220339, "grad_norm": 1.8956542015075684, "learning_rate": 9.543564618644068e-06, "loss": 1.804, "mean_token_accuracy": 0.6237568147480488, "num_tokens": 2774682.0, "step": 3448 }, { "epoch": 0.9136652542372882, "grad_norm": 1.5936200618743896, "learning_rate": 9.543299788135593e-06, "loss": 1.0445, "mean_token_accuracy": 0.7369365692138672, "num_tokens": 2776019.0, "step": 3450 }, { "epoch": 0.9141949152542372, "grad_norm": 1.9098782539367676, "learning_rate": 9.54303495762712e-06, "loss": 1.4456, "mean_token_accuracy": 0.6864410266280174, "num_tokens": 2777366.0, "step": 3452 }, { "epoch": 0.9147245762711864, "grad_norm": 2.0008387565612793, "learning_rate": 9.542770127118645e-06, "loss": 1.1727, "mean_token_accuracy": 0.7344788014888763, "num_tokens": 2779017.0, "step": 3454 }, { "epoch": 0.9152542372881356, "grad_norm": 1.4903030395507812, "learning_rate": 9.54250529661017e-06, "loss": 1.4578, "mean_token_accuracy": 0.6712779439985752, "num_tokens": 2780782.0, "step": 3456 }, { "epoch": 0.9157838983050848, "grad_norm": 1.4017447233200073, "learning_rate": 9.542240466101695e-06, "loss": 1.0935, "mean_token_accuracy": 0.7151613235473633, "num_tokens": 2782615.0, "step": 3458 }, { "epoch": 0.9163135593220338, "grad_norm": 1.6623297929763794, "learning_rate": 9.541975635593221e-06, "loss": 1.2214, "mean_token_accuracy": 0.7107710614800453, "num_tokens": 2784234.0, "step": 3460 }, { "epoch": 0.916843220338983, "grad_norm": 1.7815848588943481, "learning_rate": 9.541710805084746e-06, "loss": 1.4504, "mean_token_accuracy": 0.6521429121494293, "num_tokens": 2785678.0, "step": 3462 }, { "epoch": 0.9173728813559322, "grad_norm": 1.3898735046386719, "learning_rate": 9.541445974576273e-06, "loss": 1.42, "mean_token_accuracy": 0.6830220818519592, "num_tokens": 2787073.0, "step": 3464 }, { "epoch": 0.9179025423728814, "grad_norm": 1.1971851587295532, "learning_rate": 9.541181144067798e-06, "loss": 1.2393, "mean_token_accuracy": 0.7154681794345379, "num_tokens": 2788908.0, "step": 3466 }, { "epoch": 0.9184322033898306, "grad_norm": 1.1953204870224, "learning_rate": 9.540916313559323e-06, "loss": 1.3045, "mean_token_accuracy": 0.7230152562260628, "num_tokens": 2790687.0, "step": 3468 }, { "epoch": 0.9189618644067796, "grad_norm": 1.7296940088272095, "learning_rate": 9.540651483050848e-06, "loss": 1.5126, "mean_token_accuracy": 0.6764400452375412, "num_tokens": 2792230.0, "step": 3470 }, { "epoch": 0.9194915254237288, "grad_norm": 1.5730465650558472, "learning_rate": 9.540386652542374e-06, "loss": 2.0921, "mean_token_accuracy": 0.5507446303963661, "num_tokens": 2794041.0, "step": 3472 }, { "epoch": 0.920021186440678, "grad_norm": 1.5632861852645874, "learning_rate": 9.5401218220339e-06, "loss": 1.7096, "mean_token_accuracy": 0.6341430097818375, "num_tokens": 2795620.0, "step": 3474 }, { "epoch": 0.9205508474576272, "grad_norm": 1.5015010833740234, "learning_rate": 9.539856991525424e-06, "loss": 1.0448, "mean_token_accuracy": 0.7544423192739487, "num_tokens": 2797432.0, "step": 3476 }, { "epoch": 0.9210805084745762, "grad_norm": 1.4251437187194824, "learning_rate": 9.539592161016949e-06, "loss": 1.3297, "mean_token_accuracy": 0.7015016302466393, "num_tokens": 2799296.0, "step": 3478 }, { "epoch": 0.9216101694915254, "grad_norm": 1.6713290214538574, "learning_rate": 9.539327330508476e-06, "loss": 1.9921, "mean_token_accuracy": 0.6082463264465332, "num_tokens": 2800928.0, "step": 3480 }, { "epoch": 0.9221398305084746, "grad_norm": 1.9311776161193848, "learning_rate": 9.5390625e-06, "loss": 1.0438, "mean_token_accuracy": 0.7277176007628441, "num_tokens": 2802575.0, "step": 3482 }, { "epoch": 0.9226694915254238, "grad_norm": 1.3159773349761963, "learning_rate": 9.538797669491526e-06, "loss": 1.0717, "mean_token_accuracy": 0.7468093782663345, "num_tokens": 2803943.0, "step": 3484 }, { "epoch": 0.9231991525423728, "grad_norm": 2.1605610847473145, "learning_rate": 9.53853283898305e-06, "loss": 1.6425, "mean_token_accuracy": 0.6316256672143936, "num_tokens": 2805385.0, "step": 3486 }, { "epoch": 0.923728813559322, "grad_norm": 1.709634780883789, "learning_rate": 9.538268008474577e-06, "loss": 1.7617, "mean_token_accuracy": 0.618764229118824, "num_tokens": 2807269.0, "step": 3488 }, { "epoch": 0.9242584745762712, "grad_norm": 1.5180529356002808, "learning_rate": 9.538003177966102e-06, "loss": 1.2717, "mean_token_accuracy": 0.7010201215744019, "num_tokens": 2808969.0, "step": 3490 }, { "epoch": 0.9247881355932204, "grad_norm": 1.457653522491455, "learning_rate": 9.537738347457629e-06, "loss": 1.3096, "mean_token_accuracy": 0.6825432106852531, "num_tokens": 2810634.0, "step": 3492 }, { "epoch": 0.9253177966101694, "grad_norm": 1.6724576950073242, "learning_rate": 9.537473516949154e-06, "loss": 1.628, "mean_token_accuracy": 0.6442235559225082, "num_tokens": 2812159.0, "step": 3494 }, { "epoch": 0.9258474576271186, "grad_norm": 1.5883703231811523, "learning_rate": 9.537208686440679e-06, "loss": 1.5159, "mean_token_accuracy": 0.6672632470726967, "num_tokens": 2813853.0, "step": 3496 }, { "epoch": 0.9263771186440678, "grad_norm": 2.0057265758514404, "learning_rate": 9.536943855932203e-06, "loss": 1.0445, "mean_token_accuracy": 0.7553988695144653, "num_tokens": 2815425.0, "step": 3498 }, { "epoch": 0.926906779661017, "grad_norm": 1.6037631034851074, "learning_rate": 9.53667902542373e-06, "loss": 1.5253, "step": 3500 }, { "epoch": 0.926906779661017, "eval_loss": 1.3262605667114258, "eval_mean_token_accuracy": 0.698415271550804, "eval_num_tokens": 2817253.0, "eval_runtime": 48.3099, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 3500 }, { "epoch": 0.9274364406779662, "grad_norm": 1.7915284633636475, "learning_rate": 9.536414194915255e-06, "loss": 1.5389, "mean_token_accuracy": 0.6728759594261646, "num_tokens": 2818549.0, "step": 3502 }, { "epoch": 0.9279661016949152, "grad_norm": 1.5347011089324951, "learning_rate": 9.53614936440678e-06, "loss": 1.0632, "mean_token_accuracy": 0.7454593926668167, "num_tokens": 2820052.0, "step": 3504 }, { "epoch": 0.9284957627118644, "grad_norm": 1.9962307214736938, "learning_rate": 9.535884533898307e-06, "loss": 1.6072, "mean_token_accuracy": 0.6665224134922028, "num_tokens": 2821407.0, "step": 3506 }, { "epoch": 0.9290254237288136, "grad_norm": 1.1197797060012817, "learning_rate": 9.535619703389831e-06, "loss": 1.3303, "mean_token_accuracy": 0.7062279358506203, "num_tokens": 2823068.0, "step": 3508 }, { "epoch": 0.9295550847457628, "grad_norm": 1.8055990934371948, "learning_rate": 9.535354872881356e-06, "loss": 1.3451, "mean_token_accuracy": 0.7129062786698341, "num_tokens": 2824444.0, "step": 3510 }, { "epoch": 0.9300847457627118, "grad_norm": 2.1580238342285156, "learning_rate": 9.535090042372881e-06, "loss": 1.5632, "mean_token_accuracy": 0.6708856970071793, "num_tokens": 2825742.0, "step": 3512 }, { "epoch": 0.930614406779661, "grad_norm": 1.752257227897644, "learning_rate": 9.534825211864408e-06, "loss": 1.5855, "mean_token_accuracy": 0.6452027857303619, "num_tokens": 2827232.0, "step": 3514 }, { "epoch": 0.9311440677966102, "grad_norm": 1.639813780784607, "learning_rate": 9.534560381355933e-06, "loss": 1.5068, "mean_token_accuracy": 0.6731506213545799, "num_tokens": 2828926.0, "step": 3516 }, { "epoch": 0.9316737288135594, "grad_norm": 1.3021318912506104, "learning_rate": 9.53429555084746e-06, "loss": 1.0274, "mean_token_accuracy": 0.726204551756382, "num_tokens": 2830438.0, "step": 3518 }, { "epoch": 0.9322033898305084, "grad_norm": 1.603542447090149, "learning_rate": 9.534030720338984e-06, "loss": 1.1397, "mean_token_accuracy": 0.7217450961470604, "num_tokens": 2832088.0, "step": 3520 }, { "epoch": 0.9327330508474576, "grad_norm": 1.5953199863433838, "learning_rate": 9.53376588983051e-06, "loss": 1.357, "mean_token_accuracy": 0.6688167601823807, "num_tokens": 2833636.0, "step": 3522 }, { "epoch": 0.9332627118644068, "grad_norm": 1.8748615980148315, "learning_rate": 9.533501059322034e-06, "loss": 1.5001, "mean_token_accuracy": 0.6661844998598099, "num_tokens": 2835117.0, "step": 3524 }, { "epoch": 0.933792372881356, "grad_norm": 1.4929112195968628, "learning_rate": 9.533236228813561e-06, "loss": 1.5824, "mean_token_accuracy": 0.6406829953193665, "num_tokens": 2836736.0, "step": 3526 }, { "epoch": 0.934322033898305, "grad_norm": 1.8100056648254395, "learning_rate": 9.532971398305086e-06, "loss": 1.2707, "mean_token_accuracy": 0.7117876634001732, "num_tokens": 2838304.0, "step": 3528 }, { "epoch": 0.9348516949152542, "grad_norm": 1.2476798295974731, "learning_rate": 9.53270656779661e-06, "loss": 1.0057, "mean_token_accuracy": 0.7605492919683456, "num_tokens": 2840080.0, "step": 3530 }, { "epoch": 0.9353813559322034, "grad_norm": 2.11609148979187, "learning_rate": 9.532441737288136e-06, "loss": 1.4719, "mean_token_accuracy": 0.6905376054346561, "num_tokens": 2841781.0, "step": 3532 }, { "epoch": 0.9359110169491526, "grad_norm": 2.0741755962371826, "learning_rate": 9.532176906779662e-06, "loss": 1.339, "mean_token_accuracy": 0.7045475915074348, "num_tokens": 2843359.0, "step": 3534 }, { "epoch": 0.9364406779661016, "grad_norm": 1.3120567798614502, "learning_rate": 9.531912076271187e-06, "loss": 1.2182, "mean_token_accuracy": 0.705904982984066, "num_tokens": 2844852.0, "step": 3536 }, { "epoch": 0.9369703389830508, "grad_norm": 1.682435154914856, "learning_rate": 9.531647245762712e-06, "loss": 1.4547, "mean_token_accuracy": 0.6673030108213425, "num_tokens": 2846362.0, "step": 3538 }, { "epoch": 0.9375, "grad_norm": 1.8528592586517334, "learning_rate": 9.531382415254237e-06, "loss": 1.2428, "mean_token_accuracy": 0.7103468999266624, "num_tokens": 2847956.0, "step": 3540 }, { "epoch": 0.9380296610169492, "grad_norm": 1.3412792682647705, "learning_rate": 9.531117584745764e-06, "loss": 1.522, "mean_token_accuracy": 0.6632540971040726, "num_tokens": 2849760.0, "step": 3542 }, { "epoch": 0.9385593220338984, "grad_norm": 1.2335821390151978, "learning_rate": 9.530852754237289e-06, "loss": 0.9654, "mean_token_accuracy": 0.7528667226433754, "num_tokens": 2851452.0, "step": 3544 }, { "epoch": 0.9390889830508474, "grad_norm": 2.097193479537964, "learning_rate": 9.530587923728815e-06, "loss": 1.4469, "mean_token_accuracy": 0.6858246549963951, "num_tokens": 2853312.0, "step": 3546 }, { "epoch": 0.9396186440677966, "grad_norm": 1.2141621112823486, "learning_rate": 9.53032309322034e-06, "loss": 1.3947, "mean_token_accuracy": 0.6842000037431717, "num_tokens": 2855579.0, "step": 3548 }, { "epoch": 0.9401483050847458, "grad_norm": 1.7836921215057373, "learning_rate": 9.530058262711865e-06, "loss": 1.3968, "mean_token_accuracy": 0.7033167481422424, "num_tokens": 2857262.0, "step": 3550 }, { "epoch": 0.940677966101695, "grad_norm": 1.8831565380096436, "learning_rate": 9.52979343220339e-06, "loss": 1.2357, "mean_token_accuracy": 0.7249858528375626, "num_tokens": 2858687.0, "step": 3552 }, { "epoch": 0.941207627118644, "grad_norm": 1.8042360544204712, "learning_rate": 9.529528601694917e-06, "loss": 1.6925, "mean_token_accuracy": 0.6066626310348511, "num_tokens": 2860407.0, "step": 3554 }, { "epoch": 0.9417372881355932, "grad_norm": 1.4946589469909668, "learning_rate": 9.529263771186442e-06, "loss": 1.7233, "mean_token_accuracy": 0.6195038706064224, "num_tokens": 2862115.0, "step": 3556 }, { "epoch": 0.9422669491525424, "grad_norm": 1.3024392127990723, "learning_rate": 9.528998940677967e-06, "loss": 1.0869, "mean_token_accuracy": 0.7465631514787674, "num_tokens": 2864034.0, "step": 3558 }, { "epoch": 0.9427966101694916, "grad_norm": 1.5997081995010376, "learning_rate": 9.528734110169491e-06, "loss": 1.0909, "mean_token_accuracy": 0.7548641562461853, "num_tokens": 2865392.0, "step": 3560 }, { "epoch": 0.9433262711864406, "grad_norm": 1.3083254098892212, "learning_rate": 9.528469279661018e-06, "loss": 1.2868, "mean_token_accuracy": 0.6927285268902779, "num_tokens": 2867189.0, "step": 3562 }, { "epoch": 0.9438559322033898, "grad_norm": 1.4680843353271484, "learning_rate": 9.528204449152543e-06, "loss": 1.5443, "mean_token_accuracy": 0.6825945675373077, "num_tokens": 2868775.0, "step": 3564 }, { "epoch": 0.944385593220339, "grad_norm": 1.640411376953125, "learning_rate": 9.527939618644068e-06, "loss": 1.6243, "mean_token_accuracy": 0.6632528752088547, "num_tokens": 2870317.0, "step": 3566 }, { "epoch": 0.9449152542372882, "grad_norm": 1.734743595123291, "learning_rate": 9.527674788135593e-06, "loss": 1.4698, "mean_token_accuracy": 0.7053859457373619, "num_tokens": 2871886.0, "step": 3568 }, { "epoch": 0.9454449152542372, "grad_norm": 1.908096432685852, "learning_rate": 9.52740995762712e-06, "loss": 1.6967, "mean_token_accuracy": 0.6375660002231598, "num_tokens": 2873306.0, "step": 3570 }, { "epoch": 0.9459745762711864, "grad_norm": 1.3650540113449097, "learning_rate": 9.527145127118644e-06, "loss": 1.2982, "mean_token_accuracy": 0.7014623433351517, "num_tokens": 2874997.0, "step": 3572 }, { "epoch": 0.9465042372881356, "grad_norm": 1.5400469303131104, "learning_rate": 9.526880296610171e-06, "loss": 1.6234, "mean_token_accuracy": 0.6303467974066734, "num_tokens": 2876797.0, "step": 3574 }, { "epoch": 0.9470338983050848, "grad_norm": 1.5595518350601196, "learning_rate": 9.526615466101696e-06, "loss": 1.4864, "mean_token_accuracy": 0.6827305741608143, "num_tokens": 2878399.0, "step": 3576 }, { "epoch": 0.9475635593220338, "grad_norm": 1.532477855682373, "learning_rate": 9.526350635593221e-06, "loss": 1.0028, "mean_token_accuracy": 0.7478703781962395, "num_tokens": 2879855.0, "step": 3578 }, { "epoch": 0.948093220338983, "grad_norm": 1.382163405418396, "learning_rate": 9.526085805084746e-06, "loss": 1.4654, "mean_token_accuracy": 0.6722274869680405, "num_tokens": 2881338.0, "step": 3580 }, { "epoch": 0.9486228813559322, "grad_norm": 1.7172939777374268, "learning_rate": 9.525820974576272e-06, "loss": 1.0635, "mean_token_accuracy": 0.7399561479687691, "num_tokens": 2882738.0, "step": 3582 }, { "epoch": 0.9491525423728814, "grad_norm": 1.4875330924987793, "learning_rate": 9.525556144067797e-06, "loss": 1.2044, "mean_token_accuracy": 0.7007508352398872, "num_tokens": 2884446.0, "step": 3584 }, { "epoch": 0.9496822033898306, "grad_norm": 1.5642521381378174, "learning_rate": 9.525291313559322e-06, "loss": 1.4379, "mean_token_accuracy": 0.6924026273190975, "num_tokens": 2886178.0, "step": 3586 }, { "epoch": 0.9502118644067796, "grad_norm": 1.6849024295806885, "learning_rate": 9.525026483050849e-06, "loss": 1.4798, "mean_token_accuracy": 0.6658198684453964, "num_tokens": 2888065.0, "step": 3588 }, { "epoch": 0.9507415254237288, "grad_norm": 1.674077033996582, "learning_rate": 9.524761652542374e-06, "loss": 1.5752, "mean_token_accuracy": 0.6478954926133156, "num_tokens": 2889850.0, "step": 3590 }, { "epoch": 0.951271186440678, "grad_norm": 1.5691992044448853, "learning_rate": 9.524496822033899e-06, "loss": 1.5773, "mean_token_accuracy": 0.6668622046709061, "num_tokens": 2891242.0, "step": 3592 }, { "epoch": 0.9518008474576272, "grad_norm": 1.7602267265319824, "learning_rate": 9.524231991525424e-06, "loss": 1.8277, "mean_token_accuracy": 0.6093424446880817, "num_tokens": 2893125.0, "step": 3594 }, { "epoch": 0.9523305084745762, "grad_norm": 1.6333104372024536, "learning_rate": 9.52396716101695e-06, "loss": 1.1158, "mean_token_accuracy": 0.7389829754829407, "num_tokens": 2894878.0, "step": 3596 }, { "epoch": 0.9528601694915254, "grad_norm": 1.9001187086105347, "learning_rate": 9.523702330508475e-06, "loss": 1.3943, "mean_token_accuracy": 0.6832478791475296, "num_tokens": 2896256.0, "step": 3598 }, { "epoch": 0.9533898305084746, "grad_norm": 1.4704967737197876, "learning_rate": 9.523437500000002e-06, "loss": 1.4627, "mean_token_accuracy": 0.6590011119842529, "num_tokens": 2897978.0, "step": 3600 }, { "epoch": 0.9539194915254238, "grad_norm": 1.503211259841919, "learning_rate": 9.523172669491527e-06, "loss": 1.4223, "mean_token_accuracy": 0.7028404399752617, "num_tokens": 2899496.0, "step": 3602 }, { "epoch": 0.9544491525423728, "grad_norm": 1.5647319555282593, "learning_rate": 9.522907838983052e-06, "loss": 1.1734, "mean_token_accuracy": 0.7331338003277779, "num_tokens": 2901202.0, "step": 3604 }, { "epoch": 0.954978813559322, "grad_norm": 1.3859071731567383, "learning_rate": 9.522643008474577e-06, "loss": 0.8441, "mean_token_accuracy": 0.7857811376452446, "num_tokens": 2902671.0, "step": 3606 }, { "epoch": 0.9555084745762712, "grad_norm": 1.8180345296859741, "learning_rate": 9.522378177966103e-06, "loss": 1.722, "mean_token_accuracy": 0.635855607688427, "num_tokens": 2904539.0, "step": 3608 }, { "epoch": 0.9560381355932204, "grad_norm": 1.6281416416168213, "learning_rate": 9.522113347457628e-06, "loss": 1.0142, "mean_token_accuracy": 0.7587625607848167, "num_tokens": 2905924.0, "step": 3610 }, { "epoch": 0.9565677966101694, "grad_norm": 1.959832787513733, "learning_rate": 9.521848516949153e-06, "loss": 1.5161, "mean_token_accuracy": 0.6592437736690044, "num_tokens": 2907435.0, "step": 3612 }, { "epoch": 0.9570974576271186, "grad_norm": 1.6303297281265259, "learning_rate": 9.521583686440678e-06, "loss": 1.5835, "mean_token_accuracy": 0.6577755957841873, "num_tokens": 2908982.0, "step": 3614 }, { "epoch": 0.9576271186440678, "grad_norm": 1.5778131484985352, "learning_rate": 9.521318855932205e-06, "loss": 1.252, "mean_token_accuracy": 0.7240225300192833, "num_tokens": 2910789.0, "step": 3616 }, { "epoch": 0.958156779661017, "grad_norm": 2.2768213748931885, "learning_rate": 9.52105402542373e-06, "loss": 1.6934, "mean_token_accuracy": 0.6546697355806828, "num_tokens": 2912302.0, "step": 3618 }, { "epoch": 0.9586864406779662, "grad_norm": 1.4389981031417847, "learning_rate": 9.520789194915255e-06, "loss": 1.0947, "mean_token_accuracy": 0.7437828704714775, "num_tokens": 2913928.0, "step": 3620 }, { "epoch": 0.9592161016949152, "grad_norm": 1.190439224243164, "learning_rate": 9.52052436440678e-06, "loss": 0.8628, "mean_token_accuracy": 0.7889687195420265, "num_tokens": 2915687.0, "step": 3622 }, { "epoch": 0.9597457627118644, "grad_norm": 1.813537836074829, "learning_rate": 9.520259533898306e-06, "loss": 1.3167, "mean_token_accuracy": 0.6965615190565586, "num_tokens": 2917567.0, "step": 3624 }, { "epoch": 0.9602754237288136, "grad_norm": 1.906558632850647, "learning_rate": 9.519994703389831e-06, "loss": 1.4812, "mean_token_accuracy": 0.6746947541832924, "num_tokens": 2919030.0, "step": 3626 }, { "epoch": 0.9608050847457628, "grad_norm": 1.8161319494247437, "learning_rate": 9.519729872881358e-06, "loss": 1.7905, "mean_token_accuracy": 0.6172988414764404, "num_tokens": 2920532.0, "step": 3628 }, { "epoch": 0.9613347457627118, "grad_norm": 1.6557483673095703, "learning_rate": 9.519465042372883e-06, "loss": 1.4219, "mean_token_accuracy": 0.690507311373949, "num_tokens": 2922302.0, "step": 3630 }, { "epoch": 0.961864406779661, "grad_norm": 1.8459930419921875, "learning_rate": 9.519200211864408e-06, "loss": 1.5552, "mean_token_accuracy": 0.6704058572649956, "num_tokens": 2923634.0, "step": 3632 }, { "epoch": 0.9623940677966102, "grad_norm": 1.4942585229873657, "learning_rate": 9.518935381355933e-06, "loss": 1.6471, "mean_token_accuracy": 0.6485504023730755, "num_tokens": 2925342.0, "step": 3634 }, { "epoch": 0.9629237288135594, "grad_norm": 1.5729625225067139, "learning_rate": 9.518670550847459e-06, "loss": 1.3756, "mean_token_accuracy": 0.6884674429893494, "num_tokens": 2927053.0, "step": 3636 }, { "epoch": 0.9634533898305084, "grad_norm": 1.2795155048370361, "learning_rate": 9.518405720338984e-06, "loss": 1.4896, "mean_token_accuracy": 0.6580408737063408, "num_tokens": 2928996.0, "step": 3638 }, { "epoch": 0.9639830508474576, "grad_norm": 1.6599496603012085, "learning_rate": 9.518140889830509e-06, "loss": 1.7409, "mean_token_accuracy": 0.6043561734259129, "num_tokens": 2930687.0, "step": 3640 }, { "epoch": 0.9645127118644068, "grad_norm": 1.5459825992584229, "learning_rate": 9.517876059322034e-06, "loss": 1.0068, "mean_token_accuracy": 0.7531255707144737, "num_tokens": 2931945.0, "step": 3642 }, { "epoch": 0.965042372881356, "grad_norm": 1.5028390884399414, "learning_rate": 9.51761122881356e-06, "loss": 1.3141, "mean_token_accuracy": 0.7099672257900238, "num_tokens": 2933592.0, "step": 3644 }, { "epoch": 0.965572033898305, "grad_norm": 1.5895966291427612, "learning_rate": 9.517346398305085e-06, "loss": 1.3346, "mean_token_accuracy": 0.702014297246933, "num_tokens": 2935188.0, "step": 3646 }, { "epoch": 0.9661016949152542, "grad_norm": 1.492346167564392, "learning_rate": 9.51708156779661e-06, "loss": 1.4426, "mean_token_accuracy": 0.6837318986654282, "num_tokens": 2936805.0, "step": 3648 }, { "epoch": 0.9666313559322034, "grad_norm": 1.5650500059127808, "learning_rate": 9.516816737288135e-06, "loss": 1.3091, "mean_token_accuracy": 0.7153496891260147, "num_tokens": 2938322.0, "step": 3650 }, { "epoch": 0.9671610169491526, "grad_norm": 1.8406895399093628, "learning_rate": 9.516551906779662e-06, "loss": 1.4005, "mean_token_accuracy": 0.6890442185103893, "num_tokens": 2939707.0, "step": 3652 }, { "epoch": 0.9676906779661016, "grad_norm": 1.532045602798462, "learning_rate": 9.516287076271187e-06, "loss": 1.2443, "mean_token_accuracy": 0.6988905519247055, "num_tokens": 2941312.0, "step": 3654 }, { "epoch": 0.9682203389830508, "grad_norm": 1.2850844860076904, "learning_rate": 9.516022245762714e-06, "loss": 1.1385, "mean_token_accuracy": 0.7134354636073112, "num_tokens": 2943256.0, "step": 3656 }, { "epoch": 0.96875, "grad_norm": 2.000476121902466, "learning_rate": 9.515757415254237e-06, "loss": 1.4989, "mean_token_accuracy": 0.6714408025145531, "num_tokens": 2944694.0, "step": 3658 }, { "epoch": 0.9692796610169492, "grad_norm": 1.498946189880371, "learning_rate": 9.515492584745763e-06, "loss": 1.0049, "mean_token_accuracy": 0.7532336637377739, "num_tokens": 2946191.0, "step": 3660 }, { "epoch": 0.9698093220338984, "grad_norm": 1.9477578401565552, "learning_rate": 9.515227754237288e-06, "loss": 1.4147, "mean_token_accuracy": 0.6892632618546486, "num_tokens": 2947695.0, "step": 3662 }, { "epoch": 0.9703389830508474, "grad_norm": 1.7177143096923828, "learning_rate": 9.514962923728815e-06, "loss": 1.2516, "mean_token_accuracy": 0.7142509371042252, "num_tokens": 2949386.0, "step": 3664 }, { "epoch": 0.9708686440677966, "grad_norm": 1.2644398212432861, "learning_rate": 9.51469809322034e-06, "loss": 1.4674, "mean_token_accuracy": 0.6681729443371296, "num_tokens": 2950953.0, "step": 3666 }, { "epoch": 0.9713983050847458, "grad_norm": 1.6372190713882446, "learning_rate": 9.514433262711865e-06, "loss": 1.1665, "mean_token_accuracy": 0.7451206520199776, "num_tokens": 2952354.0, "step": 3668 }, { "epoch": 0.971927966101695, "grad_norm": 1.3489996194839478, "learning_rate": 9.514168432203391e-06, "loss": 0.9249, "mean_token_accuracy": 0.7723268046975136, "num_tokens": 2953887.0, "step": 3670 }, { "epoch": 0.972457627118644, "grad_norm": 1.782902717590332, "learning_rate": 9.513903601694916e-06, "loss": 1.4121, "mean_token_accuracy": 0.6840807721018791, "num_tokens": 2955390.0, "step": 3672 }, { "epoch": 0.9729872881355932, "grad_norm": 1.5811532735824585, "learning_rate": 9.513638771186441e-06, "loss": 1.3373, "mean_token_accuracy": 0.7072909846901894, "num_tokens": 2956966.0, "step": 3674 }, { "epoch": 0.9735169491525424, "grad_norm": 1.6461187601089478, "learning_rate": 9.513373940677966e-06, "loss": 1.7621, "mean_token_accuracy": 0.6329689025878906, "num_tokens": 2958625.0, "step": 3676 }, { "epoch": 0.9740466101694916, "grad_norm": 1.8013406991958618, "learning_rate": 9.513109110169493e-06, "loss": 1.339, "mean_token_accuracy": 0.7165720835328102, "num_tokens": 2960087.0, "step": 3678 }, { "epoch": 0.9745762711864406, "grad_norm": 1.69857656955719, "learning_rate": 9.512844279661018e-06, "loss": 1.2186, "mean_token_accuracy": 0.709051102399826, "num_tokens": 2961570.0, "step": 3680 }, { "epoch": 0.9751059322033898, "grad_norm": 1.8114519119262695, "learning_rate": 9.512579449152544e-06, "loss": 1.6656, "mean_token_accuracy": 0.6565104238688946, "num_tokens": 2963130.0, "step": 3682 }, { "epoch": 0.975635593220339, "grad_norm": 1.2429953813552856, "learning_rate": 9.51231461864407e-06, "loss": 1.0639, "mean_token_accuracy": 0.7411546632647514, "num_tokens": 2964771.0, "step": 3684 }, { "epoch": 0.9761652542372882, "grad_norm": 1.2083678245544434, "learning_rate": 9.512049788135594e-06, "loss": 0.8428, "mean_token_accuracy": 0.7913098037242889, "num_tokens": 2966127.0, "step": 3686 }, { "epoch": 0.9766949152542372, "grad_norm": 1.7384189367294312, "learning_rate": 9.511784957627119e-06, "loss": 1.5653, "mean_token_accuracy": 0.646018847823143, "num_tokens": 2967685.0, "step": 3688 }, { "epoch": 0.9772245762711864, "grad_norm": 2.1023240089416504, "learning_rate": 9.511520127118646e-06, "loss": 1.3887, "mean_token_accuracy": 0.6740885972976685, "num_tokens": 2969457.0, "step": 3690 }, { "epoch": 0.9777542372881356, "grad_norm": 1.6207866668701172, "learning_rate": 9.51125529661017e-06, "loss": 1.2521, "mean_token_accuracy": 0.7339169904589653, "num_tokens": 2970951.0, "step": 3692 }, { "epoch": 0.9782838983050848, "grad_norm": 1.7274166345596313, "learning_rate": 9.510990466101696e-06, "loss": 1.3219, "mean_token_accuracy": 0.6944757401943207, "num_tokens": 2972756.0, "step": 3694 }, { "epoch": 0.9788135593220338, "grad_norm": 1.2499287128448486, "learning_rate": 9.51072563559322e-06, "loss": 0.6999, "mean_token_accuracy": 0.8013168275356293, "num_tokens": 2974379.0, "step": 3696 }, { "epoch": 0.979343220338983, "grad_norm": 1.4317625761032104, "learning_rate": 9.510460805084747e-06, "loss": 1.2421, "mean_token_accuracy": 0.6923380941152573, "num_tokens": 2975815.0, "step": 3698 }, { "epoch": 0.9798728813559322, "grad_norm": 1.4348373413085938, "learning_rate": 9.510195974576272e-06, "loss": 1.3593, "mean_token_accuracy": 0.7131725922226906, "num_tokens": 2977561.0, "step": 3700 }, { "epoch": 0.9804025423728814, "grad_norm": 2.014824390411377, "learning_rate": 9.509931144067797e-06, "loss": 1.6836, "mean_token_accuracy": 0.6249290779232979, "num_tokens": 2979208.0, "step": 3702 }, { "epoch": 0.9809322033898306, "grad_norm": 1.3830666542053223, "learning_rate": 9.509666313559322e-06, "loss": 1.0223, "mean_token_accuracy": 0.749916099011898, "num_tokens": 2980772.0, "step": 3704 }, { "epoch": 0.9814618644067796, "grad_norm": 1.8148622512817383, "learning_rate": 9.509401483050849e-06, "loss": 1.2533, "mean_token_accuracy": 0.707101084291935, "num_tokens": 2982128.0, "step": 3706 }, { "epoch": 0.9819915254237288, "grad_norm": 2.234994649887085, "learning_rate": 9.509136652542374e-06, "loss": 1.2981, "mean_token_accuracy": 0.6981774531304836, "num_tokens": 2983403.0, "step": 3708 }, { "epoch": 0.982521186440678, "grad_norm": 1.5731046199798584, "learning_rate": 9.5088718220339e-06, "loss": 1.0368, "mean_token_accuracy": 0.7461694628000259, "num_tokens": 2985039.0, "step": 3710 }, { "epoch": 0.9830508474576272, "grad_norm": 1.7524369955062866, "learning_rate": 9.508606991525423e-06, "loss": 1.5755, "mean_token_accuracy": 0.66095931828022, "num_tokens": 2986686.0, "step": 3712 }, { "epoch": 0.9835805084745762, "grad_norm": 1.6902642250061035, "learning_rate": 9.50834216101695e-06, "loss": 0.9431, "mean_token_accuracy": 0.764924943447113, "num_tokens": 2988219.0, "step": 3714 }, { "epoch": 0.9841101694915254, "grad_norm": 1.9474045038223267, "learning_rate": 9.508077330508475e-06, "loss": 1.5438, "mean_token_accuracy": 0.6901370026171207, "num_tokens": 2989597.0, "step": 3716 }, { "epoch": 0.9846398305084746, "grad_norm": 1.5230298042297363, "learning_rate": 9.507812500000002e-06, "loss": 1.2524, "mean_token_accuracy": 0.716034322977066, "num_tokens": 2991321.0, "step": 3718 }, { "epoch": 0.9851694915254238, "grad_norm": 1.5702641010284424, "learning_rate": 9.507547669491526e-06, "loss": 1.4025, "mean_token_accuracy": 0.6935591399669647, "num_tokens": 2993174.0, "step": 3720 }, { "epoch": 0.9856991525423728, "grad_norm": 1.3530933856964111, "learning_rate": 9.507282838983051e-06, "loss": 1.0888, "mean_token_accuracy": 0.7383381798863411, "num_tokens": 2994562.0, "step": 3722 }, { "epoch": 0.986228813559322, "grad_norm": 1.8895716667175293, "learning_rate": 9.507018008474576e-06, "loss": 1.1685, "mean_token_accuracy": 0.7128634229302406, "num_tokens": 2995990.0, "step": 3724 }, { "epoch": 0.9867584745762712, "grad_norm": 1.3112977743148804, "learning_rate": 9.506753177966103e-06, "loss": 1.0914, "mean_token_accuracy": 0.7212493345141411, "num_tokens": 2997383.0, "step": 3726 }, { "epoch": 0.9872881355932204, "grad_norm": 1.4287209510803223, "learning_rate": 9.506488347457628e-06, "loss": 1.1444, "mean_token_accuracy": 0.7577127143740654, "num_tokens": 2998705.0, "step": 3728 }, { "epoch": 0.9878177966101694, "grad_norm": 1.3132381439208984, "learning_rate": 9.506223516949153e-06, "loss": 1.3715, "mean_token_accuracy": 0.6986205577850342, "num_tokens": 3001113.0, "step": 3730 }, { "epoch": 0.9883474576271186, "grad_norm": 1.9566789865493774, "learning_rate": 9.505958686440678e-06, "loss": 1.5878, "mean_token_accuracy": 0.6410089172422886, "num_tokens": 3002365.0, "step": 3732 }, { "epoch": 0.9888771186440678, "grad_norm": 1.100281000137329, "learning_rate": 9.505693855932204e-06, "loss": 1.0844, "mean_token_accuracy": 0.7173058837652206, "num_tokens": 3004219.0, "step": 3734 }, { "epoch": 0.989406779661017, "grad_norm": 1.8038578033447266, "learning_rate": 9.50542902542373e-06, "loss": 1.5861, "mean_token_accuracy": 0.6531511321663857, "num_tokens": 3005772.0, "step": 3736 }, { "epoch": 0.9899364406779662, "grad_norm": 1.6451815366744995, "learning_rate": 9.505164194915256e-06, "loss": 1.2984, "mean_token_accuracy": 0.7059384733438492, "num_tokens": 3007222.0, "step": 3738 }, { "epoch": 0.9904661016949152, "grad_norm": 1.7028710842132568, "learning_rate": 9.50489936440678e-06, "loss": 1.5813, "mean_token_accuracy": 0.6595760397613049, "num_tokens": 3008812.0, "step": 3740 }, { "epoch": 0.9909957627118644, "grad_norm": 1.2276759147644043, "learning_rate": 9.504634533898306e-06, "loss": 0.8468, "mean_token_accuracy": 0.8007526397705078, "num_tokens": 3010317.0, "step": 3742 }, { "epoch": 0.9915254237288136, "grad_norm": 1.6192158460617065, "learning_rate": 9.50436970338983e-06, "loss": 1.1514, "mean_token_accuracy": 0.7206679061055183, "num_tokens": 3012046.0, "step": 3744 }, { "epoch": 0.9920550847457628, "grad_norm": 1.6139551401138306, "learning_rate": 9.504104872881357e-06, "loss": 1.5735, "mean_token_accuracy": 0.6660054698586464, "num_tokens": 3013845.0, "step": 3746 }, { "epoch": 0.9925847457627118, "grad_norm": 1.4567586183547974, "learning_rate": 9.503840042372882e-06, "loss": 1.3465, "mean_token_accuracy": 0.7049882337450981, "num_tokens": 3015725.0, "step": 3748 }, { "epoch": 0.993114406779661, "grad_norm": 1.7321308851242065, "learning_rate": 9.503575211864407e-06, "loss": 1.31, "step": 3750 }, { "epoch": 0.993114406779661, "eval_loss": 1.3248564004898071, "eval_mean_token_accuracy": 0.6980878209525888, "eval_num_tokens": 3017246.0, "eval_runtime": 48.0726, "eval_samples_per_second": 6.407, "eval_steps_per_second": 6.407, "step": 3750 }, { "epoch": 0.9936440677966102, "grad_norm": 1.5497561693191528, "learning_rate": 9.503310381355934e-06, "loss": 1.1166, "mean_token_accuracy": 0.718545027077198, "num_tokens": 3019177.0, "step": 3752 }, { "epoch": 0.9941737288135594, "grad_norm": 1.5596483945846558, "learning_rate": 9.503045550847459e-06, "loss": 1.4143, "mean_token_accuracy": 0.6757076233625412, "num_tokens": 3020856.0, "step": 3754 }, { "epoch": 0.9947033898305084, "grad_norm": 1.635516881942749, "learning_rate": 9.502780720338984e-06, "loss": 1.13, "mean_token_accuracy": 0.7270998656749725, "num_tokens": 3022229.0, "step": 3756 }, { "epoch": 0.9952330508474576, "grad_norm": 1.4596710205078125, "learning_rate": 9.502515889830509e-06, "loss": 1.344, "mean_token_accuracy": 0.6861610822379589, "num_tokens": 3023795.0, "step": 3758 }, { "epoch": 0.9957627118644068, "grad_norm": 1.4447991847991943, "learning_rate": 9.502251059322035e-06, "loss": 0.9975, "mean_token_accuracy": 0.7393147125840187, "num_tokens": 3025557.0, "step": 3760 }, { "epoch": 0.996292372881356, "grad_norm": 1.665193796157837, "learning_rate": 9.50198622881356e-06, "loss": 1.5462, "mean_token_accuracy": 0.6557179316878319, "num_tokens": 3027003.0, "step": 3762 }, { "epoch": 0.996822033898305, "grad_norm": 1.4386941194534302, "learning_rate": 9.501721398305087e-06, "loss": 1.5382, "mean_token_accuracy": 0.678303636610508, "num_tokens": 3028543.0, "step": 3764 }, { "epoch": 0.9973516949152542, "grad_norm": 1.7746102809906006, "learning_rate": 9.50145656779661e-06, "loss": 1.621, "mean_token_accuracy": 0.6493589356541634, "num_tokens": 3030232.0, "step": 3766 }, { "epoch": 0.9978813559322034, "grad_norm": 1.4380916357040405, "learning_rate": 9.501191737288137e-06, "loss": 1.5628, "mean_token_accuracy": 0.6748564392328262, "num_tokens": 3031630.0, "step": 3768 }, { "epoch": 0.9984110169491526, "grad_norm": 1.4367468357086182, "learning_rate": 9.500926906779662e-06, "loss": 1.1321, "mean_token_accuracy": 0.7367370873689651, "num_tokens": 3033337.0, "step": 3770 }, { "epoch": 0.9989406779661016, "grad_norm": 1.5807374715805054, "learning_rate": 9.500662076271188e-06, "loss": 1.6243, "mean_token_accuracy": 0.6529665738344193, "num_tokens": 3034870.0, "step": 3772 }, { "epoch": 0.9994703389830508, "grad_norm": 1.4224226474761963, "learning_rate": 9.500397245762713e-06, "loss": 1.2351, "mean_token_accuracy": 0.7132977358996868, "num_tokens": 3036481.0, "step": 3774 }, { "epoch": 1.0, "grad_norm": 1.760259985923767, "learning_rate": 9.500132415254238e-06, "loss": 1.5329, "mean_token_accuracy": 0.6656326651573181, "num_tokens": 3038268.0, "step": 3776 }, { "epoch": 1.0005296610169492, "grad_norm": 1.8313130140304565, "learning_rate": 9.499867584745763e-06, "loss": 1.4145, "mean_token_accuracy": 0.7032082453370094, "num_tokens": 3039859.0, "step": 3778 }, { "epoch": 1.0010593220338984, "grad_norm": 1.5212448835372925, "learning_rate": 9.49960275423729e-06, "loss": 1.3361, "mean_token_accuracy": 0.7355735525488853, "num_tokens": 3041479.0, "step": 3780 }, { "epoch": 1.0015889830508475, "grad_norm": 1.5437829494476318, "learning_rate": 9.499337923728815e-06, "loss": 1.3451, "mean_token_accuracy": 0.6793091297149658, "num_tokens": 3043245.0, "step": 3782 }, { "epoch": 1.0021186440677967, "grad_norm": 1.4973931312561035, "learning_rate": 9.49907309322034e-06, "loss": 1.4413, "mean_token_accuracy": 0.6733281537890434, "num_tokens": 3045064.0, "step": 3784 }, { "epoch": 1.0026483050847457, "grad_norm": 1.897584319114685, "learning_rate": 9.498808262711864e-06, "loss": 1.1165, "mean_token_accuracy": 0.7541037201881409, "num_tokens": 3046529.0, "step": 3786 }, { "epoch": 1.0031779661016949, "grad_norm": 1.4419913291931152, "learning_rate": 9.498543432203391e-06, "loss": 1.3996, "mean_token_accuracy": 0.7018345929682255, "num_tokens": 3048090.0, "step": 3788 }, { "epoch": 1.003707627118644, "grad_norm": 1.8622119426727295, "learning_rate": 9.498278601694916e-06, "loss": 1.5181, "mean_token_accuracy": 0.676289476454258, "num_tokens": 3049494.0, "step": 3790 }, { "epoch": 1.0042372881355932, "grad_norm": 1.5447108745574951, "learning_rate": 9.498013771186443e-06, "loss": 1.533, "mean_token_accuracy": 0.6583634689450264, "num_tokens": 3051115.0, "step": 3792 }, { "epoch": 1.0047669491525424, "grad_norm": 1.8489335775375366, "learning_rate": 9.497748940677966e-06, "loss": 1.4136, "mean_token_accuracy": 0.6924081221222878, "num_tokens": 3052466.0, "step": 3794 }, { "epoch": 1.0052966101694916, "grad_norm": 1.9992815256118774, "learning_rate": 9.497484110169492e-06, "loss": 1.285, "mean_token_accuracy": 0.712056003510952, "num_tokens": 3054129.0, "step": 3796 }, { "epoch": 1.0058262711864407, "grad_norm": 1.4336333274841309, "learning_rate": 9.497219279661017e-06, "loss": 1.2099, "mean_token_accuracy": 0.7092271372675896, "num_tokens": 3055656.0, "step": 3798 }, { "epoch": 1.00635593220339, "grad_norm": 1.7360819578170776, "learning_rate": 9.496954449152544e-06, "loss": 1.4301, "mean_token_accuracy": 0.6854534335434437, "num_tokens": 3057414.0, "step": 3800 }, { "epoch": 1.0068855932203389, "grad_norm": 1.6663728952407837, "learning_rate": 9.496689618644069e-06, "loss": 1.2252, "mean_token_accuracy": 0.7168242633342743, "num_tokens": 3058950.0, "step": 3802 }, { "epoch": 1.007415254237288, "grad_norm": 1.3218437433242798, "learning_rate": 9.496424788135594e-06, "loss": 0.7546, "mean_token_accuracy": 0.8018913194537163, "num_tokens": 3060516.0, "step": 3804 }, { "epoch": 1.0079449152542372, "grad_norm": 1.6019712686538696, "learning_rate": 9.496159957627119e-06, "loss": 0.9832, "mean_token_accuracy": 0.7669478058815002, "num_tokens": 3062168.0, "step": 3806 }, { "epoch": 1.0084745762711864, "grad_norm": 2.0126848220825195, "learning_rate": 9.495895127118645e-06, "loss": 1.4728, "mean_token_accuracy": 0.6653342247009277, "num_tokens": 3063596.0, "step": 3808 }, { "epoch": 1.0090042372881356, "grad_norm": 1.752208948135376, "learning_rate": 9.49563029661017e-06, "loss": 1.6187, "mean_token_accuracy": 0.6337307989597321, "num_tokens": 3065001.0, "step": 3810 }, { "epoch": 1.0095338983050848, "grad_norm": 1.3711472749710083, "learning_rate": 9.495365466101695e-06, "loss": 1.217, "mean_token_accuracy": 0.70815858989954, "num_tokens": 3066526.0, "step": 3812 }, { "epoch": 1.010063559322034, "grad_norm": 1.6252857446670532, "learning_rate": 9.49510063559322e-06, "loss": 1.4159, "mean_token_accuracy": 0.6750357709825039, "num_tokens": 3068133.0, "step": 3814 }, { "epoch": 1.0105932203389831, "grad_norm": 1.7089083194732666, "learning_rate": 9.494835805084747e-06, "loss": 1.3482, "mean_token_accuracy": 0.7041758745908737, "num_tokens": 3069755.0, "step": 3816 }, { "epoch": 1.0111228813559323, "grad_norm": 1.8619943857192993, "learning_rate": 9.494570974576272e-06, "loss": 1.2593, "mean_token_accuracy": 0.7135278955101967, "num_tokens": 3071132.0, "step": 3818 }, { "epoch": 1.0116525423728813, "grad_norm": 1.272659420967102, "learning_rate": 9.494306144067797e-06, "loss": 1.1501, "mean_token_accuracy": 0.7411899715662003, "num_tokens": 3072630.0, "step": 3820 }, { "epoch": 1.0121822033898304, "grad_norm": 1.372766137123108, "learning_rate": 9.494041313559322e-06, "loss": 1.6345, "mean_token_accuracy": 0.6383141428232193, "num_tokens": 3074286.0, "step": 3822 }, { "epoch": 1.0127118644067796, "grad_norm": 1.9113290309906006, "learning_rate": 9.493776483050848e-06, "loss": 1.7313, "mean_token_accuracy": 0.6134686842560768, "num_tokens": 3075935.0, "step": 3824 }, { "epoch": 1.0132415254237288, "grad_norm": 1.191456913948059, "learning_rate": 9.493511652542373e-06, "loss": 1.3488, "mean_token_accuracy": 0.6971732303500175, "num_tokens": 3077748.0, "step": 3826 }, { "epoch": 1.013771186440678, "grad_norm": 1.4083068370819092, "learning_rate": 9.4932468220339e-06, "loss": 0.9335, "mean_token_accuracy": 0.7707453444600105, "num_tokens": 3079060.0, "step": 3828 }, { "epoch": 1.0143008474576272, "grad_norm": 1.3150707483291626, "learning_rate": 9.492981991525425e-06, "loss": 1.2561, "mean_token_accuracy": 0.7088834494352341, "num_tokens": 3080869.0, "step": 3830 }, { "epoch": 1.0148305084745763, "grad_norm": 1.5377522706985474, "learning_rate": 9.49271716101695e-06, "loss": 0.6986, "mean_token_accuracy": 0.8219146728515625, "num_tokens": 3082325.0, "step": 3832 }, { "epoch": 1.0153601694915255, "grad_norm": 1.313145637512207, "learning_rate": 9.492452330508475e-06, "loss": 0.948, "mean_token_accuracy": 0.7601459249854088, "num_tokens": 3083993.0, "step": 3834 }, { "epoch": 1.0158898305084745, "grad_norm": 1.4987128973007202, "learning_rate": 9.492187500000001e-06, "loss": 1.3582, "mean_token_accuracy": 0.6972881928086281, "num_tokens": 3085483.0, "step": 3836 }, { "epoch": 1.0164194915254237, "grad_norm": 1.7573097944259644, "learning_rate": 9.491922669491526e-06, "loss": 1.3136, "mean_token_accuracy": 0.7185279317200184, "num_tokens": 3087007.0, "step": 3838 }, { "epoch": 1.0169491525423728, "grad_norm": 1.5840421915054321, "learning_rate": 9.491657838983051e-06, "loss": 1.3472, "mean_token_accuracy": 0.6967124119400978, "num_tokens": 3088589.0, "step": 3840 }, { "epoch": 1.017478813559322, "grad_norm": 1.220716953277588, "learning_rate": 9.491393008474578e-06, "loss": 1.048, "mean_token_accuracy": 0.7494026273488998, "num_tokens": 3090168.0, "step": 3842 }, { "epoch": 1.0180084745762712, "grad_norm": 1.169020175933838, "learning_rate": 9.491128177966103e-06, "loss": 1.3235, "mean_token_accuracy": 0.6994547620415688, "num_tokens": 3091709.0, "step": 3844 }, { "epoch": 1.0185381355932204, "grad_norm": 1.2098641395568848, "learning_rate": 9.49086334745763e-06, "loss": 1.1585, "mean_token_accuracy": 0.742064006626606, "num_tokens": 3093248.0, "step": 3846 }, { "epoch": 1.0190677966101696, "grad_norm": 1.8147634267807007, "learning_rate": 9.490598516949152e-06, "loss": 1.0195, "mean_token_accuracy": 0.7705966308712959, "num_tokens": 3094870.0, "step": 3848 }, { "epoch": 1.0195974576271187, "grad_norm": 1.4781203269958496, "learning_rate": 9.490333686440679e-06, "loss": 0.961, "mean_token_accuracy": 0.7754111289978027, "num_tokens": 3096585.0, "step": 3850 }, { "epoch": 1.0201271186440677, "grad_norm": 1.620134711265564, "learning_rate": 9.490068855932204e-06, "loss": 1.0453, "mean_token_accuracy": 0.7581941708922386, "num_tokens": 3097995.0, "step": 3852 }, { "epoch": 1.0206567796610169, "grad_norm": 1.7703897953033447, "learning_rate": 9.48980402542373e-06, "loss": 1.545, "mean_token_accuracy": 0.6871042363345623, "num_tokens": 3099593.0, "step": 3854 }, { "epoch": 1.021186440677966, "grad_norm": 1.289100170135498, "learning_rate": 9.489539194915256e-06, "loss": 1.1377, "mean_token_accuracy": 0.7148842439055443, "num_tokens": 3101700.0, "step": 3856 }, { "epoch": 1.0217161016949152, "grad_norm": 1.5143505334854126, "learning_rate": 9.48927436440678e-06, "loss": 1.0515, "mean_token_accuracy": 0.7236050888895988, "num_tokens": 3103208.0, "step": 3858 }, { "epoch": 1.0222457627118644, "grad_norm": 1.832888126373291, "learning_rate": 9.489009533898305e-06, "loss": 1.5461, "mean_token_accuracy": 0.6615285724401474, "num_tokens": 3104757.0, "step": 3860 }, { "epoch": 1.0227754237288136, "grad_norm": 1.709850788116455, "learning_rate": 9.488744703389832e-06, "loss": 1.3605, "mean_token_accuracy": 0.7025755569338799, "num_tokens": 3106399.0, "step": 3862 }, { "epoch": 1.0233050847457628, "grad_norm": 1.7602105140686035, "learning_rate": 9.488479872881357e-06, "loss": 1.338, "mean_token_accuracy": 0.7098105773329735, "num_tokens": 3107948.0, "step": 3864 }, { "epoch": 1.023834745762712, "grad_norm": 1.9495197534561157, "learning_rate": 9.488215042372882e-06, "loss": 1.6479, "mean_token_accuracy": 0.6288354992866516, "num_tokens": 3109579.0, "step": 3866 }, { "epoch": 1.0243644067796611, "grad_norm": 1.778562068939209, "learning_rate": 9.487950211864407e-06, "loss": 1.6047, "mean_token_accuracy": 0.6516287177801132, "num_tokens": 3111147.0, "step": 3868 }, { "epoch": 1.02489406779661, "grad_norm": 1.6978648900985718, "learning_rate": 9.487685381355933e-06, "loss": 1.2971, "mean_token_accuracy": 0.6905332282185555, "num_tokens": 3112511.0, "step": 3870 }, { "epoch": 1.0254237288135593, "grad_norm": 1.4875901937484741, "learning_rate": 9.487420550847458e-06, "loss": 0.9364, "mean_token_accuracy": 0.7672369703650475, "num_tokens": 3113980.0, "step": 3872 }, { "epoch": 1.0259533898305084, "grad_norm": 1.849692702293396, "learning_rate": 9.487155720338983e-06, "loss": 1.564, "mean_token_accuracy": 0.6771068200469017, "num_tokens": 3115376.0, "step": 3874 }, { "epoch": 1.0264830508474576, "grad_norm": 1.7818135023117065, "learning_rate": 9.486890889830508e-06, "loss": 1.4451, "mean_token_accuracy": 0.6906607076525688, "num_tokens": 3116825.0, "step": 3876 }, { "epoch": 1.0270127118644068, "grad_norm": 1.54276442527771, "learning_rate": 9.486626059322035e-06, "loss": 1.3725, "mean_token_accuracy": 0.672671727836132, "num_tokens": 3118351.0, "step": 3878 }, { "epoch": 1.027542372881356, "grad_norm": 1.748225450515747, "learning_rate": 9.48636122881356e-06, "loss": 1.431, "mean_token_accuracy": 0.667789414525032, "num_tokens": 3119898.0, "step": 3880 }, { "epoch": 1.0280720338983051, "grad_norm": 1.773168921470642, "learning_rate": 9.486096398305086e-06, "loss": 1.4943, "mean_token_accuracy": 0.6886547431349754, "num_tokens": 3121445.0, "step": 3882 }, { "epoch": 1.0286016949152543, "grad_norm": 1.3216512203216553, "learning_rate": 9.485831567796611e-06, "loss": 1.1314, "mean_token_accuracy": 0.7626180797815323, "num_tokens": 3123047.0, "step": 3884 }, { "epoch": 1.0291313559322033, "grad_norm": 0.9008073210716248, "learning_rate": 9.485566737288136e-06, "loss": 1.2185, "mean_token_accuracy": 0.6825415343046188, "num_tokens": 3125543.0, "step": 3886 }, { "epoch": 1.0296610169491525, "grad_norm": 1.552333116531372, "learning_rate": 9.485301906779661e-06, "loss": 1.1546, "mean_token_accuracy": 0.7454095557332039, "num_tokens": 3127092.0, "step": 3888 }, { "epoch": 1.0301906779661016, "grad_norm": 1.2805906534194946, "learning_rate": 9.485037076271188e-06, "loss": 1.2038, "mean_token_accuracy": 0.7052684500813484, "num_tokens": 3129083.0, "step": 3890 }, { "epoch": 1.0307203389830508, "grad_norm": 1.8620647192001343, "learning_rate": 9.484772245762713e-06, "loss": 1.5759, "mean_token_accuracy": 0.6650434657931328, "num_tokens": 3130565.0, "step": 3892 }, { "epoch": 1.03125, "grad_norm": 1.509053349494934, "learning_rate": 9.484507415254238e-06, "loss": 1.25, "mean_token_accuracy": 0.7142548561096191, "num_tokens": 3132217.0, "step": 3894 }, { "epoch": 1.0317796610169492, "grad_norm": 1.452092170715332, "learning_rate": 9.484242584745763e-06, "loss": 0.9535, "mean_token_accuracy": 0.7667273506522179, "num_tokens": 3133683.0, "step": 3896 }, { "epoch": 1.0323093220338984, "grad_norm": 1.7935537099838257, "learning_rate": 9.48397775423729e-06, "loss": 1.7285, "mean_token_accuracy": 0.6292761042714119, "num_tokens": 3135362.0, "step": 3898 }, { "epoch": 1.0328389830508475, "grad_norm": 1.658869981765747, "learning_rate": 9.483712923728814e-06, "loss": 1.0998, "mean_token_accuracy": 0.7323822677135468, "num_tokens": 3136757.0, "step": 3900 }, { "epoch": 1.0333686440677967, "grad_norm": 1.5775535106658936, "learning_rate": 9.483448093220339e-06, "loss": 0.9608, "mean_token_accuracy": 0.7734763920307159, "num_tokens": 3138316.0, "step": 3902 }, { "epoch": 1.0338983050847457, "grad_norm": 1.697096824645996, "learning_rate": 9.483183262711864e-06, "loss": 1.8178, "mean_token_accuracy": 0.6326793134212494, "num_tokens": 3139961.0, "step": 3904 }, { "epoch": 1.0344279661016949, "grad_norm": 1.5153474807739258, "learning_rate": 9.48291843220339e-06, "loss": 1.2834, "mean_token_accuracy": 0.7042824365198612, "num_tokens": 3141631.0, "step": 3906 }, { "epoch": 1.034957627118644, "grad_norm": 1.5279102325439453, "learning_rate": 9.482653601694916e-06, "loss": 1.4901, "mean_token_accuracy": 0.6583013236522675, "num_tokens": 3143540.0, "step": 3908 }, { "epoch": 1.0354872881355932, "grad_norm": 1.7168548107147217, "learning_rate": 9.482388771186442e-06, "loss": 1.5609, "mean_token_accuracy": 0.6560562700033188, "num_tokens": 3145168.0, "step": 3910 }, { "epoch": 1.0360169491525424, "grad_norm": 1.461302399635315, "learning_rate": 9.482123940677967e-06, "loss": 1.0753, "mean_token_accuracy": 0.7620869129896164, "num_tokens": 3146464.0, "step": 3912 }, { "epoch": 1.0365466101694916, "grad_norm": 1.9824862480163574, "learning_rate": 9.481859110169492e-06, "loss": 0.8707, "mean_token_accuracy": 0.7818902060389519, "num_tokens": 3147899.0, "step": 3914 }, { "epoch": 1.0370762711864407, "grad_norm": 1.6479988098144531, "learning_rate": 9.481594279661017e-06, "loss": 1.3106, "mean_token_accuracy": 0.7429661676287651, "num_tokens": 3149526.0, "step": 3916 }, { "epoch": 1.03760593220339, "grad_norm": 1.3284671306610107, "learning_rate": 9.481329449152544e-06, "loss": 0.8559, "mean_token_accuracy": 0.7749153673648834, "num_tokens": 3151058.0, "step": 3918 }, { "epoch": 1.0381355932203389, "grad_norm": 1.8635011911392212, "learning_rate": 9.481064618644069e-06, "loss": 1.4538, "mean_token_accuracy": 0.674446314573288, "num_tokens": 3152423.0, "step": 3920 }, { "epoch": 1.038665254237288, "grad_norm": 1.7579820156097412, "learning_rate": 9.480799788135593e-06, "loss": 1.8614, "mean_token_accuracy": 0.6070816144347191, "num_tokens": 3154084.0, "step": 3922 }, { "epoch": 1.0391949152542372, "grad_norm": 1.62900972366333, "learning_rate": 9.48053495762712e-06, "loss": 1.6011, "mean_token_accuracy": 0.6575276926159859, "num_tokens": 3155735.0, "step": 3924 }, { "epoch": 1.0397245762711864, "grad_norm": 1.833269715309143, "learning_rate": 9.480270127118645e-06, "loss": 1.1363, "mean_token_accuracy": 0.7350514195859432, "num_tokens": 3157317.0, "step": 3926 }, { "epoch": 1.0402542372881356, "grad_norm": 1.7618213891983032, "learning_rate": 9.48000529661017e-06, "loss": 1.4689, "mean_token_accuracy": 0.6544367745518684, "num_tokens": 3158737.0, "step": 3928 }, { "epoch": 1.0407838983050848, "grad_norm": 1.4662202596664429, "learning_rate": 9.479740466101695e-06, "loss": 1.0558, "mean_token_accuracy": 0.7713286951184273, "num_tokens": 3160234.0, "step": 3930 }, { "epoch": 1.041313559322034, "grad_norm": 1.2974028587341309, "learning_rate": 9.479475635593221e-06, "loss": 1.4247, "mean_token_accuracy": 0.6942079216241837, "num_tokens": 3161774.0, "step": 3932 }, { "epoch": 1.0418432203389831, "grad_norm": 1.5018564462661743, "learning_rate": 9.479210805084746e-06, "loss": 0.8907, "mean_token_accuracy": 0.7909849435091019, "num_tokens": 3163102.0, "step": 3934 }, { "epoch": 1.042372881355932, "grad_norm": 1.314963698387146, "learning_rate": 9.478945974576273e-06, "loss": 1.4981, "mean_token_accuracy": 0.6285936161875725, "num_tokens": 3165543.0, "step": 3936 }, { "epoch": 1.0429025423728813, "grad_norm": 1.501285195350647, "learning_rate": 9.478681144067798e-06, "loss": 1.2089, "mean_token_accuracy": 0.7269926816225052, "num_tokens": 3167034.0, "step": 3938 }, { "epoch": 1.0434322033898304, "grad_norm": 1.5483733415603638, "learning_rate": 9.478416313559323e-06, "loss": 1.0159, "mean_token_accuracy": 0.754554457962513, "num_tokens": 3168708.0, "step": 3940 }, { "epoch": 1.0439618644067796, "grad_norm": 1.539280652999878, "learning_rate": 9.478151483050848e-06, "loss": 1.0926, "mean_token_accuracy": 0.7511792480945587, "num_tokens": 3170075.0, "step": 3942 }, { "epoch": 1.0444915254237288, "grad_norm": 1.6652880907058716, "learning_rate": 9.477886652542374e-06, "loss": 1.5697, "mean_token_accuracy": 0.6652492135763168, "num_tokens": 3171739.0, "step": 3944 }, { "epoch": 1.045021186440678, "grad_norm": 1.7315768003463745, "learning_rate": 9.4776218220339e-06, "loss": 1.4397, "mean_token_accuracy": 0.6703441068530083, "num_tokens": 3173218.0, "step": 3946 }, { "epoch": 1.0455508474576272, "grad_norm": 1.5146340131759644, "learning_rate": 9.477356991525424e-06, "loss": 0.7572, "mean_token_accuracy": 0.8033567890524864, "num_tokens": 3174679.0, "step": 3948 }, { "epoch": 1.0460805084745763, "grad_norm": 1.6623655557632446, "learning_rate": 9.47709216101695e-06, "loss": 0.9828, "mean_token_accuracy": 0.7560366988182068, "num_tokens": 3175842.0, "step": 3950 }, { "epoch": 1.0466101694915255, "grad_norm": 1.9327970743179321, "learning_rate": 9.476827330508476e-06, "loss": 1.3754, "mean_token_accuracy": 0.6855651810765266, "num_tokens": 3177224.0, "step": 3952 }, { "epoch": 1.0471398305084745, "grad_norm": 1.8774231672286987, "learning_rate": 9.4765625e-06, "loss": 1.024, "mean_token_accuracy": 0.7669935971498489, "num_tokens": 3178773.0, "step": 3954 }, { "epoch": 1.0476694915254237, "grad_norm": 1.4780828952789307, "learning_rate": 9.476297669491526e-06, "loss": 1.0518, "mean_token_accuracy": 0.7447496950626373, "num_tokens": 3180431.0, "step": 3956 }, { "epoch": 1.0481991525423728, "grad_norm": 1.8116099834442139, "learning_rate": 9.47603283898305e-06, "loss": 1.0913, "mean_token_accuracy": 0.7226938009262085, "num_tokens": 3181958.0, "step": 3958 }, { "epoch": 1.048728813559322, "grad_norm": 1.702584147453308, "learning_rate": 9.475768008474577e-06, "loss": 1.3842, "mean_token_accuracy": 0.6939767152070999, "num_tokens": 3183734.0, "step": 3960 }, { "epoch": 1.0492584745762712, "grad_norm": 1.4972819089889526, "learning_rate": 9.475503177966102e-06, "loss": 1.6349, "mean_token_accuracy": 0.636763259768486, "num_tokens": 3185508.0, "step": 3962 }, { "epoch": 1.0497881355932204, "grad_norm": 1.7102375030517578, "learning_rate": 9.475238347457629e-06, "loss": 1.1895, "mean_token_accuracy": 0.7265922427177429, "num_tokens": 3186882.0, "step": 3964 }, { "epoch": 1.0503177966101696, "grad_norm": 1.675208568572998, "learning_rate": 9.474973516949154e-06, "loss": 1.151, "mean_token_accuracy": 0.7096633538603783, "num_tokens": 3188385.0, "step": 3966 }, { "epoch": 1.0508474576271187, "grad_norm": 1.6674253940582275, "learning_rate": 9.474708686440679e-06, "loss": 1.6475, "mean_token_accuracy": 0.6550891101360321, "num_tokens": 3189822.0, "step": 3968 }, { "epoch": 1.051377118644068, "grad_norm": 1.8539615869522095, "learning_rate": 9.474443855932204e-06, "loss": 1.6019, "mean_token_accuracy": 0.643678218126297, "num_tokens": 3191250.0, "step": 3970 }, { "epoch": 1.0519067796610169, "grad_norm": 1.812337040901184, "learning_rate": 9.47417902542373e-06, "loss": 1.5325, "mean_token_accuracy": 0.6451326906681061, "num_tokens": 3192787.0, "step": 3972 }, { "epoch": 1.052436440677966, "grad_norm": 1.2884248495101929, "learning_rate": 9.473914194915255e-06, "loss": 1.5149, "mean_token_accuracy": 0.6726487316191196, "num_tokens": 3194494.0, "step": 3974 }, { "epoch": 1.0529661016949152, "grad_norm": 1.7380744218826294, "learning_rate": 9.47364936440678e-06, "loss": 1.2417, "mean_token_accuracy": 0.7026951834559441, "num_tokens": 3196106.0, "step": 3976 }, { "epoch": 1.0534957627118644, "grad_norm": 1.1970281600952148, "learning_rate": 9.473384533898305e-06, "loss": 1.4324, "mean_token_accuracy": 0.6665805354714394, "num_tokens": 3198218.0, "step": 3978 }, { "epoch": 1.0540254237288136, "grad_norm": 1.2183188199996948, "learning_rate": 9.473119703389832e-06, "loss": 1.3309, "mean_token_accuracy": 0.7115498036146164, "num_tokens": 3199832.0, "step": 3980 }, { "epoch": 1.0545550847457628, "grad_norm": 1.4511255025863647, "learning_rate": 9.472854872881357e-06, "loss": 1.0575, "mean_token_accuracy": 0.7474708706140518, "num_tokens": 3201391.0, "step": 3982 }, { "epoch": 1.055084745762712, "grad_norm": 1.6854256391525269, "learning_rate": 9.472590042372882e-06, "loss": 1.22, "mean_token_accuracy": 0.729277141392231, "num_tokens": 3202974.0, "step": 3984 }, { "epoch": 1.0556144067796611, "grad_norm": 1.2887992858886719, "learning_rate": 9.472325211864406e-06, "loss": 1.2768, "mean_token_accuracy": 0.6904527172446251, "num_tokens": 3204758.0, "step": 3986 }, { "epoch": 1.05614406779661, "grad_norm": 1.2447428703308105, "learning_rate": 9.472060381355933e-06, "loss": 1.0688, "mean_token_accuracy": 0.7338692247867584, "num_tokens": 3206520.0, "step": 3988 }, { "epoch": 1.0566737288135593, "grad_norm": 1.885736346244812, "learning_rate": 9.471795550847458e-06, "loss": 1.1051, "mean_token_accuracy": 0.7523752748966217, "num_tokens": 3208180.0, "step": 3990 }, { "epoch": 1.0572033898305084, "grad_norm": 1.6788476705551147, "learning_rate": 9.471530720338985e-06, "loss": 0.8375, "mean_token_accuracy": 0.801537312567234, "num_tokens": 3209403.0, "step": 3992 }, { "epoch": 1.0577330508474576, "grad_norm": 1.835791826248169, "learning_rate": 9.47126588983051e-06, "loss": 1.7735, "mean_token_accuracy": 0.5952439680695534, "num_tokens": 3210795.0, "step": 3994 }, { "epoch": 1.0582627118644068, "grad_norm": 1.7461769580841064, "learning_rate": 9.471001059322034e-06, "loss": 1.5403, "mean_token_accuracy": 0.6655706875026226, "num_tokens": 3212319.0, "step": 3996 }, { "epoch": 1.058792372881356, "grad_norm": 1.8971648216247559, "learning_rate": 9.47073622881356e-06, "loss": 1.3487, "mean_token_accuracy": 0.7061612121760845, "num_tokens": 3214159.0, "step": 3998 }, { "epoch": 1.0593220338983051, "grad_norm": 1.7546019554138184, "learning_rate": 9.470471398305086e-06, "loss": 1.3482, "step": 4000 }, { "epoch": 1.0593220338983051, "eval_loss": 1.3240890502929688, "eval_mean_token_accuracy": 0.6986562195536378, "eval_num_tokens": 3215896.0, "eval_runtime": 48.1303, "eval_samples_per_second": 6.399, "eval_steps_per_second": 6.399, "step": 4000 }, { "epoch": 1.0598516949152543, "grad_norm": 1.7748736143112183, "learning_rate": 9.470206567796611e-06, "loss": 1.3745, "mean_token_accuracy": 0.7035325765609741, "num_tokens": 3217646.0, "step": 4002 }, { "epoch": 1.0603813559322033, "grad_norm": 1.504487156867981, "learning_rate": 9.469941737288136e-06, "loss": 1.4756, "mean_token_accuracy": 0.6863018721342087, "num_tokens": 3219353.0, "step": 4004 }, { "epoch": 1.0609110169491525, "grad_norm": 2.1407270431518555, "learning_rate": 9.469676906779663e-06, "loss": 1.5315, "mean_token_accuracy": 0.6506499126553535, "num_tokens": 3221001.0, "step": 4006 }, { "epoch": 1.0614406779661016, "grad_norm": 2.8499064445495605, "learning_rate": 9.469412076271187e-06, "loss": 1.4573, "mean_token_accuracy": 0.667199157178402, "num_tokens": 3222627.0, "step": 4008 }, { "epoch": 1.0619703389830508, "grad_norm": 1.7750129699707031, "learning_rate": 9.469147245762712e-06, "loss": 1.8429, "mean_token_accuracy": 0.6124612540006638, "num_tokens": 3224111.0, "step": 4010 }, { "epoch": 1.0625, "grad_norm": 1.3781139850616455, "learning_rate": 9.468882415254237e-06, "loss": 1.3537, "mean_token_accuracy": 0.6950515173375607, "num_tokens": 3225715.0, "step": 4012 }, { "epoch": 1.0630296610169492, "grad_norm": 1.5257920026779175, "learning_rate": 9.468617584745764e-06, "loss": 1.4403, "mean_token_accuracy": 0.6624070405960083, "num_tokens": 3227547.0, "step": 4014 }, { "epoch": 1.0635593220338984, "grad_norm": 1.42392897605896, "learning_rate": 9.468352754237289e-06, "loss": 0.9887, "mean_token_accuracy": 0.7745935097336769, "num_tokens": 3228879.0, "step": 4016 }, { "epoch": 1.0640889830508475, "grad_norm": 1.321719765663147, "learning_rate": 9.468087923728815e-06, "loss": 1.1046, "mean_token_accuracy": 0.7510957792401314, "num_tokens": 3230681.0, "step": 4018 }, { "epoch": 1.0646186440677967, "grad_norm": 1.4396464824676514, "learning_rate": 9.46782309322034e-06, "loss": 1.2847, "mean_token_accuracy": 0.6850610300898552, "num_tokens": 3232262.0, "step": 4020 }, { "epoch": 1.0651483050847457, "grad_norm": 1.309072732925415, "learning_rate": 9.467558262711865e-06, "loss": 1.181, "mean_token_accuracy": 0.720014113932848, "num_tokens": 3233871.0, "step": 4022 }, { "epoch": 1.0656779661016949, "grad_norm": 2.032788038253784, "learning_rate": 9.46729343220339e-06, "loss": 1.292, "mean_token_accuracy": 0.7140540108084679, "num_tokens": 3235292.0, "step": 4024 }, { "epoch": 1.066207627118644, "grad_norm": 1.665446400642395, "learning_rate": 9.467028601694917e-06, "loss": 1.0416, "mean_token_accuracy": 0.7516925111413002, "num_tokens": 3236511.0, "step": 4026 }, { "epoch": 1.0667372881355932, "grad_norm": 2.155701160430908, "learning_rate": 9.466763771186442e-06, "loss": 1.8432, "mean_token_accuracy": 0.6410410553216934, "num_tokens": 3237997.0, "step": 4028 }, { "epoch": 1.0672669491525424, "grad_norm": 1.439494252204895, "learning_rate": 9.466498940677967e-06, "loss": 1.6132, "mean_token_accuracy": 0.652996763586998, "num_tokens": 3239999.0, "step": 4030 }, { "epoch": 1.0677966101694916, "grad_norm": 1.423277735710144, "learning_rate": 9.466234110169492e-06, "loss": 0.9167, "mean_token_accuracy": 0.7966446280479431, "num_tokens": 3241525.0, "step": 4032 }, { "epoch": 1.0683262711864407, "grad_norm": 1.5604941844940186, "learning_rate": 9.465969279661018e-06, "loss": 1.5732, "mean_token_accuracy": 0.6527433544397354, "num_tokens": 3243180.0, "step": 4034 }, { "epoch": 1.06885593220339, "grad_norm": 1.8365967273712158, "learning_rate": 9.465704449152543e-06, "loss": 1.5633, "mean_token_accuracy": 0.6533653140068054, "num_tokens": 3244866.0, "step": 4036 }, { "epoch": 1.0693855932203389, "grad_norm": 1.9144924879074097, "learning_rate": 9.465439618644068e-06, "loss": 1.3696, "mean_token_accuracy": 0.6840886697173119, "num_tokens": 3246248.0, "step": 4038 }, { "epoch": 1.069915254237288, "grad_norm": 2.0161638259887695, "learning_rate": 9.465174788135593e-06, "loss": 1.4421, "mean_token_accuracy": 0.6722387671470642, "num_tokens": 3247690.0, "step": 4040 }, { "epoch": 1.0704449152542372, "grad_norm": 1.7423620223999023, "learning_rate": 9.46490995762712e-06, "loss": 1.6532, "mean_token_accuracy": 0.6259895339608192, "num_tokens": 3249264.0, "step": 4042 }, { "epoch": 1.0709745762711864, "grad_norm": 1.8224899768829346, "learning_rate": 9.464645127118645e-06, "loss": 1.6393, "mean_token_accuracy": 0.6366088092327118, "num_tokens": 3251039.0, "step": 4044 }, { "epoch": 1.0715042372881356, "grad_norm": 1.7485878467559814, "learning_rate": 9.464380296610171e-06, "loss": 1.7294, "mean_token_accuracy": 0.6320789232850075, "num_tokens": 3252703.0, "step": 4046 }, { "epoch": 1.0720338983050848, "grad_norm": 1.6253207921981812, "learning_rate": 9.464115466101696e-06, "loss": 1.5337, "mean_token_accuracy": 0.6686110645532608, "num_tokens": 3254561.0, "step": 4048 }, { "epoch": 1.072563559322034, "grad_norm": 1.499680519104004, "learning_rate": 9.463850635593221e-06, "loss": 1.467, "mean_token_accuracy": 0.6806646659970284, "num_tokens": 3256261.0, "step": 4050 }, { "epoch": 1.0730932203389831, "grad_norm": 1.8259985446929932, "learning_rate": 9.463585805084746e-06, "loss": 1.2367, "mean_token_accuracy": 0.7132184132933617, "num_tokens": 3258001.0, "step": 4052 }, { "epoch": 1.073622881355932, "grad_norm": 1.7987319231033325, "learning_rate": 9.463320974576273e-06, "loss": 1.5461, "mean_token_accuracy": 0.6783919483423233, "num_tokens": 3259487.0, "step": 4054 }, { "epoch": 1.0741525423728813, "grad_norm": 1.3009865283966064, "learning_rate": 9.463056144067798e-06, "loss": 0.9277, "mean_token_accuracy": 0.7710608765482903, "num_tokens": 3260912.0, "step": 4056 }, { "epoch": 1.0746822033898304, "grad_norm": 2.022042989730835, "learning_rate": 9.462791313559323e-06, "loss": 1.6838, "mean_token_accuracy": 0.6344348564743996, "num_tokens": 3262499.0, "step": 4058 }, { "epoch": 1.0752118644067796, "grad_norm": 1.5179418325424194, "learning_rate": 9.462526483050847e-06, "loss": 0.844, "mean_token_accuracy": 0.7894523814320564, "num_tokens": 3264158.0, "step": 4060 }, { "epoch": 1.0757415254237288, "grad_norm": 1.4726959466934204, "learning_rate": 9.462261652542374e-06, "loss": 1.0175, "mean_token_accuracy": 0.7641738206148148, "num_tokens": 3265606.0, "step": 4062 }, { "epoch": 1.076271186440678, "grad_norm": 1.823460578918457, "learning_rate": 9.461996822033899e-06, "loss": 0.8625, "mean_token_accuracy": 0.7864466160535812, "num_tokens": 3266861.0, "step": 4064 }, { "epoch": 1.0768008474576272, "grad_norm": 1.0616379976272583, "learning_rate": 9.461731991525424e-06, "loss": 1.2717, "mean_token_accuracy": 0.7175575345754623, "num_tokens": 3268533.0, "step": 4066 }, { "epoch": 1.0773305084745763, "grad_norm": 1.165020227432251, "learning_rate": 9.461467161016949e-06, "loss": 0.7625, "mean_token_accuracy": 0.7882163003087044, "num_tokens": 3270237.0, "step": 4068 }, { "epoch": 1.0778601694915255, "grad_norm": 1.4966719150543213, "learning_rate": 9.461202330508475e-06, "loss": 1.2987, "mean_token_accuracy": 0.6805596724152565, "num_tokens": 3271673.0, "step": 4070 }, { "epoch": 1.0783898305084745, "grad_norm": 1.4745349884033203, "learning_rate": 9.4609375e-06, "loss": 1.3104, "mean_token_accuracy": 0.7007766366004944, "num_tokens": 3273154.0, "step": 4072 }, { "epoch": 1.0789194915254237, "grad_norm": 1.89617121219635, "learning_rate": 9.460672669491527e-06, "loss": 1.1475, "mean_token_accuracy": 0.7375425398349762, "num_tokens": 3274667.0, "step": 4074 }, { "epoch": 1.0794491525423728, "grad_norm": 1.4712574481964111, "learning_rate": 9.460407838983052e-06, "loss": 1.3555, "mean_token_accuracy": 0.6947829984128475, "num_tokens": 3276221.0, "step": 4076 }, { "epoch": 1.079978813559322, "grad_norm": 1.6038291454315186, "learning_rate": 9.460143008474577e-06, "loss": 1.2403, "mean_token_accuracy": 0.7025328353047371, "num_tokens": 3277964.0, "step": 4078 }, { "epoch": 1.0805084745762712, "grad_norm": 1.9607387781143188, "learning_rate": 9.459878177966102e-06, "loss": 1.0075, "mean_token_accuracy": 0.7459206096827984, "num_tokens": 3279525.0, "step": 4080 }, { "epoch": 1.0810381355932204, "grad_norm": 2.1020660400390625, "learning_rate": 9.459613347457628e-06, "loss": 1.6999, "mean_token_accuracy": 0.6180169396102428, "num_tokens": 3281049.0, "step": 4082 }, { "epoch": 1.0815677966101696, "grad_norm": 1.424389362335205, "learning_rate": 9.459348516949153e-06, "loss": 1.1166, "mean_token_accuracy": 0.7330451086163521, "num_tokens": 3282669.0, "step": 4084 }, { "epoch": 1.0820974576271187, "grad_norm": 1.70907461643219, "learning_rate": 9.459083686440678e-06, "loss": 1.2301, "mean_token_accuracy": 0.7011746242642403, "num_tokens": 3284198.0, "step": 4086 }, { "epoch": 1.082627118644068, "grad_norm": 1.2837095260620117, "learning_rate": 9.458818855932203e-06, "loss": 1.2727, "mean_token_accuracy": 0.7066819071769714, "num_tokens": 3285866.0, "step": 4088 }, { "epoch": 1.0831567796610169, "grad_norm": 1.6241883039474487, "learning_rate": 9.45855402542373e-06, "loss": 1.525, "mean_token_accuracy": 0.6665560826659203, "num_tokens": 3287448.0, "step": 4090 }, { "epoch": 1.083686440677966, "grad_norm": 1.8973885774612427, "learning_rate": 9.458289194915255e-06, "loss": 1.4228, "mean_token_accuracy": 0.6937892884016037, "num_tokens": 3288955.0, "step": 4092 }, { "epoch": 1.0842161016949152, "grad_norm": 1.905378818511963, "learning_rate": 9.45802436440678e-06, "loss": 1.3894, "mean_token_accuracy": 0.7067089304327965, "num_tokens": 3290375.0, "step": 4094 }, { "epoch": 1.0847457627118644, "grad_norm": 1.645282506942749, "learning_rate": 9.457759533898306e-06, "loss": 0.8464, "mean_token_accuracy": 0.7872451692819595, "num_tokens": 3292048.0, "step": 4096 }, { "epoch": 1.0852754237288136, "grad_norm": 2.417029619216919, "learning_rate": 9.457494703389831e-06, "loss": 1.3237, "mean_token_accuracy": 0.715477779507637, "num_tokens": 3293727.0, "step": 4098 }, { "epoch": 1.0858050847457628, "grad_norm": 1.973750114440918, "learning_rate": 9.457229872881358e-06, "loss": 1.47, "mean_token_accuracy": 0.6863479688763618, "num_tokens": 3295226.0, "step": 4100 }, { "epoch": 1.086334745762712, "grad_norm": 1.5356248617172241, "learning_rate": 9.456965042372883e-06, "loss": 1.3538, "mean_token_accuracy": 0.6744297966361046, "num_tokens": 3297154.0, "step": 4102 }, { "epoch": 1.0868644067796611, "grad_norm": 1.420547366142273, "learning_rate": 9.456700211864408e-06, "loss": 1.4272, "mean_token_accuracy": 0.6734664291143417, "num_tokens": 3298825.0, "step": 4104 }, { "epoch": 1.08739406779661, "grad_norm": 1.4590282440185547, "learning_rate": 9.456435381355933e-06, "loss": 0.9098, "mean_token_accuracy": 0.7655415385961533, "num_tokens": 3300059.0, "step": 4106 }, { "epoch": 1.0879237288135593, "grad_norm": 1.75541090965271, "learning_rate": 9.45617055084746e-06, "loss": 1.4084, "mean_token_accuracy": 0.6838154159486294, "num_tokens": 3301637.0, "step": 4108 }, { "epoch": 1.0884533898305084, "grad_norm": 2.0046660900115967, "learning_rate": 9.455905720338984e-06, "loss": 1.1245, "mean_token_accuracy": 0.7334098368883133, "num_tokens": 3303056.0, "step": 4110 }, { "epoch": 1.0889830508474576, "grad_norm": 1.5959972143173218, "learning_rate": 9.45564088983051e-06, "loss": 1.239, "mean_token_accuracy": 0.7161201313138008, "num_tokens": 3304565.0, "step": 4112 }, { "epoch": 1.0895127118644068, "grad_norm": 1.4907130002975464, "learning_rate": 9.455376059322034e-06, "loss": 1.46, "mean_token_accuracy": 0.6805664002895355, "num_tokens": 3306387.0, "step": 4114 }, { "epoch": 1.090042372881356, "grad_norm": 1.6332749128341675, "learning_rate": 9.45511122881356e-06, "loss": 1.4055, "mean_token_accuracy": 0.6553420275449753, "num_tokens": 3308007.0, "step": 4116 }, { "epoch": 1.0905720338983051, "grad_norm": 1.799530267715454, "learning_rate": 9.454846398305086e-06, "loss": 1.4628, "mean_token_accuracy": 0.6762544959783554, "num_tokens": 3309268.0, "step": 4118 }, { "epoch": 1.0911016949152543, "grad_norm": 1.7979782819747925, "learning_rate": 9.45458156779661e-06, "loss": 1.6794, "mean_token_accuracy": 0.675032302737236, "num_tokens": 3310764.0, "step": 4120 }, { "epoch": 1.0916313559322033, "grad_norm": 1.6247868537902832, "learning_rate": 9.454316737288136e-06, "loss": 1.8195, "mean_token_accuracy": 0.6192290149629116, "num_tokens": 3312657.0, "step": 4122 }, { "epoch": 1.0921610169491525, "grad_norm": 1.3905324935913086, "learning_rate": 9.454051906779662e-06, "loss": 1.25, "mean_token_accuracy": 0.704513244330883, "num_tokens": 3314533.0, "step": 4124 }, { "epoch": 1.0926906779661016, "grad_norm": 1.1933772563934326, "learning_rate": 9.453787076271187e-06, "loss": 1.3894, "mean_token_accuracy": 0.6746693402528763, "num_tokens": 3316302.0, "step": 4126 }, { "epoch": 1.0932203389830508, "grad_norm": 1.5342519283294678, "learning_rate": 9.453522245762714e-06, "loss": 1.4184, "mean_token_accuracy": 0.6724093034863472, "num_tokens": 3317756.0, "step": 4128 }, { "epoch": 1.09375, "grad_norm": 1.9658912420272827, "learning_rate": 9.453257415254239e-06, "loss": 1.4538, "mean_token_accuracy": 0.6661797389388084, "num_tokens": 3319312.0, "step": 4130 }, { "epoch": 1.0942796610169492, "grad_norm": 1.5972130298614502, "learning_rate": 9.452992584745764e-06, "loss": 2.0069, "mean_token_accuracy": 0.5763574205338955, "num_tokens": 3321034.0, "step": 4132 }, { "epoch": 1.0948093220338984, "grad_norm": 1.676784873008728, "learning_rate": 9.452727754237288e-06, "loss": 1.1407, "mean_token_accuracy": 0.7209281623363495, "num_tokens": 3322380.0, "step": 4134 }, { "epoch": 1.0953389830508475, "grad_norm": 1.4500932693481445, "learning_rate": 9.452462923728815e-06, "loss": 1.2192, "mean_token_accuracy": 0.7509864047169685, "num_tokens": 3324705.0, "step": 4136 }, { "epoch": 1.0958686440677967, "grad_norm": 1.4973108768463135, "learning_rate": 9.45219809322034e-06, "loss": 1.1444, "mean_token_accuracy": 0.7328818514943123, "num_tokens": 3326474.0, "step": 4138 }, { "epoch": 1.0963983050847457, "grad_norm": 1.483632206916809, "learning_rate": 9.451933262711865e-06, "loss": 1.1505, "mean_token_accuracy": 0.710758350789547, "num_tokens": 3328412.0, "step": 4140 }, { "epoch": 1.0969279661016949, "grad_norm": 1.7290173768997192, "learning_rate": 9.45166843220339e-06, "loss": 1.519, "mean_token_accuracy": 0.6721399277448654, "num_tokens": 3329946.0, "step": 4142 }, { "epoch": 1.097457627118644, "grad_norm": 1.5914556980133057, "learning_rate": 9.451403601694917e-06, "loss": 1.2321, "mean_token_accuracy": 0.7243836522102356, "num_tokens": 3331390.0, "step": 4144 }, { "epoch": 1.0979872881355932, "grad_norm": 1.1906887292861938, "learning_rate": 9.451138771186441e-06, "loss": 1.1815, "mean_token_accuracy": 0.7484050095081329, "num_tokens": 3333083.0, "step": 4146 }, { "epoch": 1.0985169491525424, "grad_norm": 1.7248531579971313, "learning_rate": 9.450873940677966e-06, "loss": 1.2077, "mean_token_accuracy": 0.7200669199228287, "num_tokens": 3334538.0, "step": 4148 }, { "epoch": 1.0990466101694916, "grad_norm": 1.9945900440216064, "learning_rate": 9.450609110169491e-06, "loss": 1.5605, "mean_token_accuracy": 0.6195255815982819, "num_tokens": 3336283.0, "step": 4150 }, { "epoch": 1.0995762711864407, "grad_norm": 1.6605963706970215, "learning_rate": 9.450344279661018e-06, "loss": 1.0731, "mean_token_accuracy": 0.7373258322477341, "num_tokens": 3337877.0, "step": 4152 }, { "epoch": 1.10010593220339, "grad_norm": 1.3857301473617554, "learning_rate": 9.450079449152543e-06, "loss": 1.2081, "mean_token_accuracy": 0.715424083173275, "num_tokens": 3339709.0, "step": 4154 }, { "epoch": 1.1006355932203389, "grad_norm": 2.1457011699676514, "learning_rate": 9.44981461864407e-06, "loss": 1.4142, "mean_token_accuracy": 0.6761514320969582, "num_tokens": 3341289.0, "step": 4156 }, { "epoch": 1.101165254237288, "grad_norm": 1.6271635293960571, "learning_rate": 9.449549788135593e-06, "loss": 1.5886, "mean_token_accuracy": 0.6436975970864296, "num_tokens": 3343041.0, "step": 4158 }, { "epoch": 1.1016949152542372, "grad_norm": 1.9753304719924927, "learning_rate": 9.44928495762712e-06, "loss": 1.4841, "mean_token_accuracy": 0.6912944614887238, "num_tokens": 3344325.0, "step": 4160 }, { "epoch": 1.1022245762711864, "grad_norm": 1.464106559753418, "learning_rate": 9.449020127118644e-06, "loss": 1.3201, "mean_token_accuracy": 0.6642579063773155, "num_tokens": 3346036.0, "step": 4162 }, { "epoch": 1.1027542372881356, "grad_norm": 1.5078754425048828, "learning_rate": 9.448755296610171e-06, "loss": 0.9671, "mean_token_accuracy": 0.7382035553455353, "num_tokens": 3348515.0, "step": 4164 }, { "epoch": 1.1032838983050848, "grad_norm": 1.606825590133667, "learning_rate": 9.448490466101696e-06, "loss": 1.2229, "mean_token_accuracy": 0.7521993294358253, "num_tokens": 3349971.0, "step": 4166 }, { "epoch": 1.103813559322034, "grad_norm": 1.4128245115280151, "learning_rate": 9.44822563559322e-06, "loss": 1.3164, "mean_token_accuracy": 0.7130234688520432, "num_tokens": 3351430.0, "step": 4168 }, { "epoch": 1.1043432203389831, "grad_norm": 1.3492717742919922, "learning_rate": 9.447960805084746e-06, "loss": 1.2212, "mean_token_accuracy": 0.717784970998764, "num_tokens": 3352934.0, "step": 4170 }, { "epoch": 1.104872881355932, "grad_norm": 1.9183027744293213, "learning_rate": 9.447695974576272e-06, "loss": 1.1199, "mean_token_accuracy": 0.7560828179121017, "num_tokens": 3354482.0, "step": 4172 }, { "epoch": 1.1054025423728813, "grad_norm": 1.6212480068206787, "learning_rate": 9.447431144067797e-06, "loss": 1.096, "mean_token_accuracy": 0.7384021766483784, "num_tokens": 3356014.0, "step": 4174 }, { "epoch": 1.1059322033898304, "grad_norm": 1.6053966283798218, "learning_rate": 9.447166313559322e-06, "loss": 1.1878, "mean_token_accuracy": 0.7378502190113068, "num_tokens": 3357441.0, "step": 4176 }, { "epoch": 1.1064618644067796, "grad_norm": 2.125783681869507, "learning_rate": 9.446901483050849e-06, "loss": 1.6817, "mean_token_accuracy": 0.665664941072464, "num_tokens": 3358946.0, "step": 4178 }, { "epoch": 1.1069915254237288, "grad_norm": 1.6372365951538086, "learning_rate": 9.446636652542374e-06, "loss": 1.3969, "mean_token_accuracy": 0.6747739166021347, "num_tokens": 3360842.0, "step": 4180 }, { "epoch": 1.107521186440678, "grad_norm": 2.2342655658721924, "learning_rate": 9.4463718220339e-06, "loss": 1.3568, "mean_token_accuracy": 0.6987822726368904, "num_tokens": 3362090.0, "step": 4182 }, { "epoch": 1.1080508474576272, "grad_norm": 1.7552800178527832, "learning_rate": 9.446106991525425e-06, "loss": 1.3607, "mean_token_accuracy": 0.6931492015719414, "num_tokens": 3363657.0, "step": 4184 }, { "epoch": 1.1085805084745763, "grad_norm": 1.2609920501708984, "learning_rate": 9.44584216101695e-06, "loss": 1.2119, "mean_token_accuracy": 0.7300547137856483, "num_tokens": 3365334.0, "step": 4186 }, { "epoch": 1.1091101694915255, "grad_norm": 1.9335914850234985, "learning_rate": 9.445577330508475e-06, "loss": 1.219, "mean_token_accuracy": 0.7061306983232498, "num_tokens": 3366663.0, "step": 4188 }, { "epoch": 1.1096398305084745, "grad_norm": 1.5806523561477661, "learning_rate": 9.445312500000002e-06, "loss": 1.1484, "mean_token_accuracy": 0.737418957054615, "num_tokens": 3368229.0, "step": 4190 }, { "epoch": 1.1101694915254237, "grad_norm": 1.4976145029067993, "learning_rate": 9.445047669491527e-06, "loss": 1.2956, "mean_token_accuracy": 0.6822700798511505, "num_tokens": 3370173.0, "step": 4192 }, { "epoch": 1.1106991525423728, "grad_norm": 1.385416030883789, "learning_rate": 9.444782838983052e-06, "loss": 1.3144, "mean_token_accuracy": 0.6817285045981407, "num_tokens": 3371679.0, "step": 4194 }, { "epoch": 1.111228813559322, "grad_norm": 1.4864423274993896, "learning_rate": 9.444518008474577e-06, "loss": 1.7164, "mean_token_accuracy": 0.6274966448545456, "num_tokens": 3373354.0, "step": 4196 }, { "epoch": 1.1117584745762712, "grad_norm": 1.7320834398269653, "learning_rate": 9.444253177966103e-06, "loss": 1.4832, "mean_token_accuracy": 0.6699970439076424, "num_tokens": 3375004.0, "step": 4198 }, { "epoch": 1.1122881355932204, "grad_norm": 1.5152860879898071, "learning_rate": 9.443988347457628e-06, "loss": 1.3046, "mean_token_accuracy": 0.7060450911521912, "num_tokens": 3376727.0, "step": 4200 }, { "epoch": 1.1128177966101696, "grad_norm": 1.684181571006775, "learning_rate": 9.443723516949153e-06, "loss": 1.5813, "mean_token_accuracy": 0.6714176796376705, "num_tokens": 3378438.0, "step": 4202 }, { "epoch": 1.1133474576271187, "grad_norm": 1.634619951248169, "learning_rate": 9.443458686440678e-06, "loss": 1.0158, "mean_token_accuracy": 0.7411894649267197, "num_tokens": 3379796.0, "step": 4204 }, { "epoch": 1.113877118644068, "grad_norm": 1.6435502767562866, "learning_rate": 9.443193855932205e-06, "loss": 1.5678, "mean_token_accuracy": 0.654714547097683, "num_tokens": 3381571.0, "step": 4206 }, { "epoch": 1.1144067796610169, "grad_norm": 1.9769530296325684, "learning_rate": 9.44292902542373e-06, "loss": 1.6764, "mean_token_accuracy": 0.6362843886017799, "num_tokens": 3383244.0, "step": 4208 }, { "epoch": 1.114936440677966, "grad_norm": 1.445186972618103, "learning_rate": 9.442664194915256e-06, "loss": 1.3612, "mean_token_accuracy": 0.7119382619857788, "num_tokens": 3385204.0, "step": 4210 }, { "epoch": 1.1154661016949152, "grad_norm": 1.2776402235031128, "learning_rate": 9.44239936440678e-06, "loss": 1.1222, "mean_token_accuracy": 0.7357438951730728, "num_tokens": 3387171.0, "step": 4212 }, { "epoch": 1.1159957627118644, "grad_norm": 1.6540573835372925, "learning_rate": 9.442134533898306e-06, "loss": 1.4857, "mean_token_accuracy": 0.682125523686409, "num_tokens": 3388751.0, "step": 4214 }, { "epoch": 1.1165254237288136, "grad_norm": 1.690750002861023, "learning_rate": 9.441869703389831e-06, "loss": 1.4534, "mean_token_accuracy": 0.6875359639525414, "num_tokens": 3390218.0, "step": 4216 }, { "epoch": 1.1170550847457628, "grad_norm": 1.652869462966919, "learning_rate": 9.441604872881358e-06, "loss": 1.4411, "mean_token_accuracy": 0.6718604564666748, "num_tokens": 3391789.0, "step": 4218 }, { "epoch": 1.117584745762712, "grad_norm": 1.3527816534042358, "learning_rate": 9.441340042372882e-06, "loss": 1.5317, "mean_token_accuracy": 0.6447129249572754, "num_tokens": 3393479.0, "step": 4220 }, { "epoch": 1.1181144067796611, "grad_norm": 1.4403921365737915, "learning_rate": 9.441075211864407e-06, "loss": 1.316, "mean_token_accuracy": 0.6820070594549179, "num_tokens": 3395022.0, "step": 4222 }, { "epoch": 1.11864406779661, "grad_norm": 1.5026636123657227, "learning_rate": 9.440810381355932e-06, "loss": 1.4788, "mean_token_accuracy": 0.6693605482578278, "num_tokens": 3396474.0, "step": 4224 }, { "epoch": 1.1191737288135593, "grad_norm": 1.5722554922103882, "learning_rate": 9.440545550847459e-06, "loss": 1.187, "mean_token_accuracy": 0.7299395427107811, "num_tokens": 3397999.0, "step": 4226 }, { "epoch": 1.1197033898305084, "grad_norm": 1.2538467645645142, "learning_rate": 9.440280720338984e-06, "loss": 0.8956, "mean_token_accuracy": 0.7703671306371689, "num_tokens": 3399840.0, "step": 4228 }, { "epoch": 1.1202330508474576, "grad_norm": 1.6917004585266113, "learning_rate": 9.440015889830509e-06, "loss": 1.7167, "mean_token_accuracy": 0.6581258773803711, "num_tokens": 3401371.0, "step": 4230 }, { "epoch": 1.1207627118644068, "grad_norm": 1.612478256225586, "learning_rate": 9.439751059322034e-06, "loss": 1.1775, "mean_token_accuracy": 0.7314676269888878, "num_tokens": 3403003.0, "step": 4232 }, { "epoch": 1.121292372881356, "grad_norm": 1.5137722492218018, "learning_rate": 9.43948622881356e-06, "loss": 1.1127, "mean_token_accuracy": 0.7353985533118248, "num_tokens": 3404552.0, "step": 4234 }, { "epoch": 1.1218220338983051, "grad_norm": 1.551132082939148, "learning_rate": 9.439221398305085e-06, "loss": 1.1678, "mean_token_accuracy": 0.7196350991725922, "num_tokens": 3406083.0, "step": 4236 }, { "epoch": 1.1223516949152543, "grad_norm": 1.7667003870010376, "learning_rate": 9.438956567796612e-06, "loss": 1.3994, "mean_token_accuracy": 0.7058667205274105, "num_tokens": 3407673.0, "step": 4238 }, { "epoch": 1.1228813559322033, "grad_norm": 1.7187180519104004, "learning_rate": 9.438691737288135e-06, "loss": 1.4114, "mean_token_accuracy": 0.6510593891143799, "num_tokens": 3409401.0, "step": 4240 }, { "epoch": 1.1234110169491525, "grad_norm": 1.8994927406311035, "learning_rate": 9.438426906779662e-06, "loss": 1.4954, "mean_token_accuracy": 0.6881710514426231, "num_tokens": 3411066.0, "step": 4242 }, { "epoch": 1.1239406779661016, "grad_norm": 1.7344411611557007, "learning_rate": 9.438162076271187e-06, "loss": 1.3217, "mean_token_accuracy": 0.6961254440248013, "num_tokens": 3412685.0, "step": 4244 }, { "epoch": 1.1244703389830508, "grad_norm": 1.8819355964660645, "learning_rate": 9.437897245762713e-06, "loss": 1.6282, "mean_token_accuracy": 0.6718517988920212, "num_tokens": 3414314.0, "step": 4246 }, { "epoch": 1.125, "grad_norm": 1.4946626424789429, "learning_rate": 9.437632415254238e-06, "loss": 1.4062, "mean_token_accuracy": 0.6786701083183289, "num_tokens": 3416147.0, "step": 4248 }, { "epoch": 1.1255296610169492, "grad_norm": 1.6362916231155396, "learning_rate": 9.437367584745763e-06, "loss": 0.9947, "step": 4250 }, { "epoch": 1.1255296610169492, "eval_loss": 1.3234992027282715, "eval_mean_token_accuracy": 0.6980748744560527, "eval_num_tokens": 3417762.0, "eval_runtime": 48.3051, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 4250 }, { "epoch": 1.1260593220338984, "grad_norm": 1.4893168210983276, "learning_rate": 9.437102754237288e-06, "loss": 1.2664, "mean_token_accuracy": 0.7387927509844303, "num_tokens": 3419342.0, "step": 4252 }, { "epoch": 1.1265889830508475, "grad_norm": 1.3560192584991455, "learning_rate": 9.436837923728815e-06, "loss": 1.0407, "mean_token_accuracy": 0.7246357873082161, "num_tokens": 3421072.0, "step": 4254 }, { "epoch": 1.1271186440677967, "grad_norm": 1.9143378734588623, "learning_rate": 9.43657309322034e-06, "loss": 1.3484, "mean_token_accuracy": 0.6939830854535103, "num_tokens": 3422697.0, "step": 4256 }, { "epoch": 1.1276483050847457, "grad_norm": 1.7251214981079102, "learning_rate": 9.436308262711865e-06, "loss": 1.2913, "mean_token_accuracy": 0.7124537080526352, "num_tokens": 3424438.0, "step": 4258 }, { "epoch": 1.1281779661016949, "grad_norm": 1.8357045650482178, "learning_rate": 9.436043432203391e-06, "loss": 1.3996, "mean_token_accuracy": 0.6693154014647007, "num_tokens": 3425928.0, "step": 4260 }, { "epoch": 1.128707627118644, "grad_norm": 2.136573553085327, "learning_rate": 9.435778601694916e-06, "loss": 1.2954, "mean_token_accuracy": 0.7145699486136436, "num_tokens": 3427395.0, "step": 4262 }, { "epoch": 1.1292372881355932, "grad_norm": 1.5667444467544556, "learning_rate": 9.435513771186443e-06, "loss": 1.9005, "mean_token_accuracy": 0.5985250100493431, "num_tokens": 3429169.0, "step": 4264 }, { "epoch": 1.1297669491525424, "grad_norm": 1.6512469053268433, "learning_rate": 9.435248940677966e-06, "loss": 1.7558, "mean_token_accuracy": 0.6284651756286621, "num_tokens": 3430988.0, "step": 4266 }, { "epoch": 1.1302966101694916, "grad_norm": 1.500463843345642, "learning_rate": 9.434984110169493e-06, "loss": 1.2006, "mean_token_accuracy": 0.7114390283823013, "num_tokens": 3432765.0, "step": 4268 }, { "epoch": 1.1308262711864407, "grad_norm": 1.8324878215789795, "learning_rate": 9.434719279661018e-06, "loss": 1.7214, "mean_token_accuracy": 0.628391794860363, "num_tokens": 3434463.0, "step": 4270 }, { "epoch": 1.13135593220339, "grad_norm": 1.512302279472351, "learning_rate": 9.434454449152544e-06, "loss": 1.2046, "mean_token_accuracy": 0.7146783396601677, "num_tokens": 3436199.0, "step": 4272 }, { "epoch": 1.131885593220339, "grad_norm": 1.6481760740280151, "learning_rate": 9.434189618644069e-06, "loss": 1.3996, "mean_token_accuracy": 0.681435152888298, "num_tokens": 3437626.0, "step": 4274 }, { "epoch": 1.132415254237288, "grad_norm": 1.390669584274292, "learning_rate": 9.433924788135594e-06, "loss": 1.3958, "mean_token_accuracy": 0.7099952325224876, "num_tokens": 3439354.0, "step": 4276 }, { "epoch": 1.1329449152542372, "grad_norm": 1.221801996231079, "learning_rate": 9.433659957627119e-06, "loss": 0.8398, "mean_token_accuracy": 0.7854704782366753, "num_tokens": 3440875.0, "step": 4278 }, { "epoch": 1.1334745762711864, "grad_norm": 1.2065147161483765, "learning_rate": 9.433395127118646e-06, "loss": 1.1528, "mean_token_accuracy": 0.7078730762004852, "num_tokens": 3443245.0, "step": 4280 }, { "epoch": 1.1340042372881356, "grad_norm": 1.5071603059768677, "learning_rate": 9.43313029661017e-06, "loss": 1.1706, "mean_token_accuracy": 0.7187891378998756, "num_tokens": 3444630.0, "step": 4282 }, { "epoch": 1.1345338983050848, "grad_norm": 1.2189897298812866, "learning_rate": 9.432865466101695e-06, "loss": 0.8367, "mean_token_accuracy": 0.7963553443551064, "num_tokens": 3445922.0, "step": 4284 }, { "epoch": 1.135063559322034, "grad_norm": 1.78838050365448, "learning_rate": 9.43260063559322e-06, "loss": 1.5114, "mean_token_accuracy": 0.6641345098614693, "num_tokens": 3447438.0, "step": 4286 }, { "epoch": 1.1355932203389831, "grad_norm": 1.8066200017929077, "learning_rate": 9.432335805084747e-06, "loss": 1.4663, "mean_token_accuracy": 0.6731084436178207, "num_tokens": 3449001.0, "step": 4288 }, { "epoch": 1.136122881355932, "grad_norm": 1.5732935667037964, "learning_rate": 9.432070974576272e-06, "loss": 1.4996, "mean_token_accuracy": 0.6749891564249992, "num_tokens": 3450565.0, "step": 4290 }, { "epoch": 1.1366525423728813, "grad_norm": 1.2815485000610352, "learning_rate": 9.431806144067799e-06, "loss": 0.9457, "mean_token_accuracy": 0.7444457933306694, "num_tokens": 3452213.0, "step": 4292 }, { "epoch": 1.1371822033898304, "grad_norm": 1.7463219165802002, "learning_rate": 9.431541313559322e-06, "loss": 1.317, "mean_token_accuracy": 0.6965935006737709, "num_tokens": 3453794.0, "step": 4294 }, { "epoch": 1.1377118644067796, "grad_norm": 1.4955989122390747, "learning_rate": 9.431276483050848e-06, "loss": 1.1496, "mean_token_accuracy": 0.7111883834004402, "num_tokens": 3455194.0, "step": 4296 }, { "epoch": 1.1382415254237288, "grad_norm": 1.5804837942123413, "learning_rate": 9.431011652542373e-06, "loss": 1.2459, "mean_token_accuracy": 0.7120458260178566, "num_tokens": 3456891.0, "step": 4298 }, { "epoch": 1.138771186440678, "grad_norm": 1.507806658744812, "learning_rate": 9.4307468220339e-06, "loss": 1.1439, "mean_token_accuracy": 0.7295509278774261, "num_tokens": 3458338.0, "step": 4300 }, { "epoch": 1.1393008474576272, "grad_norm": 1.8586176633834839, "learning_rate": 9.430481991525425e-06, "loss": 1.4223, "mean_token_accuracy": 0.6733830273151398, "num_tokens": 3459866.0, "step": 4302 }, { "epoch": 1.1398305084745763, "grad_norm": 1.6553397178649902, "learning_rate": 9.43021716101695e-06, "loss": 1.5059, "mean_token_accuracy": 0.6754660904407501, "num_tokens": 3461663.0, "step": 4304 }, { "epoch": 1.1403601694915255, "grad_norm": 1.7707886695861816, "learning_rate": 9.429952330508475e-06, "loss": 1.0991, "mean_token_accuracy": 0.7145441137254238, "num_tokens": 3463244.0, "step": 4306 }, { "epoch": 1.1408898305084745, "grad_norm": 1.5074212551116943, "learning_rate": 9.429687500000001e-06, "loss": 1.4112, "mean_token_accuracy": 0.6629199460148811, "num_tokens": 3464830.0, "step": 4308 }, { "epoch": 1.1414194915254237, "grad_norm": 1.5684056282043457, "learning_rate": 9.429422669491526e-06, "loss": 1.4365, "mean_token_accuracy": 0.6948616057634354, "num_tokens": 3466387.0, "step": 4310 }, { "epoch": 1.1419491525423728, "grad_norm": 1.3194831609725952, "learning_rate": 9.429157838983051e-06, "loss": 1.4748, "mean_token_accuracy": 0.6606419272720814, "num_tokens": 3468451.0, "step": 4312 }, { "epoch": 1.142478813559322, "grad_norm": 2.0216872692108154, "learning_rate": 9.428893008474576e-06, "loss": 1.3895, "mean_token_accuracy": 0.6905333921313286, "num_tokens": 3469952.0, "step": 4314 }, { "epoch": 1.1430084745762712, "grad_norm": 2.331300973892212, "learning_rate": 9.428628177966103e-06, "loss": 1.3992, "mean_token_accuracy": 0.6693582497537136, "num_tokens": 3471527.0, "step": 4316 }, { "epoch": 1.1435381355932204, "grad_norm": 1.4687267541885376, "learning_rate": 9.428363347457628e-06, "loss": 1.2483, "mean_token_accuracy": 0.7334382012486458, "num_tokens": 3473046.0, "step": 4318 }, { "epoch": 1.1440677966101696, "grad_norm": 1.6340640783309937, "learning_rate": 9.428098516949153e-06, "loss": 1.4262, "mean_token_accuracy": 0.6774670407176018, "num_tokens": 3474742.0, "step": 4320 }, { "epoch": 1.1445974576271187, "grad_norm": 1.436049222946167, "learning_rate": 9.427833686440678e-06, "loss": 1.4635, "mean_token_accuracy": 0.6424362808465958, "num_tokens": 3476908.0, "step": 4322 }, { "epoch": 1.145127118644068, "grad_norm": 1.7247298955917358, "learning_rate": 9.427568855932204e-06, "loss": 1.3301, "mean_token_accuracy": 0.6762997210025787, "num_tokens": 3478537.0, "step": 4324 }, { "epoch": 1.1456567796610169, "grad_norm": 1.9538817405700684, "learning_rate": 9.427304025423729e-06, "loss": 1.2945, "mean_token_accuracy": 0.7089975923299789, "num_tokens": 3480032.0, "step": 4326 }, { "epoch": 1.146186440677966, "grad_norm": 1.3831042051315308, "learning_rate": 9.427039194915256e-06, "loss": 0.8652, "mean_token_accuracy": 0.7804298475384712, "num_tokens": 3481561.0, "step": 4328 }, { "epoch": 1.1467161016949152, "grad_norm": 1.5138301849365234, "learning_rate": 9.42677436440678e-06, "loss": 1.0667, "mean_token_accuracy": 0.7581221610307693, "num_tokens": 3483213.0, "step": 4330 }, { "epoch": 1.1472457627118644, "grad_norm": 2.2087550163269043, "learning_rate": 9.426509533898306e-06, "loss": 1.2203, "mean_token_accuracy": 0.7112221270799637, "num_tokens": 3484592.0, "step": 4332 }, { "epoch": 1.1477754237288136, "grad_norm": 2.0349197387695312, "learning_rate": 9.42624470338983e-06, "loss": 1.7343, "mean_token_accuracy": 0.6080141663551331, "num_tokens": 3486105.0, "step": 4334 }, { "epoch": 1.1483050847457628, "grad_norm": 1.6806331872940063, "learning_rate": 9.425979872881357e-06, "loss": 1.3625, "mean_token_accuracy": 0.6874671131372452, "num_tokens": 3487431.0, "step": 4336 }, { "epoch": 1.148834745762712, "grad_norm": 1.9504711627960205, "learning_rate": 9.425715042372882e-06, "loss": 1.6014, "mean_token_accuracy": 0.6466010212898254, "num_tokens": 3489120.0, "step": 4338 }, { "epoch": 1.149364406779661, "grad_norm": 1.5918633937835693, "learning_rate": 9.425450211864407e-06, "loss": 1.1474, "mean_token_accuracy": 0.7265563607215881, "num_tokens": 3490585.0, "step": 4340 }, { "epoch": 1.14989406779661, "grad_norm": 1.6183987855911255, "learning_rate": 9.425185381355934e-06, "loss": 1.6451, "mean_token_accuracy": 0.6372014284133911, "num_tokens": 3492248.0, "step": 4342 }, { "epoch": 1.1504237288135593, "grad_norm": 1.923251748085022, "learning_rate": 9.424920550847459e-06, "loss": 1.8292, "mean_token_accuracy": 0.6093785800039768, "num_tokens": 3493792.0, "step": 4344 }, { "epoch": 1.1509533898305084, "grad_norm": 1.5416107177734375, "learning_rate": 9.424655720338985e-06, "loss": 1.3686, "mean_token_accuracy": 0.6853300929069519, "num_tokens": 3495623.0, "step": 4346 }, { "epoch": 1.1514830508474576, "grad_norm": 2.24245548248291, "learning_rate": 9.424390889830508e-06, "loss": 1.3074, "mean_token_accuracy": 0.6893531084060669, "num_tokens": 3497067.0, "step": 4348 }, { "epoch": 1.1520127118644068, "grad_norm": 1.5484883785247803, "learning_rate": 9.424126059322035e-06, "loss": 1.3603, "mean_token_accuracy": 0.6775679513812065, "num_tokens": 3498806.0, "step": 4350 }, { "epoch": 1.152542372881356, "grad_norm": 2.5929114818573, "learning_rate": 9.42386122881356e-06, "loss": 1.5463, "mean_token_accuracy": 0.6684442907571793, "num_tokens": 3500405.0, "step": 4352 }, { "epoch": 1.1530720338983051, "grad_norm": 1.4588154554367065, "learning_rate": 9.423596398305087e-06, "loss": 1.129, "mean_token_accuracy": 0.740037627518177, "num_tokens": 3501896.0, "step": 4354 }, { "epoch": 1.1536016949152543, "grad_norm": 1.371208906173706, "learning_rate": 9.423331567796612e-06, "loss": 1.0359, "mean_token_accuracy": 0.7506191581487656, "num_tokens": 3503719.0, "step": 4356 }, { "epoch": 1.1541313559322033, "grad_norm": 1.6796295642852783, "learning_rate": 9.423066737288136e-06, "loss": 1.3557, "mean_token_accuracy": 0.7121307104825974, "num_tokens": 3505491.0, "step": 4358 }, { "epoch": 1.1546610169491525, "grad_norm": 1.7672762870788574, "learning_rate": 9.422801906779661e-06, "loss": 1.2671, "mean_token_accuracy": 0.7017433568835258, "num_tokens": 3507210.0, "step": 4360 }, { "epoch": 1.1551906779661016, "grad_norm": 2.0713417530059814, "learning_rate": 9.422537076271188e-06, "loss": 1.5089, "mean_token_accuracy": 0.6644826680421829, "num_tokens": 3508731.0, "step": 4362 }, { "epoch": 1.1557203389830508, "grad_norm": 2.058028221130371, "learning_rate": 9.422272245762713e-06, "loss": 1.4218, "mean_token_accuracy": 0.6566231846809387, "num_tokens": 3510222.0, "step": 4364 }, { "epoch": 1.15625, "grad_norm": 1.4605640172958374, "learning_rate": 9.422007415254238e-06, "loss": 1.5336, "mean_token_accuracy": 0.6557793840765953, "num_tokens": 3512084.0, "step": 4366 }, { "epoch": 1.1567796610169492, "grad_norm": 1.6950874328613281, "learning_rate": 9.421742584745763e-06, "loss": 1.3865, "mean_token_accuracy": 0.6702639237046242, "num_tokens": 3513602.0, "step": 4368 }, { "epoch": 1.1573093220338984, "grad_norm": 1.2288074493408203, "learning_rate": 9.42147775423729e-06, "loss": 1.2781, "mean_token_accuracy": 0.709101565182209, "num_tokens": 3515538.0, "step": 4370 }, { "epoch": 1.1578389830508475, "grad_norm": 1.7566194534301758, "learning_rate": 9.421212923728814e-06, "loss": 1.1141, "mean_token_accuracy": 0.7407788708806038, "num_tokens": 3516907.0, "step": 4372 }, { "epoch": 1.1583686440677967, "grad_norm": 1.3651974201202393, "learning_rate": 9.42094809322034e-06, "loss": 1.192, "mean_token_accuracy": 0.7174343541264534, "num_tokens": 3518754.0, "step": 4374 }, { "epoch": 1.1588983050847457, "grad_norm": 1.5943793058395386, "learning_rate": 9.420683262711864e-06, "loss": 1.2725, "mean_token_accuracy": 0.6945805177092552, "num_tokens": 3520297.0, "step": 4376 }, { "epoch": 1.1594279661016949, "grad_norm": 1.5669392347335815, "learning_rate": 9.42041843220339e-06, "loss": 1.257, "mean_token_accuracy": 0.7196692004799843, "num_tokens": 3521877.0, "step": 4378 }, { "epoch": 1.159957627118644, "grad_norm": 1.6777794361114502, "learning_rate": 9.420153601694916e-06, "loss": 1.239, "mean_token_accuracy": 0.714773952960968, "num_tokens": 3523408.0, "step": 4380 }, { "epoch": 1.1604872881355932, "grad_norm": 1.401494026184082, "learning_rate": 9.419888771186442e-06, "loss": 1.3255, "mean_token_accuracy": 0.7054887637495995, "num_tokens": 3525011.0, "step": 4382 }, { "epoch": 1.1610169491525424, "grad_norm": 1.8293449878692627, "learning_rate": 9.419623940677967e-06, "loss": 1.1961, "mean_token_accuracy": 0.738480843603611, "num_tokens": 3526389.0, "step": 4384 }, { "epoch": 1.1615466101694916, "grad_norm": 1.7162039279937744, "learning_rate": 9.419359110169492e-06, "loss": 1.5062, "mean_token_accuracy": 0.6752182170748711, "num_tokens": 3528022.0, "step": 4386 }, { "epoch": 1.1620762711864407, "grad_norm": 1.7228713035583496, "learning_rate": 9.419094279661017e-06, "loss": 1.6916, "mean_token_accuracy": 0.6423430144786835, "num_tokens": 3529418.0, "step": 4388 }, { "epoch": 1.16260593220339, "grad_norm": 1.3638322353363037, "learning_rate": 9.418829449152544e-06, "loss": 1.2296, "mean_token_accuracy": 0.7050367668271065, "num_tokens": 3531025.0, "step": 4390 }, { "epoch": 1.163135593220339, "grad_norm": 1.7002993822097778, "learning_rate": 9.418564618644069e-06, "loss": 1.0571, "mean_token_accuracy": 0.7470248192548752, "num_tokens": 3532550.0, "step": 4392 }, { "epoch": 1.163665254237288, "grad_norm": 1.481374979019165, "learning_rate": 9.418299788135594e-06, "loss": 1.3486, "mean_token_accuracy": 0.6841355860233307, "num_tokens": 3534950.0, "step": 4394 }, { "epoch": 1.1641949152542372, "grad_norm": 1.7862327098846436, "learning_rate": 9.418034957627119e-06, "loss": 1.5407, "mean_token_accuracy": 0.6939868666231632, "num_tokens": 3536499.0, "step": 4396 }, { "epoch": 1.1647245762711864, "grad_norm": 1.9071406126022339, "learning_rate": 9.417770127118645e-06, "loss": 1.2077, "mean_token_accuracy": 0.723575048148632, "num_tokens": 3537925.0, "step": 4398 }, { "epoch": 1.1652542372881356, "grad_norm": 2.076524496078491, "learning_rate": 9.41750529661017e-06, "loss": 1.4633, "mean_token_accuracy": 0.6653244271874428, "num_tokens": 3539316.0, "step": 4400 }, { "epoch": 1.1657838983050848, "grad_norm": 1.4475661516189575, "learning_rate": 9.417240466101695e-06, "loss": 0.9748, "mean_token_accuracy": 0.7549154236912727, "num_tokens": 3541106.0, "step": 4402 }, { "epoch": 1.166313559322034, "grad_norm": 1.6471065282821655, "learning_rate": 9.41697563559322e-06, "loss": 1.304, "mean_token_accuracy": 0.686003290116787, "num_tokens": 3542880.0, "step": 4404 }, { "epoch": 1.1668432203389831, "grad_norm": 1.9251751899719238, "learning_rate": 9.416710805084747e-06, "loss": 1.6449, "mean_token_accuracy": 0.6414305418729782, "num_tokens": 3544188.0, "step": 4406 }, { "epoch": 1.167372881355932, "grad_norm": 1.5125733613967896, "learning_rate": 9.416445974576272e-06, "loss": 1.0588, "mean_token_accuracy": 0.7383086085319519, "num_tokens": 3545732.0, "step": 4408 }, { "epoch": 1.1679025423728813, "grad_norm": 1.689921259880066, "learning_rate": 9.416181144067798e-06, "loss": 1.3384, "mean_token_accuracy": 0.6981384977698326, "num_tokens": 3547512.0, "step": 4410 }, { "epoch": 1.1684322033898304, "grad_norm": 1.3924652338027954, "learning_rate": 9.415916313559323e-06, "loss": 1.2587, "mean_token_accuracy": 0.7125213444232941, "num_tokens": 3549106.0, "step": 4412 }, { "epoch": 1.1689618644067796, "grad_norm": 1.682410478591919, "learning_rate": 9.415651483050848e-06, "loss": 1.2396, "mean_token_accuracy": 0.6964191421866417, "num_tokens": 3550578.0, "step": 4414 }, { "epoch": 1.1694915254237288, "grad_norm": 1.9062812328338623, "learning_rate": 9.415386652542373e-06, "loss": 1.3804, "mean_token_accuracy": 0.6928097754716873, "num_tokens": 3552029.0, "step": 4416 }, { "epoch": 1.170021186440678, "grad_norm": 1.9967868328094482, "learning_rate": 9.4151218220339e-06, "loss": 1.4122, "mean_token_accuracy": 0.688290186226368, "num_tokens": 3553937.0, "step": 4418 }, { "epoch": 1.1705508474576272, "grad_norm": 1.791914939880371, "learning_rate": 9.414856991525424e-06, "loss": 1.2772, "mean_token_accuracy": 0.7119023725390434, "num_tokens": 3555317.0, "step": 4420 }, { "epoch": 1.1710805084745763, "grad_norm": 1.4933584928512573, "learning_rate": 9.41459216101695e-06, "loss": 1.189, "mean_token_accuracy": 0.7182196900248528, "num_tokens": 3556880.0, "step": 4422 }, { "epoch": 1.1716101694915255, "grad_norm": 1.7118744850158691, "learning_rate": 9.414327330508474e-06, "loss": 1.3491, "mean_token_accuracy": 0.6921676769852638, "num_tokens": 3558560.0, "step": 4424 }, { "epoch": 1.1721398305084745, "grad_norm": 1.8351566791534424, "learning_rate": 9.414062500000001e-06, "loss": 1.2539, "mean_token_accuracy": 0.709717683494091, "num_tokens": 3560221.0, "step": 4426 }, { "epoch": 1.1726694915254237, "grad_norm": 1.2469650506973267, "learning_rate": 9.413797669491526e-06, "loss": 1.1789, "mean_token_accuracy": 0.7364673763513565, "num_tokens": 3561701.0, "step": 4428 }, { "epoch": 1.1731991525423728, "grad_norm": 1.6537299156188965, "learning_rate": 9.41353283898305e-06, "loss": 1.5883, "mean_token_accuracy": 0.679169949144125, "num_tokens": 3563528.0, "step": 4430 }, { "epoch": 1.173728813559322, "grad_norm": 1.7423031330108643, "learning_rate": 9.413268008474577e-06, "loss": 1.6516, "mean_token_accuracy": 0.6475965678691864, "num_tokens": 3565197.0, "step": 4432 }, { "epoch": 1.1742584745762712, "grad_norm": 1.3691834211349487, "learning_rate": 9.413003177966102e-06, "loss": 1.2396, "mean_token_accuracy": 0.7187769450247288, "num_tokens": 3566830.0, "step": 4434 }, { "epoch": 1.1747881355932204, "grad_norm": 1.5150158405303955, "learning_rate": 9.412738347457629e-06, "loss": 0.9549, "mean_token_accuracy": 0.755423329770565, "num_tokens": 3568389.0, "step": 4436 }, { "epoch": 1.1753177966101696, "grad_norm": 1.6759257316589355, "learning_rate": 9.412473516949154e-06, "loss": 1.1818, "mean_token_accuracy": 0.7115293070673943, "num_tokens": 3569879.0, "step": 4438 }, { "epoch": 1.1758474576271187, "grad_norm": 1.380631446838379, "learning_rate": 9.412208686440679e-06, "loss": 1.1669, "mean_token_accuracy": 0.715406134724617, "num_tokens": 3571429.0, "step": 4440 }, { "epoch": 1.176377118644068, "grad_norm": 2.0201244354248047, "learning_rate": 9.411943855932204e-06, "loss": 1.1422, "mean_token_accuracy": 0.7386274561285973, "num_tokens": 3572687.0, "step": 4442 }, { "epoch": 1.1769067796610169, "grad_norm": 1.80829918384552, "learning_rate": 9.41167902542373e-06, "loss": 1.3219, "mean_token_accuracy": 0.6877870038151741, "num_tokens": 3574361.0, "step": 4444 }, { "epoch": 1.177436440677966, "grad_norm": 1.9276100397109985, "learning_rate": 9.411414194915255e-06, "loss": 1.2234, "mean_token_accuracy": 0.7490042299032211, "num_tokens": 3575989.0, "step": 4446 }, { "epoch": 1.1779661016949152, "grad_norm": 1.684887409210205, "learning_rate": 9.41114936440678e-06, "loss": 1.0385, "mean_token_accuracy": 0.7372517436742783, "num_tokens": 3577330.0, "step": 4448 }, { "epoch": 1.1784957627118644, "grad_norm": 1.8063478469848633, "learning_rate": 9.410884533898305e-06, "loss": 1.6187, "mean_token_accuracy": 0.6447745561599731, "num_tokens": 3578868.0, "step": 4450 }, { "epoch": 1.1790254237288136, "grad_norm": 1.7467923164367676, "learning_rate": 9.410619703389832e-06, "loss": 1.6108, "mean_token_accuracy": 0.640262246131897, "num_tokens": 3580582.0, "step": 4452 }, { "epoch": 1.1795550847457628, "grad_norm": 1.3807157278060913, "learning_rate": 9.410354872881357e-06, "loss": 1.2623, "mean_token_accuracy": 0.7034001871943474, "num_tokens": 3582258.0, "step": 4454 }, { "epoch": 1.180084745762712, "grad_norm": 1.1489726305007935, "learning_rate": 9.410090042372882e-06, "loss": 1.2687, "mean_token_accuracy": 0.7078306898474693, "num_tokens": 3584118.0, "step": 4456 }, { "epoch": 1.180614406779661, "grad_norm": 1.5381468534469604, "learning_rate": 9.409825211864407e-06, "loss": 1.4192, "mean_token_accuracy": 0.6814148239791393, "num_tokens": 3585781.0, "step": 4458 }, { "epoch": 1.18114406779661, "grad_norm": 1.306014060974121, "learning_rate": 9.409560381355933e-06, "loss": 1.1966, "mean_token_accuracy": 0.7047486007213593, "num_tokens": 3587516.0, "step": 4460 }, { "epoch": 1.1816737288135593, "grad_norm": 1.4655777215957642, "learning_rate": 9.409295550847458e-06, "loss": 1.149, "mean_token_accuracy": 0.7185935601592064, "num_tokens": 3589355.0, "step": 4462 }, { "epoch": 1.1822033898305084, "grad_norm": 1.463458776473999, "learning_rate": 9.409030720338985e-06, "loss": 1.0883, "mean_token_accuracy": 0.7471800819039345, "num_tokens": 3591139.0, "step": 4464 }, { "epoch": 1.1827330508474576, "grad_norm": 2.006808042526245, "learning_rate": 9.40876588983051e-06, "loss": 1.5133, "mean_token_accuracy": 0.6545592844486237, "num_tokens": 3592625.0, "step": 4466 }, { "epoch": 1.1832627118644068, "grad_norm": 2.1131043434143066, "learning_rate": 9.408501059322035e-06, "loss": 1.6766, "mean_token_accuracy": 0.6295658573508263, "num_tokens": 3594030.0, "step": 4468 }, { "epoch": 1.183792372881356, "grad_norm": 1.5307955741882324, "learning_rate": 9.40823622881356e-06, "loss": 1.4413, "mean_token_accuracy": 0.6685726307332516, "num_tokens": 3595473.0, "step": 4470 }, { "epoch": 1.1843220338983051, "grad_norm": 2.0728111267089844, "learning_rate": 9.407971398305086e-06, "loss": 1.759, "mean_token_accuracy": 0.620398111641407, "num_tokens": 3597025.0, "step": 4472 }, { "epoch": 1.1848516949152543, "grad_norm": 2.064042329788208, "learning_rate": 9.407706567796611e-06, "loss": 1.3924, "mean_token_accuracy": 0.693778682500124, "num_tokens": 3598522.0, "step": 4474 }, { "epoch": 1.1853813559322033, "grad_norm": 1.5677485466003418, "learning_rate": 9.407441737288136e-06, "loss": 1.1847, "mean_token_accuracy": 0.7168743535876274, "num_tokens": 3600376.0, "step": 4476 }, { "epoch": 1.1859110169491525, "grad_norm": 1.7884845733642578, "learning_rate": 9.407176906779661e-06, "loss": 1.2494, "mean_token_accuracy": 0.7211213335394859, "num_tokens": 3601717.0, "step": 4478 }, { "epoch": 1.1864406779661016, "grad_norm": 1.5976157188415527, "learning_rate": 9.406912076271188e-06, "loss": 1.1384, "mean_token_accuracy": 0.7333137169480324, "num_tokens": 3603410.0, "step": 4480 }, { "epoch": 1.1869703389830508, "grad_norm": 1.5906341075897217, "learning_rate": 9.406647245762713e-06, "loss": 0.9276, "mean_token_accuracy": 0.7709785476326942, "num_tokens": 3604954.0, "step": 4482 }, { "epoch": 1.1875, "grad_norm": 1.3901805877685547, "learning_rate": 9.406382415254237e-06, "loss": 1.2494, "mean_token_accuracy": 0.6997330337762833, "num_tokens": 3606588.0, "step": 4484 }, { "epoch": 1.1880296610169492, "grad_norm": 1.6446770429611206, "learning_rate": 9.406117584745762e-06, "loss": 1.3552, "mean_token_accuracy": 0.6858402490615845, "num_tokens": 3608267.0, "step": 4486 }, { "epoch": 1.1885593220338984, "grad_norm": 2.2111058235168457, "learning_rate": 9.405852754237289e-06, "loss": 1.4554, "mean_token_accuracy": 0.6785369329154491, "num_tokens": 3609859.0, "step": 4488 }, { "epoch": 1.1890889830508475, "grad_norm": 1.803645372390747, "learning_rate": 9.405587923728814e-06, "loss": 1.4474, "mean_token_accuracy": 0.721658281981945, "num_tokens": 3611491.0, "step": 4490 }, { "epoch": 1.1896186440677967, "grad_norm": 1.8578492403030396, "learning_rate": 9.40532309322034e-06, "loss": 1.246, "mean_token_accuracy": 0.7116959095001221, "num_tokens": 3613022.0, "step": 4492 }, { "epoch": 1.1901483050847457, "grad_norm": 2.022718906402588, "learning_rate": 9.405058262711866e-06, "loss": 1.7851, "mean_token_accuracy": 0.614819910377264, "num_tokens": 3614637.0, "step": 4494 }, { "epoch": 1.1906779661016949, "grad_norm": 2.062028169631958, "learning_rate": 9.40479343220339e-06, "loss": 1.3917, "mean_token_accuracy": 0.7012578472495079, "num_tokens": 3616000.0, "step": 4496 }, { "epoch": 1.191207627118644, "grad_norm": 1.681252360343933, "learning_rate": 9.404528601694915e-06, "loss": 1.2027, "mean_token_accuracy": 0.7201771922409534, "num_tokens": 3617664.0, "step": 4498 }, { "epoch": 1.1917372881355932, "grad_norm": 1.0869721174240112, "learning_rate": 9.404263771186442e-06, "loss": 1.1785, "step": 4500 }, { "epoch": 1.1917372881355932, "eval_loss": 1.3221783638000488, "eval_mean_token_accuracy": 0.6991710070665781, "eval_num_tokens": 3619218.0, "eval_runtime": 48.2978, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 4500 }, { "epoch": 1.1922669491525424, "grad_norm": 1.4048902988433838, "learning_rate": 9.403998940677967e-06, "loss": 1.394, "mean_token_accuracy": 0.7079466171562672, "num_tokens": 3621239.0, "step": 4502 }, { "epoch": 1.1927966101694916, "grad_norm": 1.872826337814331, "learning_rate": 9.403734110169492e-06, "loss": 1.4021, "mean_token_accuracy": 0.6785512566566467, "num_tokens": 3622754.0, "step": 4504 }, { "epoch": 1.1933262711864407, "grad_norm": 1.7091675996780396, "learning_rate": 9.403469279661017e-06, "loss": 1.0926, "mean_token_accuracy": 0.7441489174962044, "num_tokens": 3624338.0, "step": 4506 }, { "epoch": 1.19385593220339, "grad_norm": 1.8151358366012573, "learning_rate": 9.403204449152543e-06, "loss": 1.3177, "mean_token_accuracy": 0.7028627917170525, "num_tokens": 3625943.0, "step": 4508 }, { "epoch": 1.194385593220339, "grad_norm": 1.8153772354125977, "learning_rate": 9.402939618644068e-06, "loss": 1.5912, "mean_token_accuracy": 0.6666401289403439, "num_tokens": 3627550.0, "step": 4510 }, { "epoch": 1.194915254237288, "grad_norm": 1.757859706878662, "learning_rate": 9.402674788135593e-06, "loss": 1.2917, "mean_token_accuracy": 0.7056719958782196, "num_tokens": 3628926.0, "step": 4512 }, { "epoch": 1.1954449152542372, "grad_norm": 1.6716604232788086, "learning_rate": 9.40240995762712e-06, "loss": 1.1493, "mean_token_accuracy": 0.7152896896004677, "num_tokens": 3630702.0, "step": 4514 }, { "epoch": 1.1959745762711864, "grad_norm": 1.9566929340362549, "learning_rate": 9.402145127118645e-06, "loss": 1.3515, "mean_token_accuracy": 0.6992719545960426, "num_tokens": 3632246.0, "step": 4516 }, { "epoch": 1.1965042372881356, "grad_norm": 1.506223201751709, "learning_rate": 9.401880296610171e-06, "loss": 1.103, "mean_token_accuracy": 0.753349956125021, "num_tokens": 3633964.0, "step": 4518 }, { "epoch": 1.1970338983050848, "grad_norm": 1.6277458667755127, "learning_rate": 9.401615466101696e-06, "loss": 1.2848, "mean_token_accuracy": 0.7265530675649643, "num_tokens": 3635809.0, "step": 4520 }, { "epoch": 1.197563559322034, "grad_norm": 1.7670010328292847, "learning_rate": 9.401350635593221e-06, "loss": 1.3784, "mean_token_accuracy": 0.6889050491154194, "num_tokens": 3637204.0, "step": 4522 }, { "epoch": 1.1980932203389831, "grad_norm": 1.7453501224517822, "learning_rate": 9.401085805084746e-06, "loss": 1.6142, "mean_token_accuracy": 0.6668515652418137, "num_tokens": 3639045.0, "step": 4524 }, { "epoch": 1.198622881355932, "grad_norm": 1.4922480583190918, "learning_rate": 9.400820974576273e-06, "loss": 0.851, "mean_token_accuracy": 0.7915762886404991, "num_tokens": 3640679.0, "step": 4526 }, { "epoch": 1.1991525423728813, "grad_norm": 1.6788710355758667, "learning_rate": 9.400556144067798e-06, "loss": 1.2621, "mean_token_accuracy": 0.6964337825775146, "num_tokens": 3642195.0, "step": 4528 }, { "epoch": 1.1996822033898304, "grad_norm": 1.8747456073760986, "learning_rate": 9.400291313559323e-06, "loss": 1.449, "mean_token_accuracy": 0.6606403365731239, "num_tokens": 3643929.0, "step": 4530 }, { "epoch": 1.2002118644067796, "grad_norm": 1.955997109413147, "learning_rate": 9.400026483050848e-06, "loss": 1.144, "mean_token_accuracy": 0.7465526536107063, "num_tokens": 3645525.0, "step": 4532 }, { "epoch": 1.2007415254237288, "grad_norm": 1.7342827320098877, "learning_rate": 9.399761652542374e-06, "loss": 1.1536, "mean_token_accuracy": 0.7356675267219543, "num_tokens": 3647118.0, "step": 4534 }, { "epoch": 1.201271186440678, "grad_norm": 1.4553449153900146, "learning_rate": 9.3994968220339e-06, "loss": 1.3714, "mean_token_accuracy": 0.6938425153493881, "num_tokens": 3648923.0, "step": 4536 }, { "epoch": 1.2018008474576272, "grad_norm": 1.662235140800476, "learning_rate": 9.399231991525424e-06, "loss": 1.0806, "mean_token_accuracy": 0.7439693473279476, "num_tokens": 3650263.0, "step": 4538 }, { "epoch": 1.2023305084745763, "grad_norm": 1.6650927066802979, "learning_rate": 9.398967161016949e-06, "loss": 1.5508, "mean_token_accuracy": 0.6170993819832802, "num_tokens": 3652162.0, "step": 4540 }, { "epoch": 1.2028601694915255, "grad_norm": 1.9002721309661865, "learning_rate": 9.398702330508476e-06, "loss": 1.4057, "mean_token_accuracy": 0.6881931386888027, "num_tokens": 3653807.0, "step": 4542 }, { "epoch": 1.2033898305084745, "grad_norm": 1.4578807353973389, "learning_rate": 9.3984375e-06, "loss": 0.9677, "mean_token_accuracy": 0.7500269040465355, "num_tokens": 3655687.0, "step": 4544 }, { "epoch": 1.2039194915254237, "grad_norm": 1.785996437072754, "learning_rate": 9.398172669491527e-06, "loss": 1.1541, "mean_token_accuracy": 0.7351472228765488, "num_tokens": 3657216.0, "step": 4546 }, { "epoch": 1.2044491525423728, "grad_norm": 1.3344988822937012, "learning_rate": 9.397907838983052e-06, "loss": 1.1271, "mean_token_accuracy": 0.7222693562507629, "num_tokens": 3659018.0, "step": 4548 }, { "epoch": 1.204978813559322, "grad_norm": 1.9524190425872803, "learning_rate": 9.397643008474577e-06, "loss": 1.7606, "mean_token_accuracy": 0.6279517114162445, "num_tokens": 3660987.0, "step": 4550 }, { "epoch": 1.2055084745762712, "grad_norm": 1.5512887239456177, "learning_rate": 9.397378177966102e-06, "loss": 1.3009, "mean_token_accuracy": 0.681937899440527, "num_tokens": 3662677.0, "step": 4552 }, { "epoch": 1.2060381355932204, "grad_norm": 1.439541220664978, "learning_rate": 9.397113347457629e-06, "loss": 1.0521, "mean_token_accuracy": 0.7314633727073669, "num_tokens": 3664472.0, "step": 4554 }, { "epoch": 1.2065677966101696, "grad_norm": 2.1592955589294434, "learning_rate": 9.396848516949154e-06, "loss": 1.5472, "mean_token_accuracy": 0.6500355191528797, "num_tokens": 3666189.0, "step": 4556 }, { "epoch": 1.2070974576271187, "grad_norm": 1.5616687536239624, "learning_rate": 9.396583686440678e-06, "loss": 1.2744, "mean_token_accuracy": 0.6997528113424778, "num_tokens": 3668048.0, "step": 4558 }, { "epoch": 1.207627118644068, "grad_norm": 1.7323014736175537, "learning_rate": 9.396318855932203e-06, "loss": 1.3044, "mean_token_accuracy": 0.6970017477869987, "num_tokens": 3669620.0, "step": 4560 }, { "epoch": 1.2081567796610169, "grad_norm": 2.0276620388031006, "learning_rate": 9.39605402542373e-06, "loss": 1.6085, "mean_token_accuracy": 0.6434165462851524, "num_tokens": 3671165.0, "step": 4562 }, { "epoch": 1.208686440677966, "grad_norm": 1.7745448350906372, "learning_rate": 9.395789194915255e-06, "loss": 1.3931, "mean_token_accuracy": 0.7047421485185623, "num_tokens": 3672745.0, "step": 4564 }, { "epoch": 1.2092161016949152, "grad_norm": 1.585818886756897, "learning_rate": 9.39552436440678e-06, "loss": 1.2613, "mean_token_accuracy": 0.6969896256923676, "num_tokens": 3674205.0, "step": 4566 }, { "epoch": 1.2097457627118644, "grad_norm": 1.2867481708526611, "learning_rate": 9.395259533898305e-06, "loss": 1.3835, "mean_token_accuracy": 0.6757083237171173, "num_tokens": 3675680.0, "step": 4568 }, { "epoch": 1.2102754237288136, "grad_norm": 1.4804071187973022, "learning_rate": 9.394994703389831e-06, "loss": 1.1699, "mean_token_accuracy": 0.7195888608694077, "num_tokens": 3677275.0, "step": 4570 }, { "epoch": 1.2108050847457628, "grad_norm": 1.144824743270874, "learning_rate": 9.394729872881356e-06, "loss": 1.2507, "mean_token_accuracy": 0.7223557308316231, "num_tokens": 3678908.0, "step": 4572 }, { "epoch": 1.211334745762712, "grad_norm": 2.0029380321502686, "learning_rate": 9.394465042372883e-06, "loss": 1.4349, "mean_token_accuracy": 0.6732048839330673, "num_tokens": 3680741.0, "step": 4574 }, { "epoch": 1.211864406779661, "grad_norm": 1.5397140979766846, "learning_rate": 9.394200211864408e-06, "loss": 1.1395, "mean_token_accuracy": 0.7253315672278404, "num_tokens": 3682068.0, "step": 4576 }, { "epoch": 1.21239406779661, "grad_norm": 1.7821968793869019, "learning_rate": 9.393935381355933e-06, "loss": 1.1595, "mean_token_accuracy": 0.7259391993284225, "num_tokens": 3683405.0, "step": 4578 }, { "epoch": 1.2129237288135593, "grad_norm": 1.7858561277389526, "learning_rate": 9.393670550847458e-06, "loss": 1.289, "mean_token_accuracy": 0.7349854558706284, "num_tokens": 3685018.0, "step": 4580 }, { "epoch": 1.2134533898305084, "grad_norm": 1.8516303300857544, "learning_rate": 9.393405720338984e-06, "loss": 1.7546, "mean_token_accuracy": 0.6228065118193626, "num_tokens": 3686686.0, "step": 4582 }, { "epoch": 1.2139830508474576, "grad_norm": 1.496623158454895, "learning_rate": 9.39314088983051e-06, "loss": 1.0596, "mean_token_accuracy": 0.7594255581498146, "num_tokens": 3688396.0, "step": 4584 }, { "epoch": 1.2145127118644068, "grad_norm": 2.2693891525268555, "learning_rate": 9.392876059322034e-06, "loss": 1.4456, "mean_token_accuracy": 0.6767318807542324, "num_tokens": 3689760.0, "step": 4586 }, { "epoch": 1.215042372881356, "grad_norm": 1.5563071966171265, "learning_rate": 9.39261122881356e-06, "loss": 1.155, "mean_token_accuracy": 0.72643181681633, "num_tokens": 3691513.0, "step": 4588 }, { "epoch": 1.2155720338983051, "grad_norm": 1.2425684928894043, "learning_rate": 9.392346398305086e-06, "loss": 1.0311, "mean_token_accuracy": 0.7621593028306961, "num_tokens": 3693359.0, "step": 4590 }, { "epoch": 1.2161016949152543, "grad_norm": 1.4867194890975952, "learning_rate": 9.39208156779661e-06, "loss": 1.085, "mean_token_accuracy": 0.7628286182880402, "num_tokens": 3694877.0, "step": 4592 }, { "epoch": 1.2166313559322033, "grad_norm": 1.6669416427612305, "learning_rate": 9.391816737288136e-06, "loss": 1.1463, "mean_token_accuracy": 0.7319652214646339, "num_tokens": 3696533.0, "step": 4594 }, { "epoch": 1.2171610169491525, "grad_norm": 1.8434195518493652, "learning_rate": 9.391551906779662e-06, "loss": 1.2704, "mean_token_accuracy": 0.7018898651003838, "num_tokens": 3698190.0, "step": 4596 }, { "epoch": 1.2176906779661016, "grad_norm": 1.773190975189209, "learning_rate": 9.391287076271187e-06, "loss": 1.6974, "mean_token_accuracy": 0.6230596005916595, "num_tokens": 3699680.0, "step": 4598 }, { "epoch": 1.2182203389830508, "grad_norm": 1.593363881111145, "learning_rate": 9.391022245762714e-06, "loss": 1.5984, "mean_token_accuracy": 0.6621859669685364, "num_tokens": 3701271.0, "step": 4600 }, { "epoch": 1.21875, "grad_norm": 1.7520513534545898, "learning_rate": 9.390757415254239e-06, "loss": 1.492, "mean_token_accuracy": 0.6555139049887657, "num_tokens": 3702882.0, "step": 4602 }, { "epoch": 1.2192796610169492, "grad_norm": 1.778881549835205, "learning_rate": 9.390492584745764e-06, "loss": 1.2471, "mean_token_accuracy": 0.7324162200093269, "num_tokens": 3704269.0, "step": 4604 }, { "epoch": 1.2198093220338984, "grad_norm": 1.3522652387619019, "learning_rate": 9.390227754237289e-06, "loss": 0.877, "mean_token_accuracy": 0.7767473608255386, "num_tokens": 3705915.0, "step": 4606 }, { "epoch": 1.2203389830508475, "grad_norm": 1.6320126056671143, "learning_rate": 9.389962923728815e-06, "loss": 1.3034, "mean_token_accuracy": 0.6956126913428307, "num_tokens": 3707444.0, "step": 4608 }, { "epoch": 1.2208686440677967, "grad_norm": 1.730761170387268, "learning_rate": 9.38969809322034e-06, "loss": 1.4167, "mean_token_accuracy": 0.6629917770624161, "num_tokens": 3709013.0, "step": 4610 }, { "epoch": 1.2213983050847457, "grad_norm": 1.1295584440231323, "learning_rate": 9.389433262711865e-06, "loss": 1.1468, "mean_token_accuracy": 0.6638765931129456, "num_tokens": 3711361.0, "step": 4612 }, { "epoch": 1.2219279661016949, "grad_norm": 1.3422596454620361, "learning_rate": 9.38916843220339e-06, "loss": 1.2424, "mean_token_accuracy": 0.7273561097681522, "num_tokens": 3713215.0, "step": 4614 }, { "epoch": 1.222457627118644, "grad_norm": 1.7081602811813354, "learning_rate": 9.388903601694917e-06, "loss": 1.4467, "mean_token_accuracy": 0.6865152642130852, "num_tokens": 3714533.0, "step": 4616 }, { "epoch": 1.2229872881355932, "grad_norm": 1.6249052286148071, "learning_rate": 9.388638771186442e-06, "loss": 1.8594, "mean_token_accuracy": 0.614223588258028, "num_tokens": 3716327.0, "step": 4618 }, { "epoch": 1.2235169491525424, "grad_norm": 1.3956938982009888, "learning_rate": 9.388373940677967e-06, "loss": 1.2838, "mean_token_accuracy": 0.7145144790410995, "num_tokens": 3717827.0, "step": 4620 }, { "epoch": 1.2240466101694916, "grad_norm": 1.833509087562561, "learning_rate": 9.388109110169491e-06, "loss": 1.7537, "mean_token_accuracy": 0.6285811513662338, "num_tokens": 3719331.0, "step": 4622 }, { "epoch": 1.2245762711864407, "grad_norm": 1.442124605178833, "learning_rate": 9.387844279661018e-06, "loss": 1.1765, "mean_token_accuracy": 0.713114358484745, "num_tokens": 3721007.0, "step": 4624 }, { "epoch": 1.22510593220339, "grad_norm": 1.9040448665618896, "learning_rate": 9.387579449152543e-06, "loss": 1.2752, "mean_token_accuracy": 0.7097102925181389, "num_tokens": 3722490.0, "step": 4626 }, { "epoch": 1.225635593220339, "grad_norm": 1.687970519065857, "learning_rate": 9.38731461864407e-06, "loss": 1.2114, "mean_token_accuracy": 0.7141473665833473, "num_tokens": 3723838.0, "step": 4628 }, { "epoch": 1.226165254237288, "grad_norm": 1.4153292179107666, "learning_rate": 9.387049788135595e-06, "loss": 1.3417, "mean_token_accuracy": 0.6939060837030411, "num_tokens": 3725319.0, "step": 4630 }, { "epoch": 1.2266949152542372, "grad_norm": 1.4774161577224731, "learning_rate": 9.38678495762712e-06, "loss": 0.8882, "mean_token_accuracy": 0.7753674313426018, "num_tokens": 3726895.0, "step": 4632 }, { "epoch": 1.2272245762711864, "grad_norm": 1.355717658996582, "learning_rate": 9.386520127118644e-06, "loss": 1.2611, "mean_token_accuracy": 0.6983434855937958, "num_tokens": 3728803.0, "step": 4634 }, { "epoch": 1.2277542372881356, "grad_norm": 1.9537370204925537, "learning_rate": 9.386255296610171e-06, "loss": 1.1385, "mean_token_accuracy": 0.7635442614555359, "num_tokens": 3730418.0, "step": 4636 }, { "epoch": 1.2282838983050848, "grad_norm": 1.577960729598999, "learning_rate": 9.385990466101696e-06, "loss": 1.2378, "mean_token_accuracy": 0.7278143614530563, "num_tokens": 3731975.0, "step": 4638 }, { "epoch": 1.228813559322034, "grad_norm": 1.7192679643630981, "learning_rate": 9.385725635593221e-06, "loss": 1.1987, "mean_token_accuracy": 0.6986896991729736, "num_tokens": 3733625.0, "step": 4640 }, { "epoch": 1.2293432203389831, "grad_norm": 1.5445590019226074, "learning_rate": 9.385460805084746e-06, "loss": 1.1832, "mean_token_accuracy": 0.7238012626767159, "num_tokens": 3735233.0, "step": 4642 }, { "epoch": 1.229872881355932, "grad_norm": 1.541122317314148, "learning_rate": 9.385195974576272e-06, "loss": 1.3603, "mean_token_accuracy": 0.6751417219638824, "num_tokens": 3736990.0, "step": 4644 }, { "epoch": 1.2304025423728813, "grad_norm": 1.6468151807785034, "learning_rate": 9.384931144067797e-06, "loss": 1.2108, "mean_token_accuracy": 0.7294973582029343, "num_tokens": 3738447.0, "step": 4646 }, { "epoch": 1.2309322033898304, "grad_norm": 1.704986572265625, "learning_rate": 9.384666313559322e-06, "loss": 0.9336, "mean_token_accuracy": 0.7436250299215317, "num_tokens": 3740154.0, "step": 4648 }, { "epoch": 1.2314618644067796, "grad_norm": 1.5331096649169922, "learning_rate": 9.384401483050847e-06, "loss": 1.0479, "mean_token_accuracy": 0.7364800646901131, "num_tokens": 3741666.0, "step": 4650 }, { "epoch": 1.2319915254237288, "grad_norm": 1.8910503387451172, "learning_rate": 9.384136652542374e-06, "loss": 1.6506, "mean_token_accuracy": 0.65172129124403, "num_tokens": 3743298.0, "step": 4652 }, { "epoch": 1.232521186440678, "grad_norm": 2.1423556804656982, "learning_rate": 9.383871822033899e-06, "loss": 1.6363, "mean_token_accuracy": 0.6408377476036549, "num_tokens": 3744878.0, "step": 4654 }, { "epoch": 1.2330508474576272, "grad_norm": 1.6581653356552124, "learning_rate": 9.383606991525425e-06, "loss": 0.9564, "mean_token_accuracy": 0.7791768088936806, "num_tokens": 3746177.0, "step": 4656 }, { "epoch": 1.2335805084745763, "grad_norm": 1.873978614807129, "learning_rate": 9.383342161016949e-06, "loss": 1.0143, "mean_token_accuracy": 0.7558230832219124, "num_tokens": 3747598.0, "step": 4658 }, { "epoch": 1.2341101694915255, "grad_norm": 1.832001805305481, "learning_rate": 9.383077330508475e-06, "loss": 1.1, "mean_token_accuracy": 0.7537098228931427, "num_tokens": 3749093.0, "step": 4660 }, { "epoch": 1.2346398305084745, "grad_norm": 1.719555377960205, "learning_rate": 9.3828125e-06, "loss": 1.2312, "mean_token_accuracy": 0.7230864018201828, "num_tokens": 3750618.0, "step": 4662 }, { "epoch": 1.2351694915254237, "grad_norm": 1.2972139120101929, "learning_rate": 9.382547669491527e-06, "loss": 0.5974, "mean_token_accuracy": 0.8417573124170303, "num_tokens": 3752149.0, "step": 4664 }, { "epoch": 1.2356991525423728, "grad_norm": 1.4765398502349854, "learning_rate": 9.382282838983052e-06, "loss": 1.0747, "mean_token_accuracy": 0.7597423866391182, "num_tokens": 3754013.0, "step": 4666 }, { "epoch": 1.236228813559322, "grad_norm": 1.5933706760406494, "learning_rate": 9.382018008474577e-06, "loss": 1.3935, "mean_token_accuracy": 0.6763465031981468, "num_tokens": 3755584.0, "step": 4668 }, { "epoch": 1.2367584745762712, "grad_norm": 1.2719639539718628, "learning_rate": 9.381753177966102e-06, "loss": 1.0285, "mean_token_accuracy": 0.7721392884850502, "num_tokens": 3757345.0, "step": 4670 }, { "epoch": 1.2372881355932204, "grad_norm": 1.8928542137145996, "learning_rate": 9.381488347457628e-06, "loss": 1.5237, "mean_token_accuracy": 0.6685124933719635, "num_tokens": 3759343.0, "step": 4672 }, { "epoch": 1.2378177966101696, "grad_norm": 1.634047508239746, "learning_rate": 9.381223516949153e-06, "loss": 1.4273, "mean_token_accuracy": 0.668974444270134, "num_tokens": 3761331.0, "step": 4674 }, { "epoch": 1.2383474576271187, "grad_norm": 1.5540926456451416, "learning_rate": 9.380958686440678e-06, "loss": 1.498, "mean_token_accuracy": 0.6828362569212914, "num_tokens": 3762889.0, "step": 4676 }, { "epoch": 1.238877118644068, "grad_norm": 1.822209119796753, "learning_rate": 9.380693855932203e-06, "loss": 1.6162, "mean_token_accuracy": 0.6549526378512383, "num_tokens": 3764391.0, "step": 4678 }, { "epoch": 1.2394067796610169, "grad_norm": 1.697823166847229, "learning_rate": 9.38042902542373e-06, "loss": 1.5198, "mean_token_accuracy": 0.6630834005773067, "num_tokens": 3766063.0, "step": 4680 }, { "epoch": 1.239936440677966, "grad_norm": 1.5106450319290161, "learning_rate": 9.380164194915256e-06, "loss": 1.0628, "mean_token_accuracy": 0.7394632697105408, "num_tokens": 3767806.0, "step": 4682 }, { "epoch": 1.2404661016949152, "grad_norm": 1.7428756952285767, "learning_rate": 9.379899364406781e-06, "loss": 1.3826, "mean_token_accuracy": 0.6749868765473366, "num_tokens": 3769425.0, "step": 4684 }, { "epoch": 1.2409957627118644, "grad_norm": 1.5787421464920044, "learning_rate": 9.379634533898306e-06, "loss": 1.487, "mean_token_accuracy": 0.6846462935209274, "num_tokens": 3770899.0, "step": 4686 }, { "epoch": 1.2415254237288136, "grad_norm": 1.6526256799697876, "learning_rate": 9.379369703389831e-06, "loss": 1.4429, "mean_token_accuracy": 0.6617487594485283, "num_tokens": 3772302.0, "step": 4688 }, { "epoch": 1.2420550847457628, "grad_norm": 1.9687880277633667, "learning_rate": 9.379104872881358e-06, "loss": 1.1687, "mean_token_accuracy": 0.7200028300285339, "num_tokens": 3773485.0, "step": 4690 }, { "epoch": 1.242584745762712, "grad_norm": 1.5247135162353516, "learning_rate": 9.378840042372883e-06, "loss": 1.605, "mean_token_accuracy": 0.6240322664380074, "num_tokens": 3775232.0, "step": 4692 }, { "epoch": 1.243114406779661, "grad_norm": 1.6192810535430908, "learning_rate": 9.378575211864408e-06, "loss": 1.4377, "mean_token_accuracy": 0.6866476684808731, "num_tokens": 3776901.0, "step": 4694 }, { "epoch": 1.24364406779661, "grad_norm": 1.767253041267395, "learning_rate": 9.378310381355932e-06, "loss": 1.5737, "mean_token_accuracy": 0.659190371632576, "num_tokens": 3778632.0, "step": 4696 }, { "epoch": 1.2441737288135593, "grad_norm": 1.845502495765686, "learning_rate": 9.378045550847459e-06, "loss": 1.4573, "mean_token_accuracy": 0.6738687679171562, "num_tokens": 3780321.0, "step": 4698 }, { "epoch": 1.2447033898305084, "grad_norm": 1.926538109779358, "learning_rate": 9.377780720338984e-06, "loss": 1.3634, "mean_token_accuracy": 0.6937436535954475, "num_tokens": 3782018.0, "step": 4700 }, { "epoch": 1.2452330508474576, "grad_norm": 1.4478424787521362, "learning_rate": 9.377515889830509e-06, "loss": 1.0884, "mean_token_accuracy": 0.725159265100956, "num_tokens": 3783537.0, "step": 4702 }, { "epoch": 1.2457627118644068, "grad_norm": 1.59574294090271, "learning_rate": 9.377251059322034e-06, "loss": 1.198, "mean_token_accuracy": 0.7128728553652763, "num_tokens": 3785215.0, "step": 4704 }, { "epoch": 1.246292372881356, "grad_norm": 1.8303018808364868, "learning_rate": 9.37698622881356e-06, "loss": 1.5236, "mean_token_accuracy": 0.6635080501437187, "num_tokens": 3786833.0, "step": 4706 }, { "epoch": 1.2468220338983051, "grad_norm": 1.608482837677002, "learning_rate": 9.376721398305085e-06, "loss": 1.2868, "mean_token_accuracy": 0.710689090192318, "num_tokens": 3788521.0, "step": 4708 }, { "epoch": 1.2473516949152543, "grad_norm": 1.667968988418579, "learning_rate": 9.376456567796612e-06, "loss": 1.5279, "mean_token_accuracy": 0.687105767428875, "num_tokens": 3790251.0, "step": 4710 }, { "epoch": 1.2478813559322033, "grad_norm": 1.6980772018432617, "learning_rate": 9.376191737288135e-06, "loss": 1.408, "mean_token_accuracy": 0.6725558415055275, "num_tokens": 3791820.0, "step": 4712 }, { "epoch": 1.2484110169491525, "grad_norm": 1.6315395832061768, "learning_rate": 9.375926906779662e-06, "loss": 1.6684, "mean_token_accuracy": 0.661523126065731, "num_tokens": 3793593.0, "step": 4714 }, { "epoch": 1.2489406779661016, "grad_norm": 1.0877320766448975, "learning_rate": 9.375662076271187e-06, "loss": 0.795, "mean_token_accuracy": 0.7891872227191925, "num_tokens": 3795581.0, "step": 4716 }, { "epoch": 1.2494703389830508, "grad_norm": 2.021986722946167, "learning_rate": 9.375397245762713e-06, "loss": 1.5919, "mean_token_accuracy": 0.6408044993877411, "num_tokens": 3797114.0, "step": 4718 }, { "epoch": 1.25, "grad_norm": 1.5176076889038086, "learning_rate": 9.375132415254238e-06, "loss": 1.422, "mean_token_accuracy": 0.6856395937502384, "num_tokens": 3798936.0, "step": 4720 }, { "epoch": 1.2505296610169492, "grad_norm": 1.643115520477295, "learning_rate": 9.374867584745763e-06, "loss": 1.5366, "mean_token_accuracy": 0.6375995129346848, "num_tokens": 3800462.0, "step": 4722 }, { "epoch": 1.2510593220338984, "grad_norm": 1.7902356386184692, "learning_rate": 9.374602754237288e-06, "loss": 1.4634, "mean_token_accuracy": 0.6666616052389145, "num_tokens": 3802027.0, "step": 4724 }, { "epoch": 1.2515889830508475, "grad_norm": 1.1552149057388306, "learning_rate": 9.374337923728815e-06, "loss": 1.3013, "mean_token_accuracy": 0.6999457404017448, "num_tokens": 3803734.0, "step": 4726 }, { "epoch": 1.2521186440677967, "grad_norm": 1.49691903591156, "learning_rate": 9.37407309322034e-06, "loss": 1.2298, "mean_token_accuracy": 0.7110451012849808, "num_tokens": 3805260.0, "step": 4728 }, { "epoch": 1.2526483050847457, "grad_norm": 1.3857563734054565, "learning_rate": 9.373808262711865e-06, "loss": 0.8956, "mean_token_accuracy": 0.7805106118321419, "num_tokens": 3807232.0, "step": 4730 }, { "epoch": 1.2531779661016949, "grad_norm": 1.6513266563415527, "learning_rate": 9.37354343220339e-06, "loss": 1.1924, "mean_token_accuracy": 0.6897857636213303, "num_tokens": 3809406.0, "step": 4732 }, { "epoch": 1.253707627118644, "grad_norm": 1.3591341972351074, "learning_rate": 9.373278601694916e-06, "loss": 0.8615, "mean_token_accuracy": 0.7825687080621719, "num_tokens": 3811061.0, "step": 4734 }, { "epoch": 1.2542372881355932, "grad_norm": 1.58150315284729, "learning_rate": 9.373013771186441e-06, "loss": 1.0211, "mean_token_accuracy": 0.7309782728552818, "num_tokens": 3812670.0, "step": 4736 }, { "epoch": 1.2547669491525424, "grad_norm": 2.126899480819702, "learning_rate": 9.372748940677968e-06, "loss": 1.6979, "mean_token_accuracy": 0.6224904507398605, "num_tokens": 3814509.0, "step": 4738 }, { "epoch": 1.2552966101694916, "grad_norm": 1.8160672187805176, "learning_rate": 9.372484110169491e-06, "loss": 1.3708, "mean_token_accuracy": 0.6930701658129692, "num_tokens": 3815989.0, "step": 4740 }, { "epoch": 1.2558262711864407, "grad_norm": 1.2955256700515747, "learning_rate": 9.372219279661018e-06, "loss": 1.034, "mean_token_accuracy": 0.7602613940834999, "num_tokens": 3817546.0, "step": 4742 }, { "epoch": 1.2563559322033897, "grad_norm": 1.8663686513900757, "learning_rate": 9.371954449152543e-06, "loss": 1.642, "mean_token_accuracy": 0.6715313494205475, "num_tokens": 3819120.0, "step": 4744 }, { "epoch": 1.256885593220339, "grad_norm": 1.4611754417419434, "learning_rate": 9.37168961864407e-06, "loss": 1.1061, "mean_token_accuracy": 0.7497235015034676, "num_tokens": 3820617.0, "step": 4746 }, { "epoch": 1.257415254237288, "grad_norm": 2.4132814407348633, "learning_rate": 9.371424788135594e-06, "loss": 1.7868, "mean_token_accuracy": 0.6184623800218105, "num_tokens": 3822030.0, "step": 4748 }, { "epoch": 1.2579449152542372, "grad_norm": 1.9135475158691406, "learning_rate": 9.371159957627119e-06, "loss": 2.0231, "step": 4750 }, { "epoch": 1.2579449152542372, "eval_loss": 1.3221665620803833, "eval_mean_token_accuracy": 0.6992848267802945, "eval_num_tokens": 3823709.0, "eval_runtime": 48.2792, "eval_samples_per_second": 6.38, "eval_steps_per_second": 6.38, "step": 4750 }, { "epoch": 1.2584745762711864, "grad_norm": 1.5446497201919556, "learning_rate": 9.370895127118644e-06, "loss": 1.4505, "mean_token_accuracy": 0.6382901817560196, "num_tokens": 3825642.0, "step": 4752 }, { "epoch": 1.2590042372881356, "grad_norm": 1.7969162464141846, "learning_rate": 9.37063029661017e-06, "loss": 1.3471, "mean_token_accuracy": 0.6853828132152557, "num_tokens": 3827399.0, "step": 4754 }, { "epoch": 1.2595338983050848, "grad_norm": 1.7470711469650269, "learning_rate": 9.370365466101696e-06, "loss": 1.1613, "mean_token_accuracy": 0.7512533850967884, "num_tokens": 3828926.0, "step": 4756 }, { "epoch": 1.260063559322034, "grad_norm": 1.2984848022460938, "learning_rate": 9.37010063559322e-06, "loss": 1.232, "mean_token_accuracy": 0.7174400761723518, "num_tokens": 3830818.0, "step": 4758 }, { "epoch": 1.2605932203389831, "grad_norm": 1.6729748249053955, "learning_rate": 9.369835805084745e-06, "loss": 1.1269, "mean_token_accuracy": 0.7315908595919609, "num_tokens": 3832241.0, "step": 4760 }, { "epoch": 1.261122881355932, "grad_norm": 1.6387699842453003, "learning_rate": 9.369570974576272e-06, "loss": 1.2653, "mean_token_accuracy": 0.7052296958863735, "num_tokens": 3833903.0, "step": 4762 }, { "epoch": 1.2616525423728815, "grad_norm": 1.796339750289917, "learning_rate": 9.369306144067799e-06, "loss": 1.1473, "mean_token_accuracy": 0.7348689660429955, "num_tokens": 3835475.0, "step": 4764 }, { "epoch": 1.2621822033898304, "grad_norm": 1.772773027420044, "learning_rate": 9.369041313559322e-06, "loss": 1.5851, "mean_token_accuracy": 0.632440622895956, "num_tokens": 3837425.0, "step": 4766 }, { "epoch": 1.2627118644067796, "grad_norm": 1.8960976600646973, "learning_rate": 9.368776483050849e-06, "loss": 1.0314, "mean_token_accuracy": 0.7553114257752895, "num_tokens": 3838735.0, "step": 4768 }, { "epoch": 1.2632415254237288, "grad_norm": 1.6641011238098145, "learning_rate": 9.368511652542373e-06, "loss": 1.702, "mean_token_accuracy": 0.6248972453176975, "num_tokens": 3840556.0, "step": 4770 }, { "epoch": 1.263771186440678, "grad_norm": 1.69638192653656, "learning_rate": 9.3682468220339e-06, "loss": 1.4312, "mean_token_accuracy": 0.6698348447680473, "num_tokens": 3842296.0, "step": 4772 }, { "epoch": 1.2643008474576272, "grad_norm": 1.4952703714370728, "learning_rate": 9.367981991525425e-06, "loss": 1.3036, "mean_token_accuracy": 0.6861668080091476, "num_tokens": 3844061.0, "step": 4774 }, { "epoch": 1.2648305084745763, "grad_norm": 1.7732396125793457, "learning_rate": 9.36771716101695e-06, "loss": 1.0088, "mean_token_accuracy": 0.7676705569028854, "num_tokens": 3845267.0, "step": 4776 }, { "epoch": 1.2653601694915255, "grad_norm": 1.9399819374084473, "learning_rate": 9.367452330508475e-06, "loss": 1.385, "mean_token_accuracy": 0.6725183799862862, "num_tokens": 3846887.0, "step": 4778 }, { "epoch": 1.2658898305084745, "grad_norm": 1.7710437774658203, "learning_rate": 9.367187500000002e-06, "loss": 1.7358, "mean_token_accuracy": 0.6186798363924026, "num_tokens": 3848394.0, "step": 4780 }, { "epoch": 1.2664194915254237, "grad_norm": 1.6217061281204224, "learning_rate": 9.366922669491526e-06, "loss": 1.7847, "mean_token_accuracy": 0.6291686818003654, "num_tokens": 3850033.0, "step": 4782 }, { "epoch": 1.2669491525423728, "grad_norm": 2.019251823425293, "learning_rate": 9.366657838983051e-06, "loss": 1.727, "mean_token_accuracy": 0.6015540063381195, "num_tokens": 3851563.0, "step": 4784 }, { "epoch": 1.267478813559322, "grad_norm": 1.2753640413284302, "learning_rate": 9.366393008474576e-06, "loss": 1.3886, "mean_token_accuracy": 0.684392374008894, "num_tokens": 3853136.0, "step": 4786 }, { "epoch": 1.2680084745762712, "grad_norm": 1.658557415008545, "learning_rate": 9.366128177966103e-06, "loss": 1.257, "mean_token_accuracy": 0.7033767327666283, "num_tokens": 3854873.0, "step": 4788 }, { "epoch": 1.2685381355932204, "grad_norm": 1.6014435291290283, "learning_rate": 9.365863347457628e-06, "loss": 1.4407, "mean_token_accuracy": 0.6968496851623058, "num_tokens": 3856441.0, "step": 4790 }, { "epoch": 1.2690677966101696, "grad_norm": 2.0125749111175537, "learning_rate": 9.365598516949154e-06, "loss": 1.4271, "mean_token_accuracy": 0.6825606673955917, "num_tokens": 3858024.0, "step": 4792 }, { "epoch": 1.2695974576271185, "grad_norm": 1.7273366451263428, "learning_rate": 9.365333686440678e-06, "loss": 1.7048, "mean_token_accuracy": 0.6406371518969536, "num_tokens": 3859794.0, "step": 4794 }, { "epoch": 1.270127118644068, "grad_norm": 1.5231419801712036, "learning_rate": 9.365068855932204e-06, "loss": 1.2057, "mean_token_accuracy": 0.6965092197060585, "num_tokens": 3861625.0, "step": 4796 }, { "epoch": 1.2706567796610169, "grad_norm": 1.7673622369766235, "learning_rate": 9.36480402542373e-06, "loss": 1.7317, "mean_token_accuracy": 0.6173234209418297, "num_tokens": 3863217.0, "step": 4798 }, { "epoch": 1.271186440677966, "grad_norm": 1.3997734785079956, "learning_rate": 9.364539194915256e-06, "loss": 0.9756, "mean_token_accuracy": 0.7852410823106766, "num_tokens": 3864780.0, "step": 4800 }, { "epoch": 1.2717161016949152, "grad_norm": 1.4519118070602417, "learning_rate": 9.36427436440678e-06, "loss": 0.9115, "mean_token_accuracy": 0.7716889604926109, "num_tokens": 3866446.0, "step": 4802 }, { "epoch": 1.2722457627118644, "grad_norm": 1.605819821357727, "learning_rate": 9.364009533898306e-06, "loss": 1.2897, "mean_token_accuracy": 0.7174529731273651, "num_tokens": 3868223.0, "step": 4804 }, { "epoch": 1.2727754237288136, "grad_norm": 2.007050037384033, "learning_rate": 9.36374470338983e-06, "loss": 1.2964, "mean_token_accuracy": 0.7095108926296234, "num_tokens": 3869538.0, "step": 4806 }, { "epoch": 1.2733050847457628, "grad_norm": 1.8490291833877563, "learning_rate": 9.363479872881357e-06, "loss": 1.1444, "mean_token_accuracy": 0.7212975099682808, "num_tokens": 3871004.0, "step": 4808 }, { "epoch": 1.273834745762712, "grad_norm": 1.7717138528823853, "learning_rate": 9.363215042372882e-06, "loss": 1.1396, "mean_token_accuracy": 0.7422746866941452, "num_tokens": 3872363.0, "step": 4810 }, { "epoch": 1.274364406779661, "grad_norm": 3.2044098377227783, "learning_rate": 9.362950211864407e-06, "loss": 1.8907, "mean_token_accuracy": 0.6465893611311913, "num_tokens": 3874538.0, "step": 4812 }, { "epoch": 1.2748940677966103, "grad_norm": 1.7487157583236694, "learning_rate": 9.362685381355932e-06, "loss": 1.294, "mean_token_accuracy": 0.7039253078401089, "num_tokens": 3876072.0, "step": 4814 }, { "epoch": 1.2754237288135593, "grad_norm": 1.7233328819274902, "learning_rate": 9.362420550847459e-06, "loss": 1.3059, "mean_token_accuracy": 0.7455853596329689, "num_tokens": 3877506.0, "step": 4816 }, { "epoch": 1.2759533898305084, "grad_norm": 1.5435417890548706, "learning_rate": 9.362155720338984e-06, "loss": 1.1757, "mean_token_accuracy": 0.7013080045580864, "num_tokens": 3879171.0, "step": 4818 }, { "epoch": 1.2764830508474576, "grad_norm": 1.618689775466919, "learning_rate": 9.361890889830509e-06, "loss": 1.2596, "mean_token_accuracy": 0.7399357929825783, "num_tokens": 3880928.0, "step": 4820 }, { "epoch": 1.2770127118644068, "grad_norm": 1.431624412536621, "learning_rate": 9.361626059322034e-06, "loss": 1.0841, "mean_token_accuracy": 0.7598767802119255, "num_tokens": 3882647.0, "step": 4822 }, { "epoch": 1.277542372881356, "grad_norm": 1.606050968170166, "learning_rate": 9.36136122881356e-06, "loss": 1.4825, "mean_token_accuracy": 0.67313052713871, "num_tokens": 3884397.0, "step": 4824 }, { "epoch": 1.2780720338983051, "grad_norm": 1.519618034362793, "learning_rate": 9.361096398305085e-06, "loss": 1.2569, "mean_token_accuracy": 0.7441317141056061, "num_tokens": 3886024.0, "step": 4826 }, { "epoch": 1.2786016949152543, "grad_norm": 1.6680994033813477, "learning_rate": 9.360831567796612e-06, "loss": 1.5616, "mean_token_accuracy": 0.6657325997948647, "num_tokens": 3888017.0, "step": 4828 }, { "epoch": 1.2791313559322033, "grad_norm": 1.764125108718872, "learning_rate": 9.360566737288137e-06, "loss": 1.6374, "mean_token_accuracy": 0.657021626830101, "num_tokens": 3889761.0, "step": 4830 }, { "epoch": 1.2796610169491525, "grad_norm": 1.4989091157913208, "learning_rate": 9.360301906779662e-06, "loss": 1.1556, "mean_token_accuracy": 0.7260351553559303, "num_tokens": 3891307.0, "step": 4832 }, { "epoch": 1.2801906779661016, "grad_norm": 1.5439530611038208, "learning_rate": 9.360037076271186e-06, "loss": 1.6527, "mean_token_accuracy": 0.6294016465544701, "num_tokens": 3893259.0, "step": 4834 }, { "epoch": 1.2807203389830508, "grad_norm": 2.312656879425049, "learning_rate": 9.359772245762713e-06, "loss": 1.6342, "mean_token_accuracy": 0.6339141875505447, "num_tokens": 3894718.0, "step": 4836 }, { "epoch": 1.28125, "grad_norm": 1.9438505172729492, "learning_rate": 9.359507415254238e-06, "loss": 1.8154, "mean_token_accuracy": 0.6232443675398827, "num_tokens": 3896155.0, "step": 4838 }, { "epoch": 1.2817796610169492, "grad_norm": 1.5922435522079468, "learning_rate": 9.359242584745763e-06, "loss": 1.1862, "mean_token_accuracy": 0.7386905252933502, "num_tokens": 3897773.0, "step": 4840 }, { "epoch": 1.2823093220338984, "grad_norm": 1.439885139465332, "learning_rate": 9.358977754237288e-06, "loss": 0.9886, "mean_token_accuracy": 0.764647364616394, "num_tokens": 3899394.0, "step": 4842 }, { "epoch": 1.2828389830508475, "grad_norm": 1.7176724672317505, "learning_rate": 9.358712923728815e-06, "loss": 1.6016, "mean_token_accuracy": 0.6730304434895515, "num_tokens": 3900970.0, "step": 4844 }, { "epoch": 1.2833686440677967, "grad_norm": 1.4071460962295532, "learning_rate": 9.35844809322034e-06, "loss": 1.4615, "mean_token_accuracy": 0.7096792608499527, "num_tokens": 3902633.0, "step": 4846 }, { "epoch": 1.2838983050847457, "grad_norm": 1.5159372091293335, "learning_rate": 9.358183262711864e-06, "loss": 1.372, "mean_token_accuracy": 0.6895506680011749, "num_tokens": 3904244.0, "step": 4848 }, { "epoch": 1.2844279661016949, "grad_norm": 1.4079616069793701, "learning_rate": 9.357918432203391e-06, "loss": 1.3696, "mean_token_accuracy": 0.6867541819810867, "num_tokens": 3905984.0, "step": 4850 }, { "epoch": 1.284957627118644, "grad_norm": 2.042739152908325, "learning_rate": 9.357653601694916e-06, "loss": 1.427, "mean_token_accuracy": 0.6920848339796066, "num_tokens": 3907392.0, "step": 4852 }, { "epoch": 1.2854872881355932, "grad_norm": 1.1829946041107178, "learning_rate": 9.357388771186443e-06, "loss": 1.0962, "mean_token_accuracy": 0.7529573142528534, "num_tokens": 3909188.0, "step": 4854 }, { "epoch": 1.2860169491525424, "grad_norm": 1.6848219633102417, "learning_rate": 9.357123940677967e-06, "loss": 1.5524, "mean_token_accuracy": 0.6623825505375862, "num_tokens": 3910868.0, "step": 4856 }, { "epoch": 1.2865466101694916, "grad_norm": 1.5972673892974854, "learning_rate": 9.356859110169492e-06, "loss": 1.0481, "mean_token_accuracy": 0.7422085627913475, "num_tokens": 3912394.0, "step": 4858 }, { "epoch": 1.2870762711864407, "grad_norm": 1.554885745048523, "learning_rate": 9.356594279661017e-06, "loss": 0.9965, "mean_token_accuracy": 0.7541199550032616, "num_tokens": 3913884.0, "step": 4860 }, { "epoch": 1.2876059322033897, "grad_norm": 1.4794011116027832, "learning_rate": 9.356329449152544e-06, "loss": 1.2621, "mean_token_accuracy": 0.7186218798160553, "num_tokens": 3915536.0, "step": 4862 }, { "epoch": 1.288135593220339, "grad_norm": 1.508600115776062, "learning_rate": 9.356064618644069e-06, "loss": 1.6516, "mean_token_accuracy": 0.6483799740672112, "num_tokens": 3917287.0, "step": 4864 }, { "epoch": 1.288665254237288, "grad_norm": 1.5386908054351807, "learning_rate": 9.355799788135594e-06, "loss": 1.0586, "mean_token_accuracy": 0.7418884187936783, "num_tokens": 3918762.0, "step": 4866 }, { "epoch": 1.2891949152542372, "grad_norm": 1.4103201627731323, "learning_rate": 9.355534957627119e-06, "loss": 1.312, "mean_token_accuracy": 0.7351659089326859, "num_tokens": 3920540.0, "step": 4868 }, { "epoch": 1.2897245762711864, "grad_norm": 1.5212489366531372, "learning_rate": 9.355270127118645e-06, "loss": 1.4345, "mean_token_accuracy": 0.692725908011198, "num_tokens": 3922046.0, "step": 4870 }, { "epoch": 1.2902542372881356, "grad_norm": 1.8945215940475464, "learning_rate": 9.35500529661017e-06, "loss": 1.4847, "mean_token_accuracy": 0.6859583333134651, "num_tokens": 3923731.0, "step": 4872 }, { "epoch": 1.2907838983050848, "grad_norm": 1.7565388679504395, "learning_rate": 9.354740466101695e-06, "loss": 1.4753, "mean_token_accuracy": 0.6815657429397106, "num_tokens": 3925360.0, "step": 4874 }, { "epoch": 1.291313559322034, "grad_norm": 1.7267353534698486, "learning_rate": 9.35447563559322e-06, "loss": 1.326, "mean_token_accuracy": 0.6828025206923485, "num_tokens": 3927085.0, "step": 4876 }, { "epoch": 1.2918432203389831, "grad_norm": 1.9377881288528442, "learning_rate": 9.354210805084747e-06, "loss": 1.7236, "mean_token_accuracy": 0.6422564908862114, "num_tokens": 3928800.0, "step": 4878 }, { "epoch": 1.292372881355932, "grad_norm": 1.6733050346374512, "learning_rate": 9.353945974576272e-06, "loss": 1.1382, "mean_token_accuracy": 0.7277101576328278, "num_tokens": 3930522.0, "step": 4880 }, { "epoch": 1.2929025423728815, "grad_norm": 1.530449390411377, "learning_rate": 9.353681144067798e-06, "loss": 1.3223, "mean_token_accuracy": 0.6949919834733009, "num_tokens": 3932043.0, "step": 4882 }, { "epoch": 1.2934322033898304, "grad_norm": 1.505834698677063, "learning_rate": 9.353416313559323e-06, "loss": 1.4861, "mean_token_accuracy": 0.6667457409203053, "num_tokens": 3933623.0, "step": 4884 }, { "epoch": 1.2939618644067796, "grad_norm": 1.6275978088378906, "learning_rate": 9.353151483050848e-06, "loss": 1.4515, "mean_token_accuracy": 0.6886992231011391, "num_tokens": 3935339.0, "step": 4886 }, { "epoch": 1.2944915254237288, "grad_norm": 1.484210729598999, "learning_rate": 9.352886652542373e-06, "loss": 1.3054, "mean_token_accuracy": 0.705205075442791, "num_tokens": 3937522.0, "step": 4888 }, { "epoch": 1.295021186440678, "grad_norm": 1.6479226350784302, "learning_rate": 9.3526218220339e-06, "loss": 1.1294, "mean_token_accuracy": 0.7260158360004425, "num_tokens": 3938870.0, "step": 4890 }, { "epoch": 1.2955508474576272, "grad_norm": 1.6053969860076904, "learning_rate": 9.352356991525425e-06, "loss": 1.2869, "mean_token_accuracy": 0.6946425437927246, "num_tokens": 3940554.0, "step": 4892 }, { "epoch": 1.2960805084745763, "grad_norm": 1.491369605064392, "learning_rate": 9.35209216101695e-06, "loss": 0.9711, "mean_token_accuracy": 0.7686251178383827, "num_tokens": 3942398.0, "step": 4894 }, { "epoch": 1.2966101694915255, "grad_norm": 1.8916358947753906, "learning_rate": 9.351827330508475e-06, "loss": 1.9729, "mean_token_accuracy": 0.5714621320366859, "num_tokens": 3944164.0, "step": 4896 }, { "epoch": 1.2971398305084745, "grad_norm": 1.398798942565918, "learning_rate": 9.351562500000001e-06, "loss": 1.3927, "mean_token_accuracy": 0.6701049841940403, "num_tokens": 3945740.0, "step": 4898 }, { "epoch": 1.2976694915254237, "grad_norm": 1.713436245918274, "learning_rate": 9.351297669491526e-06, "loss": 1.0955, "mean_token_accuracy": 0.7359983995556831, "num_tokens": 3947123.0, "step": 4900 }, { "epoch": 1.2981991525423728, "grad_norm": 1.9640990495681763, "learning_rate": 9.351032838983051e-06, "loss": 1.9898, "mean_token_accuracy": 0.5819899551570415, "num_tokens": 3948811.0, "step": 4902 }, { "epoch": 1.298728813559322, "grad_norm": 1.4447678327560425, "learning_rate": 9.350768008474576e-06, "loss": 1.4236, "mean_token_accuracy": 0.6879237666726112, "num_tokens": 3950489.0, "step": 4904 }, { "epoch": 1.2992584745762712, "grad_norm": 1.466123104095459, "learning_rate": 9.350503177966103e-06, "loss": 1.3489, "mean_token_accuracy": 0.6866961792111397, "num_tokens": 3952185.0, "step": 4906 }, { "epoch": 1.2997881355932204, "grad_norm": 1.4483399391174316, "learning_rate": 9.350238347457627e-06, "loss": 1.204, "mean_token_accuracy": 0.7195623740553856, "num_tokens": 3953789.0, "step": 4908 }, { "epoch": 1.3003177966101696, "grad_norm": 1.8234295845031738, "learning_rate": 9.349973516949154e-06, "loss": 1.7347, "mean_token_accuracy": 0.6481363736093044, "num_tokens": 3955310.0, "step": 4910 }, { "epoch": 1.3008474576271185, "grad_norm": 1.3217450380325317, "learning_rate": 9.349708686440679e-06, "loss": 1.4054, "mean_token_accuracy": 0.6825996860861778, "num_tokens": 3957288.0, "step": 4912 }, { "epoch": 1.301377118644068, "grad_norm": 1.3477354049682617, "learning_rate": 9.349443855932204e-06, "loss": 1.0443, "mean_token_accuracy": 0.7468729317188263, "num_tokens": 3958755.0, "step": 4914 }, { "epoch": 1.3019067796610169, "grad_norm": 2.071528673171997, "learning_rate": 9.349179025423729e-06, "loss": 1.7465, "mean_token_accuracy": 0.6571647003293037, "num_tokens": 3960343.0, "step": 4916 }, { "epoch": 1.302436440677966, "grad_norm": 1.508898138999939, "learning_rate": 9.348914194915256e-06, "loss": 0.7224, "mean_token_accuracy": 0.8058724701404572, "num_tokens": 3961937.0, "step": 4918 }, { "epoch": 1.3029661016949152, "grad_norm": 1.5286808013916016, "learning_rate": 9.34864936440678e-06, "loss": 1.1781, "mean_token_accuracy": 0.7494615241885185, "num_tokens": 3963426.0, "step": 4920 }, { "epoch": 1.3034957627118644, "grad_norm": 1.8450310230255127, "learning_rate": 9.348384533898305e-06, "loss": 1.3549, "mean_token_accuracy": 0.6840187460184097, "num_tokens": 3964941.0, "step": 4922 }, { "epoch": 1.3040254237288136, "grad_norm": 1.682989239692688, "learning_rate": 9.34811970338983e-06, "loss": 1.4017, "mean_token_accuracy": 0.6907692849636078, "num_tokens": 3966292.0, "step": 4924 }, { "epoch": 1.3045550847457628, "grad_norm": 1.2737653255462646, "learning_rate": 9.347854872881357e-06, "loss": 1.3514, "mean_token_accuracy": 0.6924819238483906, "num_tokens": 3968177.0, "step": 4926 }, { "epoch": 1.305084745762712, "grad_norm": 1.7923774719238281, "learning_rate": 9.347590042372882e-06, "loss": 1.3791, "mean_token_accuracy": 0.6870225630700588, "num_tokens": 3969765.0, "step": 4928 }, { "epoch": 1.305614406779661, "grad_norm": 2.031449317932129, "learning_rate": 9.347325211864407e-06, "loss": 1.8106, "mean_token_accuracy": 0.6063663586974144, "num_tokens": 3971453.0, "step": 4930 }, { "epoch": 1.3061440677966103, "grad_norm": 1.3703850507736206, "learning_rate": 9.347060381355933e-06, "loss": 1.2193, "mean_token_accuracy": 0.705463208258152, "num_tokens": 3973105.0, "step": 4932 }, { "epoch": 1.3066737288135593, "grad_norm": 1.797231912612915, "learning_rate": 9.346795550847458e-06, "loss": 1.6542, "mean_token_accuracy": 0.6302389353513718, "num_tokens": 3974667.0, "step": 4934 }, { "epoch": 1.3072033898305084, "grad_norm": 1.871262788772583, "learning_rate": 9.346530720338985e-06, "loss": 1.2688, "mean_token_accuracy": 0.6891497671604156, "num_tokens": 3976069.0, "step": 4936 }, { "epoch": 1.3077330508474576, "grad_norm": 1.5517178773880005, "learning_rate": 9.34626588983051e-06, "loss": 1.4125, "mean_token_accuracy": 0.7024737074971199, "num_tokens": 3977553.0, "step": 4938 }, { "epoch": 1.3082627118644068, "grad_norm": 2.112215995788574, "learning_rate": 9.346001059322035e-06, "loss": 1.6232, "mean_token_accuracy": 0.6592291072010994, "num_tokens": 3979403.0, "step": 4940 }, { "epoch": 1.308792372881356, "grad_norm": 1.5113863945007324, "learning_rate": 9.34573622881356e-06, "loss": 1.4667, "mean_token_accuracy": 0.6761179566383362, "num_tokens": 3981059.0, "step": 4942 }, { "epoch": 1.3093220338983051, "grad_norm": 1.6518152952194214, "learning_rate": 9.345471398305086e-06, "loss": 0.9231, "mean_token_accuracy": 0.7975156381726265, "num_tokens": 3982380.0, "step": 4944 }, { "epoch": 1.3098516949152543, "grad_norm": 1.3887830972671509, "learning_rate": 9.345206567796611e-06, "loss": 0.9758, "mean_token_accuracy": 0.7569864243268967, "num_tokens": 3984122.0, "step": 4946 }, { "epoch": 1.3103813559322033, "grad_norm": 1.6227554082870483, "learning_rate": 9.344941737288136e-06, "loss": 1.4397, "mean_token_accuracy": 0.7015805169939995, "num_tokens": 3985803.0, "step": 4948 }, { "epoch": 1.3109110169491525, "grad_norm": 1.7043148279190063, "learning_rate": 9.344676906779661e-06, "loss": 1.2905, "mean_token_accuracy": 0.6935770958662033, "num_tokens": 3987266.0, "step": 4950 }, { "epoch": 1.3114406779661016, "grad_norm": 1.6847599744796753, "learning_rate": 9.344412076271188e-06, "loss": 1.3867, "mean_token_accuracy": 0.6856401413679123, "num_tokens": 3989027.0, "step": 4952 }, { "epoch": 1.3119703389830508, "grad_norm": 1.381581425666809, "learning_rate": 9.344147245762713e-06, "loss": 1.7422, "mean_token_accuracy": 0.6465650498867035, "num_tokens": 3991288.0, "step": 4954 }, { "epoch": 1.3125, "grad_norm": 1.5617473125457764, "learning_rate": 9.343882415254238e-06, "loss": 1.3311, "mean_token_accuracy": 0.7107653021812439, "num_tokens": 3992747.0, "step": 4956 }, { "epoch": 1.3130296610169492, "grad_norm": 1.5544424057006836, "learning_rate": 9.343617584745763e-06, "loss": 1.1442, "mean_token_accuracy": 0.7393099069595337, "num_tokens": 3994383.0, "step": 4958 }, { "epoch": 1.3135593220338984, "grad_norm": 1.8322371244430542, "learning_rate": 9.34335275423729e-06, "loss": 1.3321, "mean_token_accuracy": 0.7014202624559402, "num_tokens": 3995968.0, "step": 4960 }, { "epoch": 1.3140889830508475, "grad_norm": 1.716225266456604, "learning_rate": 9.343087923728814e-06, "loss": 1.4426, "mean_token_accuracy": 0.6569504737854004, "num_tokens": 3997502.0, "step": 4962 }, { "epoch": 1.3146186440677967, "grad_norm": 1.750715970993042, "learning_rate": 9.34282309322034e-06, "loss": 1.7308, "mean_token_accuracy": 0.6201894246041775, "num_tokens": 3999129.0, "step": 4964 }, { "epoch": 1.3151483050847457, "grad_norm": 1.5703297853469849, "learning_rate": 9.342558262711866e-06, "loss": 1.2155, "mean_token_accuracy": 0.6948683187365532, "num_tokens": 4000840.0, "step": 4966 }, { "epoch": 1.3156779661016949, "grad_norm": 2.046308755874634, "learning_rate": 9.34229343220339e-06, "loss": 1.4383, "mean_token_accuracy": 0.6913011744618416, "num_tokens": 4002065.0, "step": 4968 }, { "epoch": 1.316207627118644, "grad_norm": 1.840055227279663, "learning_rate": 9.342028601694916e-06, "loss": 1.3828, "mean_token_accuracy": 0.67891725897789, "num_tokens": 4003468.0, "step": 4970 }, { "epoch": 1.3167372881355932, "grad_norm": 1.6308645009994507, "learning_rate": 9.341763771186442e-06, "loss": 1.4375, "mean_token_accuracy": 0.7029293701052666, "num_tokens": 4005071.0, "step": 4972 }, { "epoch": 1.3172669491525424, "grad_norm": 1.695264458656311, "learning_rate": 9.341498940677967e-06, "loss": 1.6697, "mean_token_accuracy": 0.6439632996916771, "num_tokens": 4006757.0, "step": 4974 }, { "epoch": 1.3177966101694916, "grad_norm": 1.4210035800933838, "learning_rate": 9.341234110169492e-06, "loss": 0.7449, "mean_token_accuracy": 0.7991424351930618, "num_tokens": 4008435.0, "step": 4976 }, { "epoch": 1.3183262711864407, "grad_norm": 1.4254034757614136, "learning_rate": 9.340969279661017e-06, "loss": 1.4471, "mean_token_accuracy": 0.6756269261240959, "num_tokens": 4010270.0, "step": 4978 }, { "epoch": 1.3188559322033897, "grad_norm": 1.6147925853729248, "learning_rate": 9.340704449152544e-06, "loss": 1.5419, "mean_token_accuracy": 0.6601153835654259, "num_tokens": 4011772.0, "step": 4980 }, { "epoch": 1.319385593220339, "grad_norm": 1.884141445159912, "learning_rate": 9.340439618644069e-06, "loss": 1.4384, "mean_token_accuracy": 0.6836013421416283, "num_tokens": 4013418.0, "step": 4982 }, { "epoch": 1.319915254237288, "grad_norm": 1.6768686771392822, "learning_rate": 9.340174788135593e-06, "loss": 1.3447, "mean_token_accuracy": 0.7012072578072548, "num_tokens": 4015001.0, "step": 4984 }, { "epoch": 1.3204449152542372, "grad_norm": 1.5169280767440796, "learning_rate": 9.339909957627118e-06, "loss": 1.1425, "mean_token_accuracy": 0.7148029208183289, "num_tokens": 4016637.0, "step": 4986 }, { "epoch": 1.3209745762711864, "grad_norm": 1.2680742740631104, "learning_rate": 9.339645127118645e-06, "loss": 0.8559, "mean_token_accuracy": 0.7712697759270668, "num_tokens": 4018338.0, "step": 4988 }, { "epoch": 1.3215042372881356, "grad_norm": 1.1127952337265015, "learning_rate": 9.33938029661017e-06, "loss": 0.9587, "mean_token_accuracy": 0.7701340094208717, "num_tokens": 4020136.0, "step": 4990 }, { "epoch": 1.3220338983050848, "grad_norm": 1.6924490928649902, "learning_rate": 9.339115466101697e-06, "loss": 1.3764, "mean_token_accuracy": 0.6773640289902687, "num_tokens": 4021912.0, "step": 4992 }, { "epoch": 1.322563559322034, "grad_norm": 1.2199630737304688, "learning_rate": 9.338850635593221e-06, "loss": 1.0075, "mean_token_accuracy": 0.745903842151165, "num_tokens": 4023658.0, "step": 4994 }, { "epoch": 1.3230932203389831, "grad_norm": 1.7075883150100708, "learning_rate": 9.338585805084746e-06, "loss": 1.5512, "mean_token_accuracy": 0.6956811808049679, "num_tokens": 4025228.0, "step": 4996 }, { "epoch": 1.323622881355932, "grad_norm": 1.631020188331604, "learning_rate": 9.338320974576271e-06, "loss": 1.0051, "mean_token_accuracy": 0.7593778148293495, "num_tokens": 4026747.0, "step": 4998 }, { "epoch": 1.3241525423728815, "grad_norm": 1.6474124193191528, "learning_rate": 9.338056144067798e-06, "loss": 1.5931, "step": 5000 }, { "epoch": 1.3241525423728815, "eval_loss": 1.3208796977996826, "eval_mean_token_accuracy": 0.6992755838028797, "eval_num_tokens": 4028276.0, "eval_runtime": 48.2454, "eval_samples_per_second": 6.384, "eval_steps_per_second": 6.384, "step": 5000 }, { "epoch": 1.3246822033898304, "grad_norm": 1.5945308208465576, "learning_rate": 9.337791313559323e-06, "loss": 1.33, "mean_token_accuracy": 0.6841986998915672, "num_tokens": 4030037.0, "step": 5002 }, { "epoch": 1.3252118644067796, "grad_norm": 1.9732948541641235, "learning_rate": 9.337526483050848e-06, "loss": 1.4179, "mean_token_accuracy": 0.6911134161055088, "num_tokens": 4031271.0, "step": 5004 }, { "epoch": 1.3257415254237288, "grad_norm": 2.174666404724121, "learning_rate": 9.337261652542373e-06, "loss": 1.4633, "mean_token_accuracy": 0.6660975776612759, "num_tokens": 4032831.0, "step": 5006 }, { "epoch": 1.326271186440678, "grad_norm": 1.526078224182129, "learning_rate": 9.3369968220339e-06, "loss": 0.8587, "mean_token_accuracy": 0.7888995409011841, "num_tokens": 4034439.0, "step": 5008 }, { "epoch": 1.3268008474576272, "grad_norm": 1.9266464710235596, "learning_rate": 9.336731991525424e-06, "loss": 1.4903, "mean_token_accuracy": 0.6785429790616035, "num_tokens": 4035855.0, "step": 5010 }, { "epoch": 1.3273305084745763, "grad_norm": 1.9503417015075684, "learning_rate": 9.33646716101695e-06, "loss": 1.2864, "mean_token_accuracy": 0.7229606471955776, "num_tokens": 4037443.0, "step": 5012 }, { "epoch": 1.3278601694915255, "grad_norm": 1.4257668256759644, "learning_rate": 9.336202330508474e-06, "loss": 1.2245, "mean_token_accuracy": 0.7288625687360764, "num_tokens": 4039276.0, "step": 5014 }, { "epoch": 1.3283898305084745, "grad_norm": 2.2425589561462402, "learning_rate": 9.3359375e-06, "loss": 1.9015, "mean_token_accuracy": 0.6076480597257614, "num_tokens": 4040569.0, "step": 5016 }, { "epoch": 1.3289194915254237, "grad_norm": 1.5814098119735718, "learning_rate": 9.335672669491527e-06, "loss": 1.1292, "mean_token_accuracy": 0.7191860154271126, "num_tokens": 4042325.0, "step": 5018 }, { "epoch": 1.3294491525423728, "grad_norm": 1.5701225996017456, "learning_rate": 9.335407838983052e-06, "loss": 1.2419, "mean_token_accuracy": 0.7277038395404816, "num_tokens": 4043704.0, "step": 5020 }, { "epoch": 1.329978813559322, "grad_norm": 1.2860888242721558, "learning_rate": 9.335143008474577e-06, "loss": 1.2048, "mean_token_accuracy": 0.7364957332611084, "num_tokens": 4045376.0, "step": 5022 }, { "epoch": 1.3305084745762712, "grad_norm": 1.9103761911392212, "learning_rate": 9.334878177966102e-06, "loss": 1.817, "mean_token_accuracy": 0.6170253157615662, "num_tokens": 4046887.0, "step": 5024 }, { "epoch": 1.3310381355932204, "grad_norm": 1.7395719289779663, "learning_rate": 9.334613347457629e-06, "loss": 1.0075, "mean_token_accuracy": 0.7325162217020988, "num_tokens": 4048555.0, "step": 5026 }, { "epoch": 1.3315677966101696, "grad_norm": 1.1686776876449585, "learning_rate": 9.334348516949154e-06, "loss": 1.207, "mean_token_accuracy": 0.7348577305674553, "num_tokens": 4050351.0, "step": 5028 }, { "epoch": 1.3320974576271185, "grad_norm": 1.8663302659988403, "learning_rate": 9.334083686440679e-06, "loss": 1.3594, "mean_token_accuracy": 0.7051669806241989, "num_tokens": 4051876.0, "step": 5030 }, { "epoch": 1.332627118644068, "grad_norm": 1.3909316062927246, "learning_rate": 9.333818855932204e-06, "loss": 1.3976, "mean_token_accuracy": 0.6679456755518913, "num_tokens": 4053573.0, "step": 5032 }, { "epoch": 1.3331567796610169, "grad_norm": 1.4195207357406616, "learning_rate": 9.33355402542373e-06, "loss": 0.8708, "mean_token_accuracy": 0.7997475489974022, "num_tokens": 4055220.0, "step": 5034 }, { "epoch": 1.333686440677966, "grad_norm": 1.541420578956604, "learning_rate": 9.333289194915255e-06, "loss": 0.9927, "mean_token_accuracy": 0.7622040435671806, "num_tokens": 4056813.0, "step": 5036 }, { "epoch": 1.3342161016949152, "grad_norm": 1.9365445375442505, "learning_rate": 9.33302436440678e-06, "loss": 1.5061, "mean_token_accuracy": 0.6587690711021423, "num_tokens": 4058329.0, "step": 5038 }, { "epoch": 1.3347457627118644, "grad_norm": 1.393784523010254, "learning_rate": 9.332759533898305e-06, "loss": 0.9873, "mean_token_accuracy": 0.7622727751731873, "num_tokens": 4060329.0, "step": 5040 }, { "epoch": 1.3352754237288136, "grad_norm": 1.5948861837387085, "learning_rate": 9.332494703389832e-06, "loss": 1.0497, "mean_token_accuracy": 0.7335785254836082, "num_tokens": 4061948.0, "step": 5042 }, { "epoch": 1.3358050847457628, "grad_norm": 1.6441019773483276, "learning_rate": 9.332229872881357e-06, "loss": 1.6904, "mean_token_accuracy": 0.6185935363173485, "num_tokens": 4063630.0, "step": 5044 }, { "epoch": 1.336334745762712, "grad_norm": 1.8510189056396484, "learning_rate": 9.331965042372883e-06, "loss": 1.33, "mean_token_accuracy": 0.7115969136357307, "num_tokens": 4065138.0, "step": 5046 }, { "epoch": 1.336864406779661, "grad_norm": 1.9421793222427368, "learning_rate": 9.331700211864408e-06, "loss": 1.6709, "mean_token_accuracy": 0.6424738354980946, "num_tokens": 4066611.0, "step": 5048 }, { "epoch": 1.3373940677966103, "grad_norm": 2.1299214363098145, "learning_rate": 9.331435381355933e-06, "loss": 1.58, "mean_token_accuracy": 0.6437061056494713, "num_tokens": 4068001.0, "step": 5050 }, { "epoch": 1.3379237288135593, "grad_norm": 1.7254749536514282, "learning_rate": 9.331170550847458e-06, "loss": 1.3365, "mean_token_accuracy": 0.7150088176131248, "num_tokens": 4069617.0, "step": 5052 }, { "epoch": 1.3384533898305084, "grad_norm": 2.0087857246398926, "learning_rate": 9.330905720338985e-06, "loss": 1.9162, "mean_token_accuracy": 0.6347843259572983, "num_tokens": 4071886.0, "step": 5054 }, { "epoch": 1.3389830508474576, "grad_norm": 1.574633240699768, "learning_rate": 9.33064088983051e-06, "loss": 1.0344, "mean_token_accuracy": 0.7506872974336147, "num_tokens": 4073956.0, "step": 5056 }, { "epoch": 1.3395127118644068, "grad_norm": 1.534739375114441, "learning_rate": 9.330376059322034e-06, "loss": 1.095, "mean_token_accuracy": 0.7553992941975594, "num_tokens": 4075559.0, "step": 5058 }, { "epoch": 1.340042372881356, "grad_norm": 1.4645191431045532, "learning_rate": 9.33011122881356e-06, "loss": 1.2479, "mean_token_accuracy": 0.7324124276638031, "num_tokens": 4077196.0, "step": 5060 }, { "epoch": 1.3405720338983051, "grad_norm": 1.5411553382873535, "learning_rate": 9.329846398305086e-06, "loss": 1.0407, "mean_token_accuracy": 0.7432991787791252, "num_tokens": 4078728.0, "step": 5062 }, { "epoch": 1.3411016949152543, "grad_norm": 1.9000808000564575, "learning_rate": 9.329581567796611e-06, "loss": 1.056, "mean_token_accuracy": 0.7493613138794899, "num_tokens": 4080121.0, "step": 5064 }, { "epoch": 1.3416313559322033, "grad_norm": 1.723731279373169, "learning_rate": 9.329316737288136e-06, "loss": 1.5887, "mean_token_accuracy": 0.6618179231882095, "num_tokens": 4081625.0, "step": 5066 }, { "epoch": 1.3421610169491525, "grad_norm": 1.426563024520874, "learning_rate": 9.32905190677966e-06, "loss": 1.2137, "mean_token_accuracy": 0.7198987603187561, "num_tokens": 4083408.0, "step": 5068 }, { "epoch": 1.3426906779661016, "grad_norm": 1.5449721813201904, "learning_rate": 9.328787076271187e-06, "loss": 1.2451, "mean_token_accuracy": 0.7165502458810806, "num_tokens": 4084774.0, "step": 5070 }, { "epoch": 1.3432203389830508, "grad_norm": 1.605840802192688, "learning_rate": 9.328522245762712e-06, "loss": 1.5562, "mean_token_accuracy": 0.6669687107205391, "num_tokens": 4086374.0, "step": 5072 }, { "epoch": 1.34375, "grad_norm": 1.4008636474609375, "learning_rate": 9.328257415254239e-06, "loss": 1.5178, "mean_token_accuracy": 0.6643288396298885, "num_tokens": 4088048.0, "step": 5074 }, { "epoch": 1.3442796610169492, "grad_norm": 2.051865339279175, "learning_rate": 9.327992584745764e-06, "loss": 1.0432, "mean_token_accuracy": 0.7544870674610138, "num_tokens": 4089578.0, "step": 5076 }, { "epoch": 1.3448093220338984, "grad_norm": 2.337256669998169, "learning_rate": 9.327727754237289e-06, "loss": 1.6249, "mean_token_accuracy": 0.6464992240071297, "num_tokens": 4091055.0, "step": 5078 }, { "epoch": 1.3453389830508475, "grad_norm": 1.2516093254089355, "learning_rate": 9.327462923728814e-06, "loss": 1.0962, "mean_token_accuracy": 0.7362595275044441, "num_tokens": 4092532.0, "step": 5080 }, { "epoch": 1.3458686440677967, "grad_norm": 2.0436389446258545, "learning_rate": 9.32719809322034e-06, "loss": 1.8307, "mean_token_accuracy": 0.6145432665944099, "num_tokens": 4094259.0, "step": 5082 }, { "epoch": 1.3463983050847457, "grad_norm": 1.6608860492706299, "learning_rate": 9.326933262711865e-06, "loss": 1.6096, "mean_token_accuracy": 0.6474047154188156, "num_tokens": 4096024.0, "step": 5084 }, { "epoch": 1.3469279661016949, "grad_norm": 1.6825348138809204, "learning_rate": 9.32666843220339e-06, "loss": 0.9798, "mean_token_accuracy": 0.7591797262430191, "num_tokens": 4097322.0, "step": 5086 }, { "epoch": 1.347457627118644, "grad_norm": 1.6257176399230957, "learning_rate": 9.326403601694915e-06, "loss": 1.1391, "mean_token_accuracy": 0.7444974929094315, "num_tokens": 4098964.0, "step": 5088 }, { "epoch": 1.3479872881355932, "grad_norm": 1.7136995792388916, "learning_rate": 9.326138771186442e-06, "loss": 1.4955, "mean_token_accuracy": 0.6689086481928825, "num_tokens": 4100704.0, "step": 5090 }, { "epoch": 1.3485169491525424, "grad_norm": 1.6519187688827515, "learning_rate": 9.325873940677967e-06, "loss": 1.3212, "mean_token_accuracy": 0.6803758293390274, "num_tokens": 4102469.0, "step": 5092 }, { "epoch": 1.3490466101694916, "grad_norm": 1.9817429780960083, "learning_rate": 9.325609110169492e-06, "loss": 1.5589, "mean_token_accuracy": 0.664626482874155, "num_tokens": 4104225.0, "step": 5094 }, { "epoch": 1.3495762711864407, "grad_norm": 1.909621238708496, "learning_rate": 9.325344279661017e-06, "loss": 1.3699, "mean_token_accuracy": 0.684629499912262, "num_tokens": 4105593.0, "step": 5096 }, { "epoch": 1.3501059322033897, "grad_norm": 1.6670615673065186, "learning_rate": 9.325079449152543e-06, "loss": 1.555, "mean_token_accuracy": 0.6660620346665382, "num_tokens": 4107090.0, "step": 5098 }, { "epoch": 1.350635593220339, "grad_norm": 1.9227067232131958, "learning_rate": 9.324814618644068e-06, "loss": 1.4316, "mean_token_accuracy": 0.6966699734330177, "num_tokens": 4108681.0, "step": 5100 }, { "epoch": 1.351165254237288, "grad_norm": 1.6292037963867188, "learning_rate": 9.324549788135595e-06, "loss": 1.3591, "mean_token_accuracy": 0.711630504578352, "num_tokens": 4110525.0, "step": 5102 }, { "epoch": 1.3516949152542372, "grad_norm": 1.6572003364562988, "learning_rate": 9.32428495762712e-06, "loss": 0.9232, "mean_token_accuracy": 0.7732728123664856, "num_tokens": 4111956.0, "step": 5104 }, { "epoch": 1.3522245762711864, "grad_norm": 1.4951434135437012, "learning_rate": 9.324020127118645e-06, "loss": 1.057, "mean_token_accuracy": 0.7601146474480629, "num_tokens": 4113739.0, "step": 5106 }, { "epoch": 1.3527542372881356, "grad_norm": 1.5610220432281494, "learning_rate": 9.323755296610171e-06, "loss": 1.1141, "mean_token_accuracy": 0.714750275015831, "num_tokens": 4115271.0, "step": 5108 }, { "epoch": 1.3532838983050848, "grad_norm": 1.5119644403457642, "learning_rate": 9.323490466101696e-06, "loss": 1.3956, "mean_token_accuracy": 0.6706979610025883, "num_tokens": 4117183.0, "step": 5110 }, { "epoch": 1.353813559322034, "grad_norm": 1.821108102798462, "learning_rate": 9.323225635593221e-06, "loss": 1.7559, "mean_token_accuracy": 0.646077711135149, "num_tokens": 4118632.0, "step": 5112 }, { "epoch": 1.3543432203389831, "grad_norm": 1.4000999927520752, "learning_rate": 9.322960805084746e-06, "loss": 1.247, "mean_token_accuracy": 0.7155102267861366, "num_tokens": 4120397.0, "step": 5114 }, { "epoch": 1.354872881355932, "grad_norm": 2.0031230449676514, "learning_rate": 9.322695974576273e-06, "loss": 1.2055, "mean_token_accuracy": 0.7273086383938789, "num_tokens": 4121797.0, "step": 5116 }, { "epoch": 1.3554025423728815, "grad_norm": 1.4766021966934204, "learning_rate": 9.322431144067798e-06, "loss": 1.7291, "mean_token_accuracy": 0.6576252691447735, "num_tokens": 4124293.0, "step": 5118 }, { "epoch": 1.3559322033898304, "grad_norm": 1.5637949705123901, "learning_rate": 9.322166313559322e-06, "loss": 1.3288, "mean_token_accuracy": 0.6660083457827568, "num_tokens": 4125868.0, "step": 5120 }, { "epoch": 1.3564618644067796, "grad_norm": 1.9107213020324707, "learning_rate": 9.321901483050847e-06, "loss": 1.5682, "mean_token_accuracy": 0.6609045304358006, "num_tokens": 4127577.0, "step": 5122 }, { "epoch": 1.3569915254237288, "grad_norm": 1.6078447103500366, "learning_rate": 9.321636652542374e-06, "loss": 1.6163, "mean_token_accuracy": 0.6268018931150436, "num_tokens": 4129148.0, "step": 5124 }, { "epoch": 1.357521186440678, "grad_norm": 1.5678495168685913, "learning_rate": 9.321371822033899e-06, "loss": 1.3972, "mean_token_accuracy": 0.6923469975590706, "num_tokens": 4130572.0, "step": 5126 }, { "epoch": 1.3580508474576272, "grad_norm": 1.700820803642273, "learning_rate": 9.321106991525426e-06, "loss": 1.1248, "mean_token_accuracy": 0.742616705596447, "num_tokens": 4131929.0, "step": 5128 }, { "epoch": 1.3585805084745763, "grad_norm": 1.5726653337478638, "learning_rate": 9.32084216101695e-06, "loss": 1.3425, "mean_token_accuracy": 0.707522414624691, "num_tokens": 4133229.0, "step": 5130 }, { "epoch": 1.3591101694915255, "grad_norm": 1.5653812885284424, "learning_rate": 9.320577330508475e-06, "loss": 1.285, "mean_token_accuracy": 0.7005962207913399, "num_tokens": 4134910.0, "step": 5132 }, { "epoch": 1.3596398305084745, "grad_norm": 2.170319080352783, "learning_rate": 9.3203125e-06, "loss": 1.6119, "mean_token_accuracy": 0.6502548158168793, "num_tokens": 4136668.0, "step": 5134 }, { "epoch": 1.3601694915254237, "grad_norm": 1.578755497932434, "learning_rate": 9.320047669491527e-06, "loss": 1.3981, "mean_token_accuracy": 0.6677500680088997, "num_tokens": 4138375.0, "step": 5136 }, { "epoch": 1.3606991525423728, "grad_norm": 1.1434446573257446, "learning_rate": 9.319782838983052e-06, "loss": 1.1498, "mean_token_accuracy": 0.706561490893364, "num_tokens": 4140463.0, "step": 5138 }, { "epoch": 1.361228813559322, "grad_norm": 1.744947075843811, "learning_rate": 9.319518008474577e-06, "loss": 1.1683, "mean_token_accuracy": 0.7339330911636353, "num_tokens": 4142112.0, "step": 5140 }, { "epoch": 1.3617584745762712, "grad_norm": 1.503583312034607, "learning_rate": 9.319253177966102e-06, "loss": 1.3628, "mean_token_accuracy": 0.7107353545725346, "num_tokens": 4143881.0, "step": 5142 }, { "epoch": 1.3622881355932204, "grad_norm": 1.782946228981018, "learning_rate": 9.318988347457628e-06, "loss": 1.7518, "mean_token_accuracy": 0.6168220564723015, "num_tokens": 4145459.0, "step": 5144 }, { "epoch": 1.3628177966101696, "grad_norm": 1.7037485837936401, "learning_rate": 9.318723516949153e-06, "loss": 1.631, "mean_token_accuracy": 0.627388022840023, "num_tokens": 4147477.0, "step": 5146 }, { "epoch": 1.3633474576271185, "grad_norm": 1.1257033348083496, "learning_rate": 9.318458686440678e-06, "loss": 1.1598, "mean_token_accuracy": 0.749479129910469, "num_tokens": 4149200.0, "step": 5148 }, { "epoch": 1.363877118644068, "grad_norm": 1.7330312728881836, "learning_rate": 9.318193855932203e-06, "loss": 1.3891, "mean_token_accuracy": 0.6718858182430267, "num_tokens": 4151025.0, "step": 5150 }, { "epoch": 1.3644067796610169, "grad_norm": 1.6307542324066162, "learning_rate": 9.31792902542373e-06, "loss": 1.4565, "mean_token_accuracy": 0.6791298165917397, "num_tokens": 4152611.0, "step": 5152 }, { "epoch": 1.364936440677966, "grad_norm": 1.4107880592346191, "learning_rate": 9.317664194915255e-06, "loss": 1.1723, "mean_token_accuracy": 0.7017907276749611, "num_tokens": 4154244.0, "step": 5154 }, { "epoch": 1.3654661016949152, "grad_norm": 1.8164794445037842, "learning_rate": 9.317399364406781e-06, "loss": 1.2018, "mean_token_accuracy": 0.7348227277398109, "num_tokens": 4155826.0, "step": 5156 }, { "epoch": 1.3659957627118644, "grad_norm": 1.6878386735916138, "learning_rate": 9.317134533898305e-06, "loss": 1.3548, "mean_token_accuracy": 0.6943356767296791, "num_tokens": 4157386.0, "step": 5158 }, { "epoch": 1.3665254237288136, "grad_norm": 1.6297006607055664, "learning_rate": 9.316869703389831e-06, "loss": 1.532, "mean_token_accuracy": 0.6732223257422447, "num_tokens": 4158946.0, "step": 5160 }, { "epoch": 1.3670550847457628, "grad_norm": 1.5717021226882935, "learning_rate": 9.316604872881356e-06, "loss": 1.5645, "mean_token_accuracy": 0.6463343054056168, "num_tokens": 4160611.0, "step": 5162 }, { "epoch": 1.367584745762712, "grad_norm": 1.4985870122909546, "learning_rate": 9.316340042372883e-06, "loss": 1.3504, "mean_token_accuracy": 0.6962566673755646, "num_tokens": 4162245.0, "step": 5164 }, { "epoch": 1.368114406779661, "grad_norm": 1.6495611667633057, "learning_rate": 9.316075211864408e-06, "loss": 1.2544, "mean_token_accuracy": 0.6916000247001648, "num_tokens": 4164006.0, "step": 5166 }, { "epoch": 1.3686440677966103, "grad_norm": 1.2308406829833984, "learning_rate": 9.315810381355933e-06, "loss": 0.9204, "mean_token_accuracy": 0.7709597051143646, "num_tokens": 4165851.0, "step": 5168 }, { "epoch": 1.3691737288135593, "grad_norm": 1.6594570875167847, "learning_rate": 9.315545550847458e-06, "loss": 1.709, "mean_token_accuracy": 0.6241515465080738, "num_tokens": 4167612.0, "step": 5170 }, { "epoch": 1.3697033898305084, "grad_norm": 1.8502541780471802, "learning_rate": 9.315280720338984e-06, "loss": 1.3052, "mean_token_accuracy": 0.6928523257374763, "num_tokens": 4168897.0, "step": 5172 }, { "epoch": 1.3702330508474576, "grad_norm": 1.7966408729553223, "learning_rate": 9.315015889830509e-06, "loss": 1.449, "mean_token_accuracy": 0.6570396944880486, "num_tokens": 4171081.0, "step": 5174 }, { "epoch": 1.3707627118644068, "grad_norm": 1.8687655925750732, "learning_rate": 9.314751059322034e-06, "loss": 1.7999, "mean_token_accuracy": 0.6250966116786003, "num_tokens": 4172867.0, "step": 5176 }, { "epoch": 1.371292372881356, "grad_norm": 1.439591884613037, "learning_rate": 9.314486228813559e-06, "loss": 1.3276, "mean_token_accuracy": 0.7297719940543175, "num_tokens": 4174350.0, "step": 5178 }, { "epoch": 1.3718220338983051, "grad_norm": 1.899170994758606, "learning_rate": 9.314221398305086e-06, "loss": 1.5582, "mean_token_accuracy": 0.6529218107461929, "num_tokens": 4175925.0, "step": 5180 }, { "epoch": 1.3723516949152543, "grad_norm": 1.796347975730896, "learning_rate": 9.31395656779661e-06, "loss": 1.1097, "mean_token_accuracy": 0.7345411479473114, "num_tokens": 4177191.0, "step": 5182 }, { "epoch": 1.3728813559322033, "grad_norm": 1.844468355178833, "learning_rate": 9.313691737288137e-06, "loss": 1.5349, "mean_token_accuracy": 0.6693746596574783, "num_tokens": 4178971.0, "step": 5184 }, { "epoch": 1.3734110169491525, "grad_norm": 1.8912568092346191, "learning_rate": 9.313426906779662e-06, "loss": 1.4358, "mean_token_accuracy": 0.6847611740231514, "num_tokens": 4180483.0, "step": 5186 }, { "epoch": 1.3739406779661016, "grad_norm": 1.697404146194458, "learning_rate": 9.313162076271187e-06, "loss": 1.3716, "mean_token_accuracy": 0.7123295962810516, "num_tokens": 4181928.0, "step": 5188 }, { "epoch": 1.3744703389830508, "grad_norm": 1.8451056480407715, "learning_rate": 9.312897245762714e-06, "loss": 1.078, "mean_token_accuracy": 0.7204287275671959, "num_tokens": 4183400.0, "step": 5190 }, { "epoch": 1.375, "grad_norm": 1.9055856466293335, "learning_rate": 9.312632415254239e-06, "loss": 1.3181, "mean_token_accuracy": 0.7046956419944763, "num_tokens": 4185089.0, "step": 5192 }, { "epoch": 1.3755296610169492, "grad_norm": 1.6555750370025635, "learning_rate": 9.312367584745764e-06, "loss": 1.1912, "mean_token_accuracy": 0.7185462191700935, "num_tokens": 4186553.0, "step": 5194 }, { "epoch": 1.3760593220338984, "grad_norm": 1.7991571426391602, "learning_rate": 9.312102754237288e-06, "loss": 1.3291, "mean_token_accuracy": 0.6892564371228218, "num_tokens": 4187867.0, "step": 5196 }, { "epoch": 1.3765889830508475, "grad_norm": 1.6574410200119019, "learning_rate": 9.311837923728815e-06, "loss": 1.0495, "mean_token_accuracy": 0.7384256571531296, "num_tokens": 4189414.0, "step": 5198 }, { "epoch": 1.3771186440677967, "grad_norm": 2.0771756172180176, "learning_rate": 9.31157309322034e-06, "loss": 1.7658, "mean_token_accuracy": 0.6486720889806747, "num_tokens": 4190884.0, "step": 5200 }, { "epoch": 1.3776483050847457, "grad_norm": 1.0560200214385986, "learning_rate": 9.311308262711865e-06, "loss": 1.442, "mean_token_accuracy": 0.655690111219883, "num_tokens": 4193326.0, "step": 5202 }, { "epoch": 1.3781779661016949, "grad_norm": 1.6636513471603394, "learning_rate": 9.31104343220339e-06, "loss": 1.5434, "mean_token_accuracy": 0.6948498785495758, "num_tokens": 4194744.0, "step": 5204 }, { "epoch": 1.378707627118644, "grad_norm": 1.5045472383499146, "learning_rate": 9.310778601694916e-06, "loss": 0.8998, "mean_token_accuracy": 0.7819545269012451, "num_tokens": 4196404.0, "step": 5206 }, { "epoch": 1.3792372881355932, "grad_norm": 1.4648953676223755, "learning_rate": 9.310513771186441e-06, "loss": 1.2065, "mean_token_accuracy": 0.7020022794604301, "num_tokens": 4198064.0, "step": 5208 }, { "epoch": 1.3797669491525424, "grad_norm": 1.6098542213439941, "learning_rate": 9.310248940677968e-06, "loss": 1.2718, "mean_token_accuracy": 0.6939839720726013, "num_tokens": 4199839.0, "step": 5210 }, { "epoch": 1.3802966101694916, "grad_norm": 1.6927330493927002, "learning_rate": 9.309984110169491e-06, "loss": 1.5609, "mean_token_accuracy": 0.677786435931921, "num_tokens": 4201328.0, "step": 5212 }, { "epoch": 1.3808262711864407, "grad_norm": 1.3497024774551392, "learning_rate": 9.309719279661018e-06, "loss": 1.0619, "mean_token_accuracy": 0.750623069703579, "num_tokens": 4202816.0, "step": 5214 }, { "epoch": 1.3813559322033897, "grad_norm": 1.6768603324890137, "learning_rate": 9.309454449152543e-06, "loss": 1.0684, "mean_token_accuracy": 0.7421434000134468, "num_tokens": 4204261.0, "step": 5216 }, { "epoch": 1.381885593220339, "grad_norm": 1.9281079769134521, "learning_rate": 9.30918961864407e-06, "loss": 1.7046, "mean_token_accuracy": 0.6435220316052437, "num_tokens": 4205659.0, "step": 5218 }, { "epoch": 1.382415254237288, "grad_norm": 1.462355136871338, "learning_rate": 9.308924788135594e-06, "loss": 1.3114, "mean_token_accuracy": 0.6774957701563835, "num_tokens": 4207509.0, "step": 5220 }, { "epoch": 1.3829449152542372, "grad_norm": 1.260493516921997, "learning_rate": 9.30865995762712e-06, "loss": 1.1524, "mean_token_accuracy": 0.7304089814424515, "num_tokens": 4209027.0, "step": 5222 }, { "epoch": 1.3834745762711864, "grad_norm": 1.3446400165557861, "learning_rate": 9.308395127118644e-06, "loss": 1.1503, "mean_token_accuracy": 0.7340380176901817, "num_tokens": 4210502.0, "step": 5224 }, { "epoch": 1.3840042372881356, "grad_norm": 1.6366745233535767, "learning_rate": 9.308130296610171e-06, "loss": 1.432, "mean_token_accuracy": 0.6933032870292664, "num_tokens": 4211993.0, "step": 5226 }, { "epoch": 1.3845338983050848, "grad_norm": 1.5662683248519897, "learning_rate": 9.307865466101696e-06, "loss": 1.1988, "mean_token_accuracy": 0.7238853424787521, "num_tokens": 4213454.0, "step": 5228 }, { "epoch": 1.385063559322034, "grad_norm": 1.847344994544983, "learning_rate": 9.30760063559322e-06, "loss": 1.1738, "mean_token_accuracy": 0.7234449610114098, "num_tokens": 4214596.0, "step": 5230 }, { "epoch": 1.3855932203389831, "grad_norm": 1.7913070917129517, "learning_rate": 9.307335805084746e-06, "loss": 1.9732, "mean_token_accuracy": 0.574568185955286, "num_tokens": 4216449.0, "step": 5232 }, { "epoch": 1.386122881355932, "grad_norm": 1.7346841096878052, "learning_rate": 9.307070974576272e-06, "loss": 1.2324, "mean_token_accuracy": 0.7024487555027008, "num_tokens": 4218288.0, "step": 5234 }, { "epoch": 1.3866525423728815, "grad_norm": 1.9904590845108032, "learning_rate": 9.306806144067797e-06, "loss": 1.4813, "mean_token_accuracy": 0.6828065812587738, "num_tokens": 4219742.0, "step": 5236 }, { "epoch": 1.3871822033898304, "grad_norm": 1.3516066074371338, "learning_rate": 9.306541313559324e-06, "loss": 1.2337, "mean_token_accuracy": 0.7190958186984062, "num_tokens": 4221193.0, "step": 5238 }, { "epoch": 1.3877118644067796, "grad_norm": 1.6397740840911865, "learning_rate": 9.306276483050847e-06, "loss": 1.4161, "mean_token_accuracy": 0.6751108206808567, "num_tokens": 4222761.0, "step": 5240 }, { "epoch": 1.3882415254237288, "grad_norm": 1.2423231601715088, "learning_rate": 9.306011652542374e-06, "loss": 1.1489, "mean_token_accuracy": 0.7663580104708672, "num_tokens": 4225018.0, "step": 5242 }, { "epoch": 1.388771186440678, "grad_norm": 1.5579432249069214, "learning_rate": 9.305746822033899e-06, "loss": 1.1037, "mean_token_accuracy": 0.7319532781839371, "num_tokens": 4226517.0, "step": 5244 }, { "epoch": 1.3893008474576272, "grad_norm": 1.7265757322311401, "learning_rate": 9.305481991525425e-06, "loss": 1.0184, "mean_token_accuracy": 0.7438325807452202, "num_tokens": 4228087.0, "step": 5246 }, { "epoch": 1.3898305084745763, "grad_norm": 1.640321135520935, "learning_rate": 9.30521716101695e-06, "loss": 1.05, "mean_token_accuracy": 0.7527054771780968, "num_tokens": 4229475.0, "step": 5248 }, { "epoch": 1.3903601694915255, "grad_norm": 1.8661562204360962, "learning_rate": 9.304952330508475e-06, "loss": 1.4712, "step": 5250 }, { "epoch": 1.3903601694915255, "eval_loss": 1.319163203239441, "eval_mean_token_accuracy": 0.699265181147433, "eval_num_tokens": 4231026.0, "eval_runtime": 48.3154, "eval_samples_per_second": 6.375, "eval_steps_per_second": 6.375, "step": 5250 }, { "epoch": 1.3908898305084745, "grad_norm": 1.3698481321334839, "learning_rate": 9.3046875e-06, "loss": 1.2722, "mean_token_accuracy": 0.6994674652814865, "num_tokens": 4232822.0, "step": 5252 }, { "epoch": 1.3914194915254237, "grad_norm": 1.2337161302566528, "learning_rate": 9.304422669491527e-06, "loss": 0.9472, "mean_token_accuracy": 0.7693755626678467, "num_tokens": 4234398.0, "step": 5254 }, { "epoch": 1.3919491525423728, "grad_norm": 1.539648175239563, "learning_rate": 9.304157838983052e-06, "loss": 1.1563, "mean_token_accuracy": 0.7153426185250282, "num_tokens": 4235980.0, "step": 5256 }, { "epoch": 1.392478813559322, "grad_norm": 1.609544038772583, "learning_rate": 9.303893008474576e-06, "loss": 1.2784, "mean_token_accuracy": 0.6967669501900673, "num_tokens": 4237385.0, "step": 5258 }, { "epoch": 1.3930084745762712, "grad_norm": 1.6515127420425415, "learning_rate": 9.303628177966101e-06, "loss": 0.921, "mean_token_accuracy": 0.7736379131674767, "num_tokens": 4238956.0, "step": 5260 }, { "epoch": 1.3935381355932204, "grad_norm": 1.5726020336151123, "learning_rate": 9.303363347457628e-06, "loss": 1.3467, "mean_token_accuracy": 0.6883268803358078, "num_tokens": 4240407.0, "step": 5262 }, { "epoch": 1.3940677966101696, "grad_norm": 1.526702642440796, "learning_rate": 9.303098516949153e-06, "loss": 1.1862, "mean_token_accuracy": 0.7199484705924988, "num_tokens": 4241945.0, "step": 5264 }, { "epoch": 1.3945974576271185, "grad_norm": 1.8866124153137207, "learning_rate": 9.302833686440678e-06, "loss": 1.4005, "mean_token_accuracy": 0.7072860486805439, "num_tokens": 4243572.0, "step": 5266 }, { "epoch": 1.395127118644068, "grad_norm": 1.4921563863754272, "learning_rate": 9.302568855932203e-06, "loss": 1.3747, "mean_token_accuracy": 0.704690046608448, "num_tokens": 4245443.0, "step": 5268 }, { "epoch": 1.3956567796610169, "grad_norm": 1.6749764680862427, "learning_rate": 9.30230402542373e-06, "loss": 1.3815, "mean_token_accuracy": 0.6776855513453484, "num_tokens": 4246928.0, "step": 5270 }, { "epoch": 1.396186440677966, "grad_norm": 1.746239185333252, "learning_rate": 9.302039194915256e-06, "loss": 1.0052, "mean_token_accuracy": 0.7858079820871353, "num_tokens": 4248685.0, "step": 5272 }, { "epoch": 1.3967161016949152, "grad_norm": 1.5859495401382446, "learning_rate": 9.301774364406781e-06, "loss": 1.4378, "mean_token_accuracy": 0.6627915948629379, "num_tokens": 4250440.0, "step": 5274 }, { "epoch": 1.3972457627118644, "grad_norm": 1.4485514163970947, "learning_rate": 9.301509533898306e-06, "loss": 1.2743, "mean_token_accuracy": 0.6849423348903656, "num_tokens": 4252150.0, "step": 5276 }, { "epoch": 1.3977754237288136, "grad_norm": 1.9893310070037842, "learning_rate": 9.301244703389831e-06, "loss": 1.4097, "mean_token_accuracy": 0.7102979496121407, "num_tokens": 4253561.0, "step": 5278 }, { "epoch": 1.3983050847457628, "grad_norm": 1.811171293258667, "learning_rate": 9.300979872881357e-06, "loss": 1.7083, "mean_token_accuracy": 0.6665230020880699, "num_tokens": 4254835.0, "step": 5280 }, { "epoch": 1.398834745762712, "grad_norm": 1.552575945854187, "learning_rate": 9.300715042372882e-06, "loss": 1.2898, "mean_token_accuracy": 0.6767791509628296, "num_tokens": 4256556.0, "step": 5282 }, { "epoch": 1.399364406779661, "grad_norm": 1.5674163103103638, "learning_rate": 9.300450211864407e-06, "loss": 1.3505, "mean_token_accuracy": 0.6917630136013031, "num_tokens": 4257941.0, "step": 5284 }, { "epoch": 1.3998940677966103, "grad_norm": 1.42085862159729, "learning_rate": 9.300185381355932e-06, "loss": 1.4149, "mean_token_accuracy": 0.692610427737236, "num_tokens": 4259646.0, "step": 5286 }, { "epoch": 1.4004237288135593, "grad_norm": 1.5331659317016602, "learning_rate": 9.299920550847459e-06, "loss": 1.3769, "mean_token_accuracy": 0.6944290921092033, "num_tokens": 4261158.0, "step": 5288 }, { "epoch": 1.4009533898305084, "grad_norm": 1.6469252109527588, "learning_rate": 9.299655720338984e-06, "loss": 1.48, "mean_token_accuracy": 0.6871114522218704, "num_tokens": 4262773.0, "step": 5290 }, { "epoch": 1.4014830508474576, "grad_norm": 1.8094438314437866, "learning_rate": 9.29939088983051e-06, "loss": 0.9898, "mean_token_accuracy": 0.7554563656449318, "num_tokens": 4264563.0, "step": 5292 }, { "epoch": 1.4020127118644068, "grad_norm": 1.5571016073226929, "learning_rate": 9.299126059322034e-06, "loss": 1.1175, "mean_token_accuracy": 0.7269973382353783, "num_tokens": 4266195.0, "step": 5294 }, { "epoch": 1.402542372881356, "grad_norm": 1.5987058877944946, "learning_rate": 9.29886122881356e-06, "loss": 1.018, "mean_token_accuracy": 0.7525079399347305, "num_tokens": 4267959.0, "step": 5296 }, { "epoch": 1.4030720338983051, "grad_norm": 1.6529312133789062, "learning_rate": 9.298596398305085e-06, "loss": 1.0641, "mean_token_accuracy": 0.7440296038985252, "num_tokens": 4269504.0, "step": 5298 }, { "epoch": 1.4036016949152543, "grad_norm": 1.4897249937057495, "learning_rate": 9.298331567796612e-06, "loss": 1.1941, "mean_token_accuracy": 0.7072054818272591, "num_tokens": 4271077.0, "step": 5300 }, { "epoch": 1.4041313559322033, "grad_norm": 1.7337689399719238, "learning_rate": 9.298066737288137e-06, "loss": 1.3566, "mean_token_accuracy": 0.6895507946610451, "num_tokens": 4272622.0, "step": 5302 }, { "epoch": 1.4046610169491525, "grad_norm": 1.653967261314392, "learning_rate": 9.297801906779662e-06, "loss": 1.5207, "mean_token_accuracy": 0.6661583408713341, "num_tokens": 4274346.0, "step": 5304 }, { "epoch": 1.4051906779661016, "grad_norm": 1.6559638977050781, "learning_rate": 9.297537076271187e-06, "loss": 1.0183, "mean_token_accuracy": 0.755540281534195, "num_tokens": 4275839.0, "step": 5306 }, { "epoch": 1.4057203389830508, "grad_norm": 1.795098900794983, "learning_rate": 9.297272245762713e-06, "loss": 1.2784, "mean_token_accuracy": 0.7103057578206062, "num_tokens": 4277621.0, "step": 5308 }, { "epoch": 1.40625, "grad_norm": 1.5847315788269043, "learning_rate": 9.297007415254238e-06, "loss": 1.3636, "mean_token_accuracy": 0.6759712100028992, "num_tokens": 4279479.0, "step": 5310 }, { "epoch": 1.4067796610169492, "grad_norm": 1.475233554840088, "learning_rate": 9.296742584745763e-06, "loss": 0.8483, "mean_token_accuracy": 0.8004929944872856, "num_tokens": 4280876.0, "step": 5312 }, { "epoch": 1.4073093220338984, "grad_norm": 1.4207369089126587, "learning_rate": 9.296477754237288e-06, "loss": 0.9159, "mean_token_accuracy": 0.7803255394101143, "num_tokens": 4282267.0, "step": 5314 }, { "epoch": 1.4078389830508475, "grad_norm": 1.9542515277862549, "learning_rate": 9.296212923728815e-06, "loss": 1.5743, "mean_token_accuracy": 0.639894500374794, "num_tokens": 4284012.0, "step": 5316 }, { "epoch": 1.4083686440677967, "grad_norm": 1.7913132905960083, "learning_rate": 9.29594809322034e-06, "loss": 1.4814, "mean_token_accuracy": 0.6673025861382484, "num_tokens": 4285333.0, "step": 5318 }, { "epoch": 1.4088983050847457, "grad_norm": 1.8413515090942383, "learning_rate": 9.295683262711865e-06, "loss": 1.9431, "mean_token_accuracy": 0.6056111082434654, "num_tokens": 4287342.0, "step": 5320 }, { "epoch": 1.4094279661016949, "grad_norm": 1.5511759519577026, "learning_rate": 9.29541843220339e-06, "loss": 1.1136, "mean_token_accuracy": 0.7401007041335106, "num_tokens": 4288790.0, "step": 5322 }, { "epoch": 1.409957627118644, "grad_norm": 1.7660281658172607, "learning_rate": 9.295153601694916e-06, "loss": 1.5175, "mean_token_accuracy": 0.6533787101507187, "num_tokens": 4290416.0, "step": 5324 }, { "epoch": 1.4104872881355932, "grad_norm": 1.2969948053359985, "learning_rate": 9.294888771186441e-06, "loss": 1.3544, "mean_token_accuracy": 0.6841111928224564, "num_tokens": 4292139.0, "step": 5326 }, { "epoch": 1.4110169491525424, "grad_norm": 1.738189458847046, "learning_rate": 9.294623940677968e-06, "loss": 1.4396, "mean_token_accuracy": 0.7024710327386856, "num_tokens": 4293669.0, "step": 5328 }, { "epoch": 1.4115466101694916, "grad_norm": 1.6438758373260498, "learning_rate": 9.294359110169493e-06, "loss": 1.1782, "mean_token_accuracy": 0.7192066162824631, "num_tokens": 4295263.0, "step": 5330 }, { "epoch": 1.4120762711864407, "grad_norm": 1.5691224336624146, "learning_rate": 9.294094279661018e-06, "loss": 1.0105, "mean_token_accuracy": 0.7504457384347916, "num_tokens": 4296821.0, "step": 5332 }, { "epoch": 1.4126059322033897, "grad_norm": 1.4824209213256836, "learning_rate": 9.293829449152542e-06, "loss": 1.0899, "mean_token_accuracy": 0.7498397901654243, "num_tokens": 4298116.0, "step": 5334 }, { "epoch": 1.413135593220339, "grad_norm": 1.5800788402557373, "learning_rate": 9.293564618644069e-06, "loss": 1.2725, "mean_token_accuracy": 0.6894366592168808, "num_tokens": 4299864.0, "step": 5336 }, { "epoch": 1.413665254237288, "grad_norm": 1.8923290967941284, "learning_rate": 9.293299788135594e-06, "loss": 1.3838, "mean_token_accuracy": 0.6834067180752754, "num_tokens": 4301385.0, "step": 5338 }, { "epoch": 1.4141949152542372, "grad_norm": 1.8220547437667847, "learning_rate": 9.293034957627119e-06, "loss": 1.4051, "mean_token_accuracy": 0.6870060935616493, "num_tokens": 4302855.0, "step": 5340 }, { "epoch": 1.4147245762711864, "grad_norm": 2.015923261642456, "learning_rate": 9.292770127118644e-06, "loss": 1.6006, "mean_token_accuracy": 0.6562168076634407, "num_tokens": 4304507.0, "step": 5342 }, { "epoch": 1.4152542372881356, "grad_norm": 1.7788807153701782, "learning_rate": 9.29250529661017e-06, "loss": 1.8083, "mean_token_accuracy": 0.5983685553073883, "num_tokens": 4306031.0, "step": 5344 }, { "epoch": 1.4157838983050848, "grad_norm": 1.6982389688491821, "learning_rate": 9.292240466101695e-06, "loss": 1.3951, "mean_token_accuracy": 0.6599730402231216, "num_tokens": 4307677.0, "step": 5346 }, { "epoch": 1.416313559322034, "grad_norm": 1.983832836151123, "learning_rate": 9.29197563559322e-06, "loss": 1.2124, "mean_token_accuracy": 0.7263487428426743, "num_tokens": 4308990.0, "step": 5348 }, { "epoch": 1.4168432203389831, "grad_norm": 1.5877279043197632, "learning_rate": 9.291710805084745e-06, "loss": 1.3365, "mean_token_accuracy": 0.7005249708890915, "num_tokens": 4310671.0, "step": 5350 }, { "epoch": 1.417372881355932, "grad_norm": 1.611264944076538, "learning_rate": 9.291445974576272e-06, "loss": 1.1349, "mean_token_accuracy": 0.7079000920057297, "num_tokens": 4312317.0, "step": 5352 }, { "epoch": 1.4179025423728815, "grad_norm": 1.677304983139038, "learning_rate": 9.291181144067799e-06, "loss": 1.179, "mean_token_accuracy": 0.7644586861133575, "num_tokens": 4313762.0, "step": 5354 }, { "epoch": 1.4184322033898304, "grad_norm": 2.0500941276550293, "learning_rate": 9.290916313559323e-06, "loss": 1.5517, "mean_token_accuracy": 0.650062307715416, "num_tokens": 4315198.0, "step": 5356 }, { "epoch": 1.4189618644067796, "grad_norm": 1.276079773902893, "learning_rate": 9.290651483050848e-06, "loss": 1.4037, "mean_token_accuracy": 0.7164890244603157, "num_tokens": 4317652.0, "step": 5358 }, { "epoch": 1.4194915254237288, "grad_norm": 1.6260713338851929, "learning_rate": 9.290386652542373e-06, "loss": 1.1041, "mean_token_accuracy": 0.7291971296072006, "num_tokens": 4319145.0, "step": 5360 }, { "epoch": 1.420021186440678, "grad_norm": 1.5652374029159546, "learning_rate": 9.2901218220339e-06, "loss": 1.7419, "mean_token_accuracy": 0.6111232861876488, "num_tokens": 4320917.0, "step": 5362 }, { "epoch": 1.4205508474576272, "grad_norm": 1.69358229637146, "learning_rate": 9.289856991525425e-06, "loss": 1.2558, "mean_token_accuracy": 0.71239423006773, "num_tokens": 4322630.0, "step": 5364 }, { "epoch": 1.4210805084745763, "grad_norm": 1.7843414545059204, "learning_rate": 9.28959216101695e-06, "loss": 1.8193, "mean_token_accuracy": 0.5790894962847233, "num_tokens": 4324569.0, "step": 5366 }, { "epoch": 1.4216101694915255, "grad_norm": 1.317030906677246, "learning_rate": 9.289327330508475e-06, "loss": 1.4405, "mean_token_accuracy": 0.6785231083631516, "num_tokens": 4326430.0, "step": 5368 }, { "epoch": 1.4221398305084745, "grad_norm": 1.7762861251831055, "learning_rate": 9.289062500000001e-06, "loss": 1.5647, "mean_token_accuracy": 0.656885139644146, "num_tokens": 4328156.0, "step": 5370 }, { "epoch": 1.4226694915254237, "grad_norm": 1.697489619255066, "learning_rate": 9.288797669491526e-06, "loss": 1.3932, "mean_token_accuracy": 0.6879876330494881, "num_tokens": 4329674.0, "step": 5372 }, { "epoch": 1.4231991525423728, "grad_norm": 1.7570877075195312, "learning_rate": 9.288532838983051e-06, "loss": 1.628, "mean_token_accuracy": 0.6523787155747414, "num_tokens": 4331240.0, "step": 5374 }, { "epoch": 1.423728813559322, "grad_norm": 1.8336207866668701, "learning_rate": 9.288268008474576e-06, "loss": 1.2565, "mean_token_accuracy": 0.7151982858777046, "num_tokens": 4332584.0, "step": 5376 }, { "epoch": 1.4242584745762712, "grad_norm": 1.21275794506073, "learning_rate": 9.288003177966103e-06, "loss": 1.1578, "mean_token_accuracy": 0.7219012081623077, "num_tokens": 4334276.0, "step": 5378 }, { "epoch": 1.4247881355932204, "grad_norm": 1.9618594646453857, "learning_rate": 9.287738347457628e-06, "loss": 1.3735, "mean_token_accuracy": 0.7107426449656487, "num_tokens": 4335521.0, "step": 5380 }, { "epoch": 1.4253177966101696, "grad_norm": 1.771681785583496, "learning_rate": 9.287473516949154e-06, "loss": 1.3455, "mean_token_accuracy": 0.6845500022172928, "num_tokens": 4336960.0, "step": 5382 }, { "epoch": 1.4258474576271185, "grad_norm": 1.4901988506317139, "learning_rate": 9.28720868644068e-06, "loss": 1.4604, "mean_token_accuracy": 0.6751112043857574, "num_tokens": 4338728.0, "step": 5384 }, { "epoch": 1.426377118644068, "grad_norm": 1.418386459350586, "learning_rate": 9.286943855932204e-06, "loss": 1.3548, "mean_token_accuracy": 0.6783330887556076, "num_tokens": 4340388.0, "step": 5386 }, { "epoch": 1.4269067796610169, "grad_norm": 1.8364057540893555, "learning_rate": 9.286679025423729e-06, "loss": 1.241, "mean_token_accuracy": 0.7371139153838158, "num_tokens": 4341788.0, "step": 5388 }, { "epoch": 1.427436440677966, "grad_norm": 1.6928002834320068, "learning_rate": 9.286414194915256e-06, "loss": 1.8299, "mean_token_accuracy": 0.5953653417527676, "num_tokens": 4343491.0, "step": 5390 }, { "epoch": 1.4279661016949152, "grad_norm": 1.8547114133834839, "learning_rate": 9.28614936440678e-06, "loss": 1.2438, "mean_token_accuracy": 0.7027894333004951, "num_tokens": 4345106.0, "step": 5392 }, { "epoch": 1.4284957627118644, "grad_norm": 1.4036765098571777, "learning_rate": 9.285884533898306e-06, "loss": 1.122, "mean_token_accuracy": 0.7292009443044662, "num_tokens": 4346507.0, "step": 5394 }, { "epoch": 1.4290254237288136, "grad_norm": 1.4624288082122803, "learning_rate": 9.28561970338983e-06, "loss": 1.8019, "mean_token_accuracy": 0.6024072915315628, "num_tokens": 4348560.0, "step": 5396 }, { "epoch": 1.4295550847457628, "grad_norm": 1.7047691345214844, "learning_rate": 9.285354872881357e-06, "loss": 1.3116, "mean_token_accuracy": 0.7145968973636627, "num_tokens": 4350113.0, "step": 5398 }, { "epoch": 1.430084745762712, "grad_norm": 1.3249807357788086, "learning_rate": 9.285090042372882e-06, "loss": 0.954, "mean_token_accuracy": 0.759150892496109, "num_tokens": 4351755.0, "step": 5400 }, { "epoch": 1.430614406779661, "grad_norm": 1.5327266454696655, "learning_rate": 9.284825211864407e-06, "loss": 1.5143, "mean_token_accuracy": 0.6734919957816601, "num_tokens": 4353337.0, "step": 5402 }, { "epoch": 1.4311440677966103, "grad_norm": 1.5723457336425781, "learning_rate": 9.284560381355932e-06, "loss": 1.0291, "mean_token_accuracy": 0.7709936797618866, "num_tokens": 4354588.0, "step": 5404 }, { "epoch": 1.4316737288135593, "grad_norm": 1.4439369440078735, "learning_rate": 9.284295550847459e-06, "loss": 1.0936, "mean_token_accuracy": 0.7552289515733719, "num_tokens": 4356558.0, "step": 5406 }, { "epoch": 1.4322033898305084, "grad_norm": 1.7306702136993408, "learning_rate": 9.284030720338983e-06, "loss": 1.412, "mean_token_accuracy": 0.673942893743515, "num_tokens": 4358132.0, "step": 5408 }, { "epoch": 1.4327330508474576, "grad_norm": 1.690846562385559, "learning_rate": 9.28376588983051e-06, "loss": 1.8702, "mean_token_accuracy": 0.5990024507045746, "num_tokens": 4359714.0, "step": 5410 }, { "epoch": 1.4332627118644068, "grad_norm": 2.2818763256073, "learning_rate": 9.283501059322035e-06, "loss": 1.4371, "mean_token_accuracy": 0.6687829568982124, "num_tokens": 4360905.0, "step": 5412 }, { "epoch": 1.433792372881356, "grad_norm": 1.8934152126312256, "learning_rate": 9.28323622881356e-06, "loss": 1.3529, "mean_token_accuracy": 0.7146824859082699, "num_tokens": 4362112.0, "step": 5414 }, { "epoch": 1.4343220338983051, "grad_norm": 1.8068848848342896, "learning_rate": 9.282971398305085e-06, "loss": 1.4042, "mean_token_accuracy": 0.6857500001788139, "num_tokens": 4363827.0, "step": 5416 }, { "epoch": 1.4348516949152543, "grad_norm": 1.921907663345337, "learning_rate": 9.282706567796611e-06, "loss": 1.695, "mean_token_accuracy": 0.6224986985325813, "num_tokens": 4365439.0, "step": 5418 }, { "epoch": 1.4353813559322033, "grad_norm": 1.3832722902297974, "learning_rate": 9.282441737288136e-06, "loss": 1.0463, "mean_token_accuracy": 0.7342897802591324, "num_tokens": 4367110.0, "step": 5420 }, { "epoch": 1.4359110169491525, "grad_norm": 2.180844783782959, "learning_rate": 9.282176906779661e-06, "loss": 1.4588, "mean_token_accuracy": 0.7020030096173286, "num_tokens": 4368586.0, "step": 5422 }, { "epoch": 1.4364406779661016, "grad_norm": 1.2097012996673584, "learning_rate": 9.281912076271186e-06, "loss": 1.0034, "mean_token_accuracy": 0.7796728238463402, "num_tokens": 4371029.0, "step": 5424 }, { "epoch": 1.4369703389830508, "grad_norm": 1.7687842845916748, "learning_rate": 9.281647245762713e-06, "loss": 1.4908, "mean_token_accuracy": 0.6774640530347824, "num_tokens": 4372605.0, "step": 5426 }, { "epoch": 1.4375, "grad_norm": 2.0669190883636475, "learning_rate": 9.281382415254238e-06, "loss": 1.4376, "mean_token_accuracy": 0.6730642542243004, "num_tokens": 4374370.0, "step": 5428 }, { "epoch": 1.4380296610169492, "grad_norm": 1.5165929794311523, "learning_rate": 9.281117584745763e-06, "loss": 1.6477, "mean_token_accuracy": 0.6308058127760887, "num_tokens": 4376003.0, "step": 5430 }, { "epoch": 1.4385593220338984, "grad_norm": 1.7320244312286377, "learning_rate": 9.280852754237288e-06, "loss": 1.2543, "mean_token_accuracy": 0.7162756323814392, "num_tokens": 4377477.0, "step": 5432 }, { "epoch": 1.4390889830508475, "grad_norm": 1.9318203926086426, "learning_rate": 9.280587923728814e-06, "loss": 1.5524, "mean_token_accuracy": 0.6774516627192497, "num_tokens": 4378935.0, "step": 5434 }, { "epoch": 1.4396186440677967, "grad_norm": 1.8968863487243652, "learning_rate": 9.28032309322034e-06, "loss": 1.4498, "mean_token_accuracy": 0.6883714720606804, "num_tokens": 4380641.0, "step": 5436 }, { "epoch": 1.4401483050847457, "grad_norm": 1.5268253087997437, "learning_rate": 9.280058262711866e-06, "loss": 1.3093, "mean_token_accuracy": 0.7086073160171509, "num_tokens": 4382251.0, "step": 5438 }, { "epoch": 1.4406779661016949, "grad_norm": 1.8424655199050903, "learning_rate": 9.27979343220339e-06, "loss": 1.5289, "mean_token_accuracy": 0.6453633829951286, "num_tokens": 4384095.0, "step": 5440 }, { "epoch": 1.441207627118644, "grad_norm": 1.2710593938827515, "learning_rate": 9.279528601694916e-06, "loss": 1.306, "mean_token_accuracy": 0.6983642354607582, "num_tokens": 4386000.0, "step": 5442 }, { "epoch": 1.4417372881355932, "grad_norm": 1.555474042892456, "learning_rate": 9.279263771186442e-06, "loss": 1.8805, "mean_token_accuracy": 0.6067388355731964, "num_tokens": 4387769.0, "step": 5444 }, { "epoch": 1.4422669491525424, "grad_norm": 1.5569326877593994, "learning_rate": 9.278998940677967e-06, "loss": 1.358, "mean_token_accuracy": 0.6834305226802826, "num_tokens": 4389188.0, "step": 5446 }, { "epoch": 1.4427966101694916, "grad_norm": 1.822244644165039, "learning_rate": 9.278734110169492e-06, "loss": 1.2834, "mean_token_accuracy": 0.7194046452641487, "num_tokens": 4390696.0, "step": 5448 }, { "epoch": 1.4433262711864407, "grad_norm": 1.5940788984298706, "learning_rate": 9.278469279661017e-06, "loss": 1.0177, "mean_token_accuracy": 0.7550387904047966, "num_tokens": 4392166.0, "step": 5450 }, { "epoch": 1.4438559322033897, "grad_norm": 1.7595908641815186, "learning_rate": 9.278204449152544e-06, "loss": 1.4277, "mean_token_accuracy": 0.6745292618870735, "num_tokens": 4393593.0, "step": 5452 }, { "epoch": 1.444385593220339, "grad_norm": 1.3682115077972412, "learning_rate": 9.277939618644069e-06, "loss": 1.2758, "mean_token_accuracy": 0.7089498192071915, "num_tokens": 4395217.0, "step": 5454 }, { "epoch": 1.444915254237288, "grad_norm": 1.5858887434005737, "learning_rate": 9.277674788135594e-06, "loss": 1.4912, "mean_token_accuracy": 0.6707531586289406, "num_tokens": 4396910.0, "step": 5456 }, { "epoch": 1.4454449152542372, "grad_norm": 1.4382411241531372, "learning_rate": 9.277409957627119e-06, "loss": 1.2931, "mean_token_accuracy": 0.6988636665046215, "num_tokens": 4398859.0, "step": 5458 }, { "epoch": 1.4459745762711864, "grad_norm": 1.4420959949493408, "learning_rate": 9.277145127118645e-06, "loss": 1.5675, "mean_token_accuracy": 0.6670750603079796, "num_tokens": 4400658.0, "step": 5460 }, { "epoch": 1.4465042372881356, "grad_norm": 1.6946957111358643, "learning_rate": 9.27688029661017e-06, "loss": 1.1974, "mean_token_accuracy": 0.717374175786972, "num_tokens": 4402216.0, "step": 5462 }, { "epoch": 1.4470338983050848, "grad_norm": 1.440645694732666, "learning_rate": 9.276615466101697e-06, "loss": 1.2279, "mean_token_accuracy": 0.7063122317194939, "num_tokens": 4403782.0, "step": 5464 }, { "epoch": 1.447563559322034, "grad_norm": 1.4869210720062256, "learning_rate": 9.276350635593222e-06, "loss": 1.1599, "mean_token_accuracy": 0.7296118512749672, "num_tokens": 4405242.0, "step": 5466 }, { "epoch": 1.4480932203389831, "grad_norm": 1.5828795433044434, "learning_rate": 9.276085805084747e-06, "loss": 1.2645, "mean_token_accuracy": 0.7237806618213654, "num_tokens": 4406952.0, "step": 5468 }, { "epoch": 1.448622881355932, "grad_norm": 1.626190423965454, "learning_rate": 9.275820974576272e-06, "loss": 1.2947, "mean_token_accuracy": 0.7007283344864845, "num_tokens": 4408592.0, "step": 5470 }, { "epoch": 1.4491525423728815, "grad_norm": 1.8976467847824097, "learning_rate": 9.275556144067798e-06, "loss": 1.8002, "mean_token_accuracy": 0.6502741724252701, "num_tokens": 4410196.0, "step": 5472 }, { "epoch": 1.4496822033898304, "grad_norm": 1.8975756168365479, "learning_rate": 9.275291313559323e-06, "loss": 1.7408, "mean_token_accuracy": 0.6098647862672806, "num_tokens": 4411929.0, "step": 5474 }, { "epoch": 1.4502118644067796, "grad_norm": 1.7610692977905273, "learning_rate": 9.275026483050848e-06, "loss": 1.0002, "mean_token_accuracy": 0.7645928338170052, "num_tokens": 4413479.0, "step": 5476 }, { "epoch": 1.4507415254237288, "grad_norm": 1.618822693824768, "learning_rate": 9.274761652542373e-06, "loss": 1.4378, "mean_token_accuracy": 0.6513976082205772, "num_tokens": 4415140.0, "step": 5478 }, { "epoch": 1.451271186440678, "grad_norm": 1.886459469795227, "learning_rate": 9.2744968220339e-06, "loss": 1.2861, "mean_token_accuracy": 0.7041894197463989, "num_tokens": 4416511.0, "step": 5480 }, { "epoch": 1.4518008474576272, "grad_norm": 1.9290143251419067, "learning_rate": 9.274231991525424e-06, "loss": 1.5707, "mean_token_accuracy": 0.6705638915300369, "num_tokens": 4417904.0, "step": 5482 }, { "epoch": 1.4523305084745763, "grad_norm": 1.749903678894043, "learning_rate": 9.27396716101695e-06, "loss": 1.7211, "mean_token_accuracy": 0.6357987187802792, "num_tokens": 4419604.0, "step": 5484 }, { "epoch": 1.4528601694915255, "grad_norm": 2.4233522415161133, "learning_rate": 9.273702330508474e-06, "loss": 1.0484, "mean_token_accuracy": 0.7217623889446259, "num_tokens": 4421046.0, "step": 5486 }, { "epoch": 1.4533898305084745, "grad_norm": 1.7908494472503662, "learning_rate": 9.273437500000001e-06, "loss": 1.5039, "mean_token_accuracy": 0.6676842570304871, "num_tokens": 4422672.0, "step": 5488 }, { "epoch": 1.4539194915254237, "grad_norm": 1.4140055179595947, "learning_rate": 9.273172669491526e-06, "loss": 0.9097, "mean_token_accuracy": 0.7823232337832451, "num_tokens": 4424182.0, "step": 5490 }, { "epoch": 1.4544491525423728, "grad_norm": 1.5690200328826904, "learning_rate": 9.272907838983052e-06, "loss": 1.3413, "mean_token_accuracy": 0.6935189291834831, "num_tokens": 4425700.0, "step": 5492 }, { "epoch": 1.454978813559322, "grad_norm": 1.4346139430999756, "learning_rate": 9.272643008474577e-06, "loss": 1.2228, "mean_token_accuracy": 0.7190987020730972, "num_tokens": 4427340.0, "step": 5494 }, { "epoch": 1.4555084745762712, "grad_norm": 1.7057019472122192, "learning_rate": 9.272378177966102e-06, "loss": 1.4791, "mean_token_accuracy": 0.65315081179142, "num_tokens": 4429154.0, "step": 5496 }, { "epoch": 1.4560381355932204, "grad_norm": 1.966172456741333, "learning_rate": 9.272113347457627e-06, "loss": 1.5933, "mean_token_accuracy": 0.6760683730244637, "num_tokens": 4430787.0, "step": 5498 }, { "epoch": 1.4565677966101696, "grad_norm": 1.5893101692199707, "learning_rate": 9.271848516949154e-06, "loss": 1.2205, "step": 5500 }, { "epoch": 1.4565677966101696, "eval_loss": 1.3183963298797607, "eval_mean_token_accuracy": 0.6995301670455313, "eval_num_tokens": 4432459.0, "eval_runtime": 48.2659, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 5500 }, { "epoch": 1.4570974576271185, "grad_norm": 1.6306771039962769, "learning_rate": 9.271583686440679e-06, "loss": 1.5292, "mean_token_accuracy": 0.6941600609570742, "num_tokens": 4434115.0, "step": 5502 }, { "epoch": 1.457627118644068, "grad_norm": 1.3595836162567139, "learning_rate": 9.271318855932204e-06, "loss": 1.2014, "mean_token_accuracy": 0.7090592533349991, "num_tokens": 4435953.0, "step": 5504 }, { "epoch": 1.4581567796610169, "grad_norm": 1.8893415927886963, "learning_rate": 9.271054025423729e-06, "loss": 1.8947, "mean_token_accuracy": 0.60396359115839, "num_tokens": 4437387.0, "step": 5506 }, { "epoch": 1.458686440677966, "grad_norm": 1.7452539205551147, "learning_rate": 9.270789194915255e-06, "loss": 1.3629, "mean_token_accuracy": 0.7360047549009323, "num_tokens": 4438840.0, "step": 5508 }, { "epoch": 1.4592161016949152, "grad_norm": 1.399499773979187, "learning_rate": 9.27052436440678e-06, "loss": 1.3797, "mean_token_accuracy": 0.6823044791817665, "num_tokens": 4440565.0, "step": 5510 }, { "epoch": 1.4597457627118644, "grad_norm": 1.5930346250534058, "learning_rate": 9.270259533898305e-06, "loss": 1.4449, "mean_token_accuracy": 0.6797993704676628, "num_tokens": 4442337.0, "step": 5512 }, { "epoch": 1.4602754237288136, "grad_norm": 1.4152237176895142, "learning_rate": 9.26999470338983e-06, "loss": 0.7846, "mean_token_accuracy": 0.7895475849509239, "num_tokens": 4443745.0, "step": 5514 }, { "epoch": 1.4608050847457628, "grad_norm": 1.223101019859314, "learning_rate": 9.269729872881357e-06, "loss": 0.7914, "mean_token_accuracy": 0.8018266260623932, "num_tokens": 4445325.0, "step": 5516 }, { "epoch": 1.461334745762712, "grad_norm": 1.735422134399414, "learning_rate": 9.269465042372882e-06, "loss": 1.3325, "mean_token_accuracy": 0.6895175576210022, "num_tokens": 4446848.0, "step": 5518 }, { "epoch": 1.461864406779661, "grad_norm": 1.5637052059173584, "learning_rate": 9.269200211864408e-06, "loss": 1.3053, "mean_token_accuracy": 0.6886930614709854, "num_tokens": 4448377.0, "step": 5520 }, { "epoch": 1.4623940677966103, "grad_norm": 1.6925896406173706, "learning_rate": 9.268935381355933e-06, "loss": 1.7549, "mean_token_accuracy": 0.6198719255626202, "num_tokens": 4449988.0, "step": 5522 }, { "epoch": 1.4629237288135593, "grad_norm": 1.6333086490631104, "learning_rate": 9.268670550847458e-06, "loss": 0.9296, "mean_token_accuracy": 0.7724568396806717, "num_tokens": 4451514.0, "step": 5524 }, { "epoch": 1.4634533898305084, "grad_norm": 1.6747055053710938, "learning_rate": 9.268405720338985e-06, "loss": 1.1034, "mean_token_accuracy": 0.7454763427376747, "num_tokens": 4453076.0, "step": 5526 }, { "epoch": 1.4639830508474576, "grad_norm": 1.7532830238342285, "learning_rate": 9.26814088983051e-06, "loss": 1.5306, "mean_token_accuracy": 0.6585794165730476, "num_tokens": 4454582.0, "step": 5528 }, { "epoch": 1.4645127118644068, "grad_norm": 1.5157239437103271, "learning_rate": 9.267876059322035e-06, "loss": 1.405, "mean_token_accuracy": 0.6767195612192154, "num_tokens": 4456135.0, "step": 5530 }, { "epoch": 1.465042372881356, "grad_norm": 1.397958517074585, "learning_rate": 9.26761122881356e-06, "loss": 1.3197, "mean_token_accuracy": 0.6875319816172123, "num_tokens": 4457912.0, "step": 5532 }, { "epoch": 1.4655720338983051, "grad_norm": 1.7783256769180298, "learning_rate": 9.267346398305086e-06, "loss": 1.4262, "mean_token_accuracy": 0.6764308214187622, "num_tokens": 4459762.0, "step": 5534 }, { "epoch": 1.4661016949152543, "grad_norm": 1.4084779024124146, "learning_rate": 9.267081567796611e-06, "loss": 1.3117, "mean_token_accuracy": 0.7157998457551003, "num_tokens": 4461345.0, "step": 5536 }, { "epoch": 1.4666313559322033, "grad_norm": 1.4128202199935913, "learning_rate": 9.266816737288136e-06, "loss": 1.0357, "mean_token_accuracy": 0.7464183419942856, "num_tokens": 4462729.0, "step": 5538 }, { "epoch": 1.4671610169491525, "grad_norm": 1.6622191667556763, "learning_rate": 9.266551906779661e-06, "loss": 1.4018, "mean_token_accuracy": 0.668097734451294, "num_tokens": 4464265.0, "step": 5540 }, { "epoch": 1.4676906779661016, "grad_norm": 2.010281562805176, "learning_rate": 9.266287076271188e-06, "loss": 1.3459, "mean_token_accuracy": 0.7172641456127167, "num_tokens": 4465735.0, "step": 5542 }, { "epoch": 1.4682203389830508, "grad_norm": 1.5134468078613281, "learning_rate": 9.266022245762713e-06, "loss": 1.1717, "mean_token_accuracy": 0.6965796649456024, "num_tokens": 4467510.0, "step": 5544 }, { "epoch": 1.46875, "grad_norm": 1.5849108695983887, "learning_rate": 9.265757415254239e-06, "loss": 1.3693, "mean_token_accuracy": 0.6902045086026192, "num_tokens": 4468971.0, "step": 5546 }, { "epoch": 1.4692796610169492, "grad_norm": 1.7973124980926514, "learning_rate": 9.265492584745764e-06, "loss": 1.2689, "mean_token_accuracy": 0.6937495693564415, "num_tokens": 4470739.0, "step": 5548 }, { "epoch": 1.4698093220338984, "grad_norm": 1.9045491218566895, "learning_rate": 9.265227754237289e-06, "loss": 1.9875, "mean_token_accuracy": 0.6135296709835529, "num_tokens": 4472336.0, "step": 5550 }, { "epoch": 1.4703389830508475, "grad_norm": 1.6860437393188477, "learning_rate": 9.264962923728814e-06, "loss": 1.0638, "mean_token_accuracy": 0.7309157773852348, "num_tokens": 4473797.0, "step": 5552 }, { "epoch": 1.4708686440677967, "grad_norm": 1.6553901433944702, "learning_rate": 9.26469809322034e-06, "loss": 1.4176, "mean_token_accuracy": 0.6862586438655853, "num_tokens": 4475276.0, "step": 5554 }, { "epoch": 1.4713983050847457, "grad_norm": 1.869452953338623, "learning_rate": 9.264433262711865e-06, "loss": 1.3529, "mean_token_accuracy": 0.6922827437520027, "num_tokens": 4476754.0, "step": 5556 }, { "epoch": 1.4719279661016949, "grad_norm": 1.8883479833602905, "learning_rate": 9.26416843220339e-06, "loss": 1.487, "mean_token_accuracy": 0.6707355156540871, "num_tokens": 4478251.0, "step": 5558 }, { "epoch": 1.472457627118644, "grad_norm": 1.8249388933181763, "learning_rate": 9.263903601694915e-06, "loss": 1.4328, "mean_token_accuracy": 0.6745442375540733, "num_tokens": 4479765.0, "step": 5560 }, { "epoch": 1.4729872881355932, "grad_norm": 1.5689878463745117, "learning_rate": 9.263638771186442e-06, "loss": 1.7438, "mean_token_accuracy": 0.6262103877961636, "num_tokens": 4481418.0, "step": 5562 }, { "epoch": 1.4735169491525424, "grad_norm": 1.3631194829940796, "learning_rate": 9.263373940677967e-06, "loss": 1.3021, "mean_token_accuracy": 0.6883457526564598, "num_tokens": 4483265.0, "step": 5564 }, { "epoch": 1.4740466101694916, "grad_norm": 2.1132915019989014, "learning_rate": 9.263109110169492e-06, "loss": 1.5925, "mean_token_accuracy": 0.6588397696614265, "num_tokens": 4484660.0, "step": 5566 }, { "epoch": 1.4745762711864407, "grad_norm": 1.5716943740844727, "learning_rate": 9.262844279661017e-06, "loss": 1.4764, "mean_token_accuracy": 0.6977319121360779, "num_tokens": 4486353.0, "step": 5568 }, { "epoch": 1.4751059322033897, "grad_norm": 1.743721604347229, "learning_rate": 9.262579449152543e-06, "loss": 1.1971, "mean_token_accuracy": 0.7126355767250061, "num_tokens": 4488071.0, "step": 5570 }, { "epoch": 1.475635593220339, "grad_norm": 1.6463979482650757, "learning_rate": 9.262314618644068e-06, "loss": 1.0884, "mean_token_accuracy": 0.7297529429197311, "num_tokens": 4489436.0, "step": 5572 }, { "epoch": 1.476165254237288, "grad_norm": 1.887786865234375, "learning_rate": 9.262049788135595e-06, "loss": 1.1057, "mean_token_accuracy": 0.7419656440615654, "num_tokens": 4491041.0, "step": 5574 }, { "epoch": 1.4766949152542372, "grad_norm": 1.4880255460739136, "learning_rate": 9.26178495762712e-06, "loss": 1.3834, "mean_token_accuracy": 0.6752445921301842, "num_tokens": 4492955.0, "step": 5576 }, { "epoch": 1.4772245762711864, "grad_norm": 1.9194468259811401, "learning_rate": 9.261520127118645e-06, "loss": 1.526, "mean_token_accuracy": 0.6740090399980545, "num_tokens": 4494255.0, "step": 5578 }, { "epoch": 1.4777542372881356, "grad_norm": 1.5543832778930664, "learning_rate": 9.26125529661017e-06, "loss": 1.4071, "mean_token_accuracy": 0.6761587038636208, "num_tokens": 4496192.0, "step": 5580 }, { "epoch": 1.4782838983050848, "grad_norm": 2.1115972995758057, "learning_rate": 9.260990466101696e-06, "loss": 1.5171, "mean_token_accuracy": 0.6446593888103962, "num_tokens": 4497640.0, "step": 5582 }, { "epoch": 1.478813559322034, "grad_norm": 1.2074533700942993, "learning_rate": 9.260725635593221e-06, "loss": 1.6302, "mean_token_accuracy": 0.6430019363760948, "num_tokens": 4499908.0, "step": 5584 }, { "epoch": 1.4793432203389831, "grad_norm": 1.6504117250442505, "learning_rate": 9.260460805084746e-06, "loss": 1.1678, "mean_token_accuracy": 0.7107608988881111, "num_tokens": 4501406.0, "step": 5586 }, { "epoch": 1.479872881355932, "grad_norm": 1.9322800636291504, "learning_rate": 9.260195974576271e-06, "loss": 1.212, "mean_token_accuracy": 0.7115069478750229, "num_tokens": 4502817.0, "step": 5588 }, { "epoch": 1.4804025423728815, "grad_norm": 1.5222103595733643, "learning_rate": 9.259931144067798e-06, "loss": 1.1978, "mean_token_accuracy": 0.7602322176098824, "num_tokens": 4504395.0, "step": 5590 }, { "epoch": 1.4809322033898304, "grad_norm": 1.531596064567566, "learning_rate": 9.259666313559323e-06, "loss": 1.4337, "mean_token_accuracy": 0.6734491363167763, "num_tokens": 4506034.0, "step": 5592 }, { "epoch": 1.4814618644067796, "grad_norm": 1.6954841613769531, "learning_rate": 9.259401483050848e-06, "loss": 1.3977, "mean_token_accuracy": 0.6715149581432343, "num_tokens": 4507590.0, "step": 5594 }, { "epoch": 1.4819915254237288, "grad_norm": 1.5207390785217285, "learning_rate": 9.259136652542373e-06, "loss": 0.9616, "mean_token_accuracy": 0.7713408693671227, "num_tokens": 4509145.0, "step": 5596 }, { "epoch": 1.482521186440678, "grad_norm": 1.4420584440231323, "learning_rate": 9.2588718220339e-06, "loss": 1.4036, "mean_token_accuracy": 0.696174293756485, "num_tokens": 4510646.0, "step": 5598 }, { "epoch": 1.4830508474576272, "grad_norm": 1.4142191410064697, "learning_rate": 9.258606991525424e-06, "loss": 1.0812, "mean_token_accuracy": 0.7232874780893326, "num_tokens": 4512224.0, "step": 5600 }, { "epoch": 1.4835805084745763, "grad_norm": 1.817916989326477, "learning_rate": 9.25834216101695e-06, "loss": 1.3991, "mean_token_accuracy": 0.6941902339458466, "num_tokens": 4514122.0, "step": 5602 }, { "epoch": 1.4841101694915255, "grad_norm": 1.496351718902588, "learning_rate": 9.258077330508476e-06, "loss": 1.0976, "mean_token_accuracy": 0.7398492321372032, "num_tokens": 4515439.0, "step": 5604 }, { "epoch": 1.4846398305084745, "grad_norm": 1.781498908996582, "learning_rate": 9.2578125e-06, "loss": 1.406, "mean_token_accuracy": 0.6911106407642365, "num_tokens": 4516774.0, "step": 5606 }, { "epoch": 1.4851694915254237, "grad_norm": 1.2096425294876099, "learning_rate": 9.257547669491527e-06, "loss": 1.1562, "mean_token_accuracy": 0.6951196864247322, "num_tokens": 4518493.0, "step": 5608 }, { "epoch": 1.4856991525423728, "grad_norm": 1.2911888360977173, "learning_rate": 9.257282838983052e-06, "loss": 1.168, "mean_token_accuracy": 0.7280111014842987, "num_tokens": 4520112.0, "step": 5610 }, { "epoch": 1.486228813559322, "grad_norm": 1.6610097885131836, "learning_rate": 9.257018008474577e-06, "loss": 1.1426, "mean_token_accuracy": 0.7310109287500381, "num_tokens": 4521842.0, "step": 5612 }, { "epoch": 1.4867584745762712, "grad_norm": 1.7288614511489868, "learning_rate": 9.256753177966102e-06, "loss": 1.5308, "mean_token_accuracy": 0.675943449139595, "num_tokens": 4523431.0, "step": 5614 }, { "epoch": 1.4872881355932204, "grad_norm": 2.0366790294647217, "learning_rate": 9.256488347457629e-06, "loss": 1.5476, "mean_token_accuracy": 0.6815623417496681, "num_tokens": 4524874.0, "step": 5616 }, { "epoch": 1.4878177966101696, "grad_norm": 1.2139382362365723, "learning_rate": 9.256223516949154e-06, "loss": 0.8344, "mean_token_accuracy": 0.7914842590689659, "num_tokens": 4526591.0, "step": 5618 }, { "epoch": 1.4883474576271185, "grad_norm": 1.4786784648895264, "learning_rate": 9.255958686440678e-06, "loss": 0.9774, "mean_token_accuracy": 0.7515576779842377, "num_tokens": 4528021.0, "step": 5620 }, { "epoch": 1.488877118644068, "grad_norm": 1.5075169801712036, "learning_rate": 9.255693855932203e-06, "loss": 1.1119, "mean_token_accuracy": 0.7434697151184082, "num_tokens": 4529580.0, "step": 5622 }, { "epoch": 1.4894067796610169, "grad_norm": 1.538164734840393, "learning_rate": 9.25542902542373e-06, "loss": 1.4012, "mean_token_accuracy": 0.7057214118540287, "num_tokens": 4531213.0, "step": 5624 }, { "epoch": 1.489936440677966, "grad_norm": 1.5171799659729004, "learning_rate": 9.255164194915255e-06, "loss": 1.7132, "mean_token_accuracy": 0.6240146830677986, "num_tokens": 4532993.0, "step": 5626 }, { "epoch": 1.4904661016949152, "grad_norm": 1.7678464651107788, "learning_rate": 9.254899364406782e-06, "loss": 0.9624, "mean_token_accuracy": 0.7848002463579178, "num_tokens": 4534390.0, "step": 5628 }, { "epoch": 1.4909957627118644, "grad_norm": 1.4574071168899536, "learning_rate": 9.254634533898306e-06, "loss": 1.0536, "mean_token_accuracy": 0.7638003081083298, "num_tokens": 4535864.0, "step": 5630 }, { "epoch": 1.4915254237288136, "grad_norm": 1.6000208854675293, "learning_rate": 9.254369703389831e-06, "loss": 1.6384, "mean_token_accuracy": 0.6386988013982773, "num_tokens": 4537703.0, "step": 5632 }, { "epoch": 1.4920550847457628, "grad_norm": 1.6286700963974, "learning_rate": 9.254104872881356e-06, "loss": 1.7002, "mean_token_accuracy": 0.6115070842206478, "num_tokens": 4539412.0, "step": 5634 }, { "epoch": 1.492584745762712, "grad_norm": 1.5315823554992676, "learning_rate": 9.253840042372883e-06, "loss": 1.2647, "mean_token_accuracy": 0.7103413790464401, "num_tokens": 4540935.0, "step": 5636 }, { "epoch": 1.493114406779661, "grad_norm": 1.5849775075912476, "learning_rate": 9.253575211864408e-06, "loss": 1.2589, "mean_token_accuracy": 0.7237462028861046, "num_tokens": 4542587.0, "step": 5638 }, { "epoch": 1.4936440677966103, "grad_norm": 1.5267032384872437, "learning_rate": 9.253310381355933e-06, "loss": 1.1845, "mean_token_accuracy": 0.7321439683437347, "num_tokens": 4544257.0, "step": 5640 }, { "epoch": 1.4941737288135593, "grad_norm": 2.1369457244873047, "learning_rate": 9.253045550847458e-06, "loss": 1.6069, "mean_token_accuracy": 0.6709285750985146, "num_tokens": 4545832.0, "step": 5642 }, { "epoch": 1.4947033898305084, "grad_norm": 1.6391841173171997, "learning_rate": 9.252780720338984e-06, "loss": 1.1779, "mean_token_accuracy": 0.719165526330471, "num_tokens": 4547401.0, "step": 5644 }, { "epoch": 1.4952330508474576, "grad_norm": 1.834914207458496, "learning_rate": 9.25251588983051e-06, "loss": 1.3902, "mean_token_accuracy": 0.7106180861592293, "num_tokens": 4548903.0, "step": 5646 }, { "epoch": 1.4957627118644068, "grad_norm": 1.5194988250732422, "learning_rate": 9.252251059322034e-06, "loss": 1.4052, "mean_token_accuracy": 0.6761549897491932, "num_tokens": 4550632.0, "step": 5648 }, { "epoch": 1.496292372881356, "grad_norm": 1.7148381471633911, "learning_rate": 9.25198622881356e-06, "loss": 1.4106, "mean_token_accuracy": 0.6733106896281242, "num_tokens": 4552142.0, "step": 5650 }, { "epoch": 1.4968220338983051, "grad_norm": 2.597846269607544, "learning_rate": 9.251721398305086e-06, "loss": 1.4946, "mean_token_accuracy": 0.6590528935194016, "num_tokens": 4553450.0, "step": 5652 }, { "epoch": 1.4973516949152543, "grad_norm": 1.6041312217712402, "learning_rate": 9.25145656779661e-06, "loss": 1.2048, "mean_token_accuracy": 0.7149757966399193, "num_tokens": 4554821.0, "step": 5654 }, { "epoch": 1.4978813559322033, "grad_norm": 1.6992853879928589, "learning_rate": 9.251191737288137e-06, "loss": 1.3617, "mean_token_accuracy": 0.7068678885698318, "num_tokens": 4556425.0, "step": 5656 }, { "epoch": 1.4984110169491525, "grad_norm": 1.4350035190582275, "learning_rate": 9.250926906779662e-06, "loss": 0.9828, "mean_token_accuracy": 0.7680585458874702, "num_tokens": 4558303.0, "step": 5658 }, { "epoch": 1.4989406779661016, "grad_norm": 1.601347804069519, "learning_rate": 9.250662076271187e-06, "loss": 1.5488, "mean_token_accuracy": 0.650469034910202, "num_tokens": 4560045.0, "step": 5660 }, { "epoch": 1.4994703389830508, "grad_norm": 1.3060945272445679, "learning_rate": 9.250397245762712e-06, "loss": 1.4415, "mean_token_accuracy": 0.6836819425225258, "num_tokens": 4561741.0, "step": 5662 }, { "epoch": 1.5, "grad_norm": 1.3998583555221558, "learning_rate": 9.250132415254239e-06, "loss": 1.4872, "mean_token_accuracy": 0.6994953826069832, "num_tokens": 4563531.0, "step": 5664 }, { "epoch": 1.5005296610169492, "grad_norm": 1.5954838991165161, "learning_rate": 9.249867584745764e-06, "loss": 1.2777, "mean_token_accuracy": 0.7077825516462326, "num_tokens": 4565678.0, "step": 5666 }, { "epoch": 1.5010593220338984, "grad_norm": 1.7570825815200806, "learning_rate": 9.249602754237289e-06, "loss": 1.0362, "mean_token_accuracy": 0.7490404918789864, "num_tokens": 4567046.0, "step": 5668 }, { "epoch": 1.5015889830508473, "grad_norm": 1.8271336555480957, "learning_rate": 9.249337923728814e-06, "loss": 1.6429, "mean_token_accuracy": 0.6500541679561138, "num_tokens": 4568839.0, "step": 5670 }, { "epoch": 1.5021186440677967, "grad_norm": 1.534326195716858, "learning_rate": 9.24907309322034e-06, "loss": 1.4658, "mean_token_accuracy": 0.6641197800636292, "num_tokens": 4570731.0, "step": 5672 }, { "epoch": 1.5026483050847457, "grad_norm": 1.5684843063354492, "learning_rate": 9.248808262711865e-06, "loss": 1.1938, "mean_token_accuracy": 0.7108750715851784, "num_tokens": 4572229.0, "step": 5674 }, { "epoch": 1.503177966101695, "grad_norm": 1.488110899925232, "learning_rate": 9.24854343220339e-06, "loss": 1.2777, "mean_token_accuracy": 0.7252218648791313, "num_tokens": 4573674.0, "step": 5676 }, { "epoch": 1.503707627118644, "grad_norm": 1.80799400806427, "learning_rate": 9.248278601694915e-06, "loss": 1.7888, "mean_token_accuracy": 0.6169235110282898, "num_tokens": 4575329.0, "step": 5678 }, { "epoch": 1.5042372881355932, "grad_norm": 1.316810131072998, "learning_rate": 9.248013771186442e-06, "loss": 1.2541, "mean_token_accuracy": 0.6923619732260704, "num_tokens": 4577126.0, "step": 5680 }, { "epoch": 1.5047669491525424, "grad_norm": 2.1008412837982178, "learning_rate": 9.247748940677967e-06, "loss": 1.4698, "mean_token_accuracy": 0.6751509383320808, "num_tokens": 4578633.0, "step": 5682 }, { "epoch": 1.5052966101694916, "grad_norm": 2.207479953765869, "learning_rate": 9.247484110169493e-06, "loss": 1.5958, "mean_token_accuracy": 0.6408334970474243, "num_tokens": 4579921.0, "step": 5684 }, { "epoch": 1.5058262711864407, "grad_norm": 1.4785680770874023, "learning_rate": 9.247219279661016e-06, "loss": 1.2195, "mean_token_accuracy": 0.7093973085284233, "num_tokens": 4581613.0, "step": 5686 }, { "epoch": 1.5063559322033897, "grad_norm": 1.912398338317871, "learning_rate": 9.246954449152543e-06, "loss": 1.3545, "mean_token_accuracy": 0.6998388543725014, "num_tokens": 4583018.0, "step": 5688 }, { "epoch": 1.506885593220339, "grad_norm": 1.2615530490875244, "learning_rate": 9.246689618644068e-06, "loss": 1.3359, "mean_token_accuracy": 0.6957366764545441, "num_tokens": 4584916.0, "step": 5690 }, { "epoch": 1.507415254237288, "grad_norm": 1.461965560913086, "learning_rate": 9.246424788135595e-06, "loss": 1.2306, "mean_token_accuracy": 0.689939558506012, "num_tokens": 4586732.0, "step": 5692 }, { "epoch": 1.5079449152542372, "grad_norm": 1.7430133819580078, "learning_rate": 9.24615995762712e-06, "loss": 1.254, "mean_token_accuracy": 0.7180686891078949, "num_tokens": 4588244.0, "step": 5694 }, { "epoch": 1.5084745762711864, "grad_norm": 1.7572399377822876, "learning_rate": 9.245895127118644e-06, "loss": 1.2647, "mean_token_accuracy": 0.6952821239829063, "num_tokens": 4589556.0, "step": 5696 }, { "epoch": 1.5090042372881356, "grad_norm": 1.7345831394195557, "learning_rate": 9.245630296610171e-06, "loss": 1.1248, "mean_token_accuracy": 0.7386225759983063, "num_tokens": 4590811.0, "step": 5698 }, { "epoch": 1.5095338983050848, "grad_norm": 1.248802661895752, "learning_rate": 9.245365466101696e-06, "loss": 1.2017, "mean_token_accuracy": 0.7190792411565781, "num_tokens": 4592367.0, "step": 5700 }, { "epoch": 1.5100635593220337, "grad_norm": 1.779984474182129, "learning_rate": 9.245100635593221e-06, "loss": 1.4673, "mean_token_accuracy": 0.6837042346596718, "num_tokens": 4593899.0, "step": 5702 }, { "epoch": 1.5105932203389831, "grad_norm": 2.0821545124053955, "learning_rate": 9.244835805084746e-06, "loss": 1.1709, "mean_token_accuracy": 0.7170992270112038, "num_tokens": 4595274.0, "step": 5704 }, { "epoch": 1.511122881355932, "grad_norm": 1.9211831092834473, "learning_rate": 9.244570974576272e-06, "loss": 1.8863, "mean_token_accuracy": 0.5859763324260712, "num_tokens": 4596695.0, "step": 5706 }, { "epoch": 1.5116525423728815, "grad_norm": 1.6806460618972778, "learning_rate": 9.244306144067797e-06, "loss": 1.2018, "mean_token_accuracy": 0.7275283858180046, "num_tokens": 4598608.0, "step": 5708 }, { "epoch": 1.5121822033898304, "grad_norm": 1.3550701141357422, "learning_rate": 9.244041313559324e-06, "loss": 1.2099, "mean_token_accuracy": 0.7236278280615807, "num_tokens": 4600308.0, "step": 5710 }, { "epoch": 1.5127118644067796, "grad_norm": 1.5544294118881226, "learning_rate": 9.243776483050849e-06, "loss": 1.579, "mean_token_accuracy": 0.6835449188947678, "num_tokens": 4601775.0, "step": 5712 }, { "epoch": 1.5132415254237288, "grad_norm": 1.4448590278625488, "learning_rate": 9.243511652542374e-06, "loss": 0.8504, "mean_token_accuracy": 0.7922725081443787, "num_tokens": 4603300.0, "step": 5714 }, { "epoch": 1.513771186440678, "grad_norm": 1.5641132593154907, "learning_rate": 9.243246822033899e-06, "loss": 0.86, "mean_token_accuracy": 0.8002498298883438, "num_tokens": 4604592.0, "step": 5716 }, { "epoch": 1.5143008474576272, "grad_norm": 1.3135329484939575, "learning_rate": 9.242981991525425e-06, "loss": 1.1833, "mean_token_accuracy": 0.7210443243384361, "num_tokens": 4606201.0, "step": 5718 }, { "epoch": 1.5148305084745761, "grad_norm": 1.604820728302002, "learning_rate": 9.24271716101695e-06, "loss": 1.0348, "mean_token_accuracy": 0.762991227209568, "num_tokens": 4607456.0, "step": 5720 }, { "epoch": 1.5153601694915255, "grad_norm": 1.788871169090271, "learning_rate": 9.242452330508475e-06, "loss": 1.7197, "mean_token_accuracy": 0.6207270212471485, "num_tokens": 4609119.0, "step": 5722 }, { "epoch": 1.5158898305084745, "grad_norm": 1.6754809617996216, "learning_rate": 9.2421875e-06, "loss": 1.4259, "mean_token_accuracy": 0.6778927482664585, "num_tokens": 4610753.0, "step": 5724 }, { "epoch": 1.5164194915254239, "grad_norm": 1.349352478981018, "learning_rate": 9.241922669491527e-06, "loss": 1.2422, "mean_token_accuracy": 0.7282118871808052, "num_tokens": 4612593.0, "step": 5726 }, { "epoch": 1.5169491525423728, "grad_norm": 1.113820195198059, "learning_rate": 9.241657838983052e-06, "loss": 1.0967, "mean_token_accuracy": 0.7337070107460022, "num_tokens": 4614822.0, "step": 5728 }, { "epoch": 1.517478813559322, "grad_norm": 1.6124553680419922, "learning_rate": 9.241393008474577e-06, "loss": 1.1749, "mean_token_accuracy": 0.73729507625103, "num_tokens": 4616416.0, "step": 5730 }, { "epoch": 1.5180084745762712, "grad_norm": 1.4512964487075806, "learning_rate": 9.241128177966102e-06, "loss": 1.1997, "mean_token_accuracy": 0.7192623168230057, "num_tokens": 4618132.0, "step": 5732 }, { "epoch": 1.5185381355932204, "grad_norm": 1.7887951135635376, "learning_rate": 9.240863347457628e-06, "loss": 1.4402, "mean_token_accuracy": 0.6692734509706497, "num_tokens": 4619680.0, "step": 5734 }, { "epoch": 1.5190677966101696, "grad_norm": 1.5855854749679565, "learning_rate": 9.240598516949153e-06, "loss": 1.5541, "mean_token_accuracy": 0.6480158194899559, "num_tokens": 4621310.0, "step": 5736 }, { "epoch": 1.5195974576271185, "grad_norm": 0.930094838142395, "learning_rate": 9.24033368644068e-06, "loss": 0.782, "mean_token_accuracy": 0.7991558387875557, "num_tokens": 4623499.0, "step": 5738 }, { "epoch": 1.520127118644068, "grad_norm": 1.6426979303359985, "learning_rate": 9.240068855932203e-06, "loss": 1.3837, "mean_token_accuracy": 0.6875345408916473, "num_tokens": 4626121.0, "step": 5740 }, { "epoch": 1.5206567796610169, "grad_norm": 1.5343739986419678, "learning_rate": 9.23980402542373e-06, "loss": 1.4575, "mean_token_accuracy": 0.6735314056277275, "num_tokens": 4627645.0, "step": 5742 }, { "epoch": 1.5211864406779663, "grad_norm": 1.678113579750061, "learning_rate": 9.239539194915255e-06, "loss": 1.3029, "mean_token_accuracy": 0.719585232436657, "num_tokens": 4629129.0, "step": 5744 }, { "epoch": 1.5217161016949152, "grad_norm": 1.43181574344635, "learning_rate": 9.239274364406781e-06, "loss": 1.3844, "mean_token_accuracy": 0.6758689507842064, "num_tokens": 4630673.0, "step": 5746 }, { "epoch": 1.5222457627118644, "grad_norm": 1.7059389352798462, "learning_rate": 9.239009533898306e-06, "loss": 1.4343, "mean_token_accuracy": 0.6723731979727745, "num_tokens": 4632185.0, "step": 5748 }, { "epoch": 1.5227754237288136, "grad_norm": 1.4206807613372803, "learning_rate": 9.238744703389831e-06, "loss": 1.2659, "step": 5750 }, { "epoch": 1.5227754237288136, "eval_loss": 1.318078875541687, "eval_mean_token_accuracy": 0.698760189212762, "eval_num_tokens": 4633877.0, "eval_runtime": 48.7503, "eval_samples_per_second": 6.318, "eval_steps_per_second": 6.318, "step": 5750 }, { "epoch": 1.5233050847457628, "grad_norm": 1.634247899055481, "learning_rate": 9.238479872881356e-06, "loss": 1.2866, "mean_token_accuracy": 0.7031409218907356, "num_tokens": 4635271.0, "step": 5752 }, { "epoch": 1.523834745762712, "grad_norm": 1.4076610803604126, "learning_rate": 9.238215042372883e-06, "loss": 1.5988, "mean_token_accuracy": 0.6324877515435219, "num_tokens": 4637051.0, "step": 5754 }, { "epoch": 1.524364406779661, "grad_norm": 1.5511395931243896, "learning_rate": 9.237950211864408e-06, "loss": 1.2039, "mean_token_accuracy": 0.7222444638609886, "num_tokens": 4638554.0, "step": 5756 }, { "epoch": 1.5248940677966103, "grad_norm": 1.6832504272460938, "learning_rate": 9.237685381355932e-06, "loss": 1.818, "mean_token_accuracy": 0.6258120872080326, "num_tokens": 4640218.0, "step": 5758 }, { "epoch": 1.5254237288135593, "grad_norm": 1.4196786880493164, "learning_rate": 9.237420550847457e-06, "loss": 1.1231, "mean_token_accuracy": 0.7268939912319183, "num_tokens": 4641938.0, "step": 5760 }, { "epoch": 1.5259533898305084, "grad_norm": 1.9451631307601929, "learning_rate": 9.237155720338984e-06, "loss": 1.4284, "mean_token_accuracy": 0.6838202960789204, "num_tokens": 4643335.0, "step": 5762 }, { "epoch": 1.5264830508474576, "grad_norm": 1.6645616292953491, "learning_rate": 9.236890889830509e-06, "loss": 1.3277, "mean_token_accuracy": 0.6830933764576912, "num_tokens": 4645282.0, "step": 5764 }, { "epoch": 1.5270127118644068, "grad_norm": 1.2790099382400513, "learning_rate": 9.236626059322036e-06, "loss": 1.099, "mean_token_accuracy": 0.7227793708443642, "num_tokens": 4646892.0, "step": 5766 }, { "epoch": 1.527542372881356, "grad_norm": 1.6139966249465942, "learning_rate": 9.236361228813559e-06, "loss": 1.6116, "mean_token_accuracy": 0.6685354486107826, "num_tokens": 4648366.0, "step": 5768 }, { "epoch": 1.528072033898305, "grad_norm": 1.7306095361709595, "learning_rate": 9.236096398305085e-06, "loss": 1.355, "mean_token_accuracy": 0.7071634978055954, "num_tokens": 4650018.0, "step": 5770 }, { "epoch": 1.5286016949152543, "grad_norm": 1.5618020296096802, "learning_rate": 9.23583156779661e-06, "loss": 1.1014, "mean_token_accuracy": 0.7518322467803955, "num_tokens": 4651450.0, "step": 5772 }, { "epoch": 1.5291313559322033, "grad_norm": 1.7807875871658325, "learning_rate": 9.235566737288137e-06, "loss": 1.5823, "mean_token_accuracy": 0.671952199190855, "num_tokens": 4653137.0, "step": 5774 }, { "epoch": 1.5296610169491527, "grad_norm": 1.6061993837356567, "learning_rate": 9.235301906779662e-06, "loss": 0.9769, "mean_token_accuracy": 0.7672720700502396, "num_tokens": 4654423.0, "step": 5776 }, { "epoch": 1.5301906779661016, "grad_norm": 1.7841142416000366, "learning_rate": 9.235037076271187e-06, "loss": 1.03, "mean_token_accuracy": 0.7527108937501907, "num_tokens": 4655831.0, "step": 5778 }, { "epoch": 1.5307203389830508, "grad_norm": 1.6550929546356201, "learning_rate": 9.234772245762713e-06, "loss": 1.3489, "mean_token_accuracy": 0.7099541872739792, "num_tokens": 4657309.0, "step": 5780 }, { "epoch": 1.53125, "grad_norm": 1.7430967092514038, "learning_rate": 9.234507415254238e-06, "loss": 1.5068, "mean_token_accuracy": 0.6819068193435669, "num_tokens": 4658754.0, "step": 5782 }, { "epoch": 1.5317796610169492, "grad_norm": 1.592814564704895, "learning_rate": 9.234242584745763e-06, "loss": 1.2361, "mean_token_accuracy": 0.7453542724251747, "num_tokens": 4660289.0, "step": 5784 }, { "epoch": 1.5323093220338984, "grad_norm": 2.0701682567596436, "learning_rate": 9.233977754237288e-06, "loss": 1.1925, "mean_token_accuracy": 0.6857344508171082, "num_tokens": 4661744.0, "step": 5786 }, { "epoch": 1.5328389830508473, "grad_norm": 1.567614197731018, "learning_rate": 9.233712923728815e-06, "loss": 1.2683, "mean_token_accuracy": 0.7097369953989983, "num_tokens": 4663236.0, "step": 5788 }, { "epoch": 1.5333686440677967, "grad_norm": 1.6672906875610352, "learning_rate": 9.23344809322034e-06, "loss": 1.4298, "mean_token_accuracy": 0.6540563181042671, "num_tokens": 4665002.0, "step": 5790 }, { "epoch": 1.5338983050847457, "grad_norm": 1.802450180053711, "learning_rate": 9.233183262711866e-06, "loss": 1.4976, "mean_token_accuracy": 0.6426173225045204, "num_tokens": 4666459.0, "step": 5792 }, { "epoch": 1.534427966101695, "grad_norm": 1.841215968132019, "learning_rate": 9.23291843220339e-06, "loss": 1.5517, "mean_token_accuracy": 0.6865900792181492, "num_tokens": 4668044.0, "step": 5794 }, { "epoch": 1.534957627118644, "grad_norm": 1.4664686918258667, "learning_rate": 9.232653601694916e-06, "loss": 1.6388, "mean_token_accuracy": 0.6438085772097111, "num_tokens": 4670113.0, "step": 5796 }, { "epoch": 1.5354872881355932, "grad_norm": 1.4827367067337036, "learning_rate": 9.232388771186441e-06, "loss": 1.4887, "mean_token_accuracy": 0.6784674227237701, "num_tokens": 4671718.0, "step": 5798 }, { "epoch": 1.5360169491525424, "grad_norm": 1.7819492816925049, "learning_rate": 9.232123940677968e-06, "loss": 1.1915, "mean_token_accuracy": 0.7083049602806568, "num_tokens": 4673315.0, "step": 5800 }, { "epoch": 1.5365466101694916, "grad_norm": 1.6154288053512573, "learning_rate": 9.231859110169493e-06, "loss": 1.2796, "mean_token_accuracy": 0.7173485904932022, "num_tokens": 4674845.0, "step": 5802 }, { "epoch": 1.5370762711864407, "grad_norm": 1.8857637643814087, "learning_rate": 9.231594279661018e-06, "loss": 1.4927, "mean_token_accuracy": 0.6747723110020161, "num_tokens": 4676399.0, "step": 5804 }, { "epoch": 1.5376059322033897, "grad_norm": 1.9427990913391113, "learning_rate": 9.231329449152543e-06, "loss": 1.2903, "mean_token_accuracy": 0.7248392030596733, "num_tokens": 4677579.0, "step": 5806 }, { "epoch": 1.538135593220339, "grad_norm": 1.5747144222259521, "learning_rate": 9.23106461864407e-06, "loss": 1.5171, "mean_token_accuracy": 0.6468885093927383, "num_tokens": 4679146.0, "step": 5808 }, { "epoch": 1.538665254237288, "grad_norm": 1.4169009923934937, "learning_rate": 9.230799788135594e-06, "loss": 1.2849, "mean_token_accuracy": 0.7085776254534721, "num_tokens": 4680763.0, "step": 5810 }, { "epoch": 1.5391949152542372, "grad_norm": 1.668899655342102, "learning_rate": 9.230534957627119e-06, "loss": 1.6678, "mean_token_accuracy": 0.643457692116499, "num_tokens": 4682474.0, "step": 5812 }, { "epoch": 1.5397245762711864, "grad_norm": 1.6437416076660156, "learning_rate": 9.230270127118644e-06, "loss": 1.6404, "mean_token_accuracy": 0.6581948176026344, "num_tokens": 4684201.0, "step": 5814 }, { "epoch": 1.5402542372881356, "grad_norm": 1.6463074684143066, "learning_rate": 9.23000529661017e-06, "loss": 1.228, "mean_token_accuracy": 0.7082420736551285, "num_tokens": 4685777.0, "step": 5816 }, { "epoch": 1.5407838983050848, "grad_norm": 2.128319025039673, "learning_rate": 9.229740466101696e-06, "loss": 1.5536, "mean_token_accuracy": 0.6739995181560516, "num_tokens": 4687053.0, "step": 5818 }, { "epoch": 1.5413135593220337, "grad_norm": 1.7324724197387695, "learning_rate": 9.229475635593222e-06, "loss": 1.1975, "mean_token_accuracy": 0.7205530628561974, "num_tokens": 4688586.0, "step": 5820 }, { "epoch": 1.5418432203389831, "grad_norm": 1.7732888460159302, "learning_rate": 9.229210805084745e-06, "loss": 1.0755, "mean_token_accuracy": 0.7386998161673546, "num_tokens": 4690073.0, "step": 5822 }, { "epoch": 1.542372881355932, "grad_norm": 1.4176125526428223, "learning_rate": 9.228945974576272e-06, "loss": 1.2519, "mean_token_accuracy": 0.7185938358306885, "num_tokens": 4691762.0, "step": 5824 }, { "epoch": 1.5429025423728815, "grad_norm": 1.6325130462646484, "learning_rate": 9.228681144067797e-06, "loss": 1.649, "mean_token_accuracy": 0.6366955265402794, "num_tokens": 4693381.0, "step": 5826 }, { "epoch": 1.5434322033898304, "grad_norm": 1.6488780975341797, "learning_rate": 9.228416313559324e-06, "loss": 1.5853, "mean_token_accuracy": 0.6605589613318443, "num_tokens": 4694888.0, "step": 5828 }, { "epoch": 1.5439618644067796, "grad_norm": 1.8309962749481201, "learning_rate": 9.228151483050849e-06, "loss": 1.2897, "mean_token_accuracy": 0.7122993767261505, "num_tokens": 4696461.0, "step": 5830 }, { "epoch": 1.5444915254237288, "grad_norm": 1.7202636003494263, "learning_rate": 9.227886652542373e-06, "loss": 0.9022, "mean_token_accuracy": 0.7604311630129814, "num_tokens": 4697969.0, "step": 5832 }, { "epoch": 1.545021186440678, "grad_norm": 1.5928939580917358, "learning_rate": 9.227621822033898e-06, "loss": 1.0917, "mean_token_accuracy": 0.7402958124876022, "num_tokens": 4699662.0, "step": 5834 }, { "epoch": 1.5455508474576272, "grad_norm": 1.7915865182876587, "learning_rate": 9.227356991525425e-06, "loss": 1.3466, "mean_token_accuracy": 0.682944767177105, "num_tokens": 4701030.0, "step": 5836 }, { "epoch": 1.5460805084745761, "grad_norm": 1.6456719636917114, "learning_rate": 9.22709216101695e-06, "loss": 1.7457, "mean_token_accuracy": 0.6155749559402466, "num_tokens": 4702810.0, "step": 5838 }, { "epoch": 1.5466101694915255, "grad_norm": 1.2741156816482544, "learning_rate": 9.226827330508475e-06, "loss": 1.2122, "mean_token_accuracy": 0.6955897137522697, "num_tokens": 4704692.0, "step": 5840 }, { "epoch": 1.5471398305084745, "grad_norm": 1.6351432800292969, "learning_rate": 9.2265625e-06, "loss": 1.3184, "mean_token_accuracy": 0.6890440098941326, "num_tokens": 4706437.0, "step": 5842 }, { "epoch": 1.5476694915254239, "grad_norm": 1.6179251670837402, "learning_rate": 9.226297669491526e-06, "loss": 0.9191, "mean_token_accuracy": 0.7701628282666206, "num_tokens": 4707981.0, "step": 5844 }, { "epoch": 1.5481991525423728, "grad_norm": 1.3289096355438232, "learning_rate": 9.226032838983051e-06, "loss": 1.1247, "mean_token_accuracy": 0.7216591313481331, "num_tokens": 4709682.0, "step": 5846 }, { "epoch": 1.548728813559322, "grad_norm": 1.183143138885498, "learning_rate": 9.225768008474576e-06, "loss": 0.7807, "mean_token_accuracy": 0.7895722389221191, "num_tokens": 4711655.0, "step": 5848 }, { "epoch": 1.5492584745762712, "grad_norm": 1.6252638101577759, "learning_rate": 9.225503177966101e-06, "loss": 1.0836, "mean_token_accuracy": 0.7223790511488914, "num_tokens": 4713118.0, "step": 5850 }, { "epoch": 1.5497881355932204, "grad_norm": 1.457961916923523, "learning_rate": 9.225238347457628e-06, "loss": 1.7725, "mean_token_accuracy": 0.629783608019352, "num_tokens": 4714920.0, "step": 5852 }, { "epoch": 1.5503177966101696, "grad_norm": 1.7026820182800293, "learning_rate": 9.224973516949153e-06, "loss": 1.4121, "mean_token_accuracy": 0.686894953250885, "num_tokens": 4716308.0, "step": 5854 }, { "epoch": 1.5508474576271185, "grad_norm": 1.6526970863342285, "learning_rate": 9.22470868644068e-06, "loss": 1.4353, "mean_token_accuracy": 0.6806357726454735, "num_tokens": 4717966.0, "step": 5856 }, { "epoch": 1.551377118644068, "grad_norm": 1.379270315170288, "learning_rate": 9.224443855932204e-06, "loss": 0.8834, "mean_token_accuracy": 0.7768008187413216, "num_tokens": 4719699.0, "step": 5858 }, { "epoch": 1.5519067796610169, "grad_norm": 1.7731895446777344, "learning_rate": 9.22417902542373e-06, "loss": 1.2962, "mean_token_accuracy": 0.7052993178367615, "num_tokens": 4721260.0, "step": 5860 }, { "epoch": 1.5524364406779663, "grad_norm": 1.669426679611206, "learning_rate": 9.223914194915256e-06, "loss": 1.5321, "mean_token_accuracy": 0.6455552354454994, "num_tokens": 4722968.0, "step": 5862 }, { "epoch": 1.5529661016949152, "grad_norm": 1.32212233543396, "learning_rate": 9.22364936440678e-06, "loss": 0.9676, "mean_token_accuracy": 0.7739951461553574, "num_tokens": 4724742.0, "step": 5864 }, { "epoch": 1.5534957627118644, "grad_norm": 1.6845868825912476, "learning_rate": 9.223384533898306e-06, "loss": 1.45, "mean_token_accuracy": 0.6854546070098877, "num_tokens": 4726468.0, "step": 5866 }, { "epoch": 1.5540254237288136, "grad_norm": 1.436661958694458, "learning_rate": 9.22311970338983e-06, "loss": 1.1627, "mean_token_accuracy": 0.7330411374568939, "num_tokens": 4728084.0, "step": 5868 }, { "epoch": 1.5545550847457628, "grad_norm": 1.2986805438995361, "learning_rate": 9.222854872881357e-06, "loss": 1.1611, "mean_token_accuracy": 0.728540226817131, "num_tokens": 4729903.0, "step": 5870 }, { "epoch": 1.555084745762712, "grad_norm": 1.8179088830947876, "learning_rate": 9.222590042372882e-06, "loss": 1.2781, "mean_token_accuracy": 0.7019878178834915, "num_tokens": 4731552.0, "step": 5872 }, { "epoch": 1.555614406779661, "grad_norm": 1.910670280456543, "learning_rate": 9.222325211864407e-06, "loss": 1.6508, "mean_token_accuracy": 0.6518843099474907, "num_tokens": 4732871.0, "step": 5874 }, { "epoch": 1.5561440677966103, "grad_norm": 1.5488834381103516, "learning_rate": 9.222060381355932e-06, "loss": 1.4426, "mean_token_accuracy": 0.6599133834242821, "num_tokens": 4734450.0, "step": 5876 }, { "epoch": 1.5566737288135593, "grad_norm": 1.7497299909591675, "learning_rate": 9.221795550847459e-06, "loss": 1.6288, "mean_token_accuracy": 0.6452294439077377, "num_tokens": 4735865.0, "step": 5878 }, { "epoch": 1.5572033898305084, "grad_norm": 1.7398183345794678, "learning_rate": 9.221530720338984e-06, "loss": 1.0764, "mean_token_accuracy": 0.7425872534513474, "num_tokens": 4737474.0, "step": 5880 }, { "epoch": 1.5577330508474576, "grad_norm": 1.7522087097167969, "learning_rate": 9.22126588983051e-06, "loss": 1.3316, "mean_token_accuracy": 0.6929865404963493, "num_tokens": 4738861.0, "step": 5882 }, { "epoch": 1.5582627118644068, "grad_norm": 2.0321521759033203, "learning_rate": 9.221001059322035e-06, "loss": 0.9895, "mean_token_accuracy": 0.7514083161950111, "num_tokens": 4740134.0, "step": 5884 }, { "epoch": 1.558792372881356, "grad_norm": 1.843224287033081, "learning_rate": 9.22073622881356e-06, "loss": 1.6153, "mean_token_accuracy": 0.6645276471972466, "num_tokens": 4741508.0, "step": 5886 }, { "epoch": 1.559322033898305, "grad_norm": 1.8748964071273804, "learning_rate": 9.220471398305085e-06, "loss": 1.6417, "mean_token_accuracy": 0.665631115436554, "num_tokens": 4743070.0, "step": 5888 }, { "epoch": 1.5598516949152543, "grad_norm": 1.6432043313980103, "learning_rate": 9.220206567796612e-06, "loss": 1.1307, "mean_token_accuracy": 0.7156916931271553, "num_tokens": 4744477.0, "step": 5890 }, { "epoch": 1.5603813559322033, "grad_norm": 2.118415117263794, "learning_rate": 9.219941737288137e-06, "loss": 1.9287, "mean_token_accuracy": 0.5955505445599556, "num_tokens": 4745927.0, "step": 5892 }, { "epoch": 1.5609110169491527, "grad_norm": 1.487673044204712, "learning_rate": 9.219676906779662e-06, "loss": 0.9297, "mean_token_accuracy": 0.7671007215976715, "num_tokens": 4747412.0, "step": 5894 }, { "epoch": 1.5614406779661016, "grad_norm": 1.5183545351028442, "learning_rate": 9.219412076271186e-06, "loss": 1.0248, "mean_token_accuracy": 0.7619152069091797, "num_tokens": 4748901.0, "step": 5896 }, { "epoch": 1.5619703389830508, "grad_norm": 1.4412094354629517, "learning_rate": 9.219147245762713e-06, "loss": 1.195, "mean_token_accuracy": 0.7102158404886723, "num_tokens": 4750679.0, "step": 5898 }, { "epoch": 1.5625, "grad_norm": 1.4668077230453491, "learning_rate": 9.218882415254238e-06, "loss": 0.9834, "mean_token_accuracy": 0.7569470629096031, "num_tokens": 4752180.0, "step": 5900 }, { "epoch": 1.5630296610169492, "grad_norm": 1.6383705139160156, "learning_rate": 9.218617584745763e-06, "loss": 1.6717, "mean_token_accuracy": 0.6220502108335495, "num_tokens": 4753793.0, "step": 5902 }, { "epoch": 1.5635593220338984, "grad_norm": 1.3721411228179932, "learning_rate": 9.218352754237288e-06, "loss": 1.3374, "mean_token_accuracy": 0.6849408000707626, "num_tokens": 4755454.0, "step": 5904 }, { "epoch": 1.5640889830508473, "grad_norm": 1.6779758930206299, "learning_rate": 9.218087923728814e-06, "loss": 1.4732, "mean_token_accuracy": 0.683961033821106, "num_tokens": 4757154.0, "step": 5906 }, { "epoch": 1.5646186440677967, "grad_norm": 1.1716127395629883, "learning_rate": 9.21782309322034e-06, "loss": 1.0161, "mean_token_accuracy": 0.745861366391182, "num_tokens": 4758808.0, "step": 5908 }, { "epoch": 1.5651483050847457, "grad_norm": 1.8058689832687378, "learning_rate": 9.217558262711866e-06, "loss": 1.6359, "mean_token_accuracy": 0.6790705993771553, "num_tokens": 4760497.0, "step": 5910 }, { "epoch": 1.565677966101695, "grad_norm": 1.3992174863815308, "learning_rate": 9.217293432203391e-06, "loss": 1.1073, "mean_token_accuracy": 0.7545427531003952, "num_tokens": 4761948.0, "step": 5912 }, { "epoch": 1.566207627118644, "grad_norm": 1.6208508014678955, "learning_rate": 9.217028601694916e-06, "loss": 1.5087, "mean_token_accuracy": 0.6602590121328831, "num_tokens": 4763352.0, "step": 5914 }, { "epoch": 1.5667372881355932, "grad_norm": 1.6849267482757568, "learning_rate": 9.21676377118644e-06, "loss": 1.6295, "mean_token_accuracy": 0.6277063153684139, "num_tokens": 4764869.0, "step": 5916 }, { "epoch": 1.5672669491525424, "grad_norm": 2.054044246673584, "learning_rate": 9.216498940677967e-06, "loss": 1.6044, "mean_token_accuracy": 0.6427688002586365, "num_tokens": 4766612.0, "step": 5918 }, { "epoch": 1.5677966101694916, "grad_norm": 1.0723742246627808, "learning_rate": 9.216234110169492e-06, "loss": 1.3139, "mean_token_accuracy": 0.701378621160984, "num_tokens": 4768229.0, "step": 5920 }, { "epoch": 1.5683262711864407, "grad_norm": 1.4636894464492798, "learning_rate": 9.215969279661017e-06, "loss": 1.0837, "mean_token_accuracy": 0.7440905869007111, "num_tokens": 4769779.0, "step": 5922 }, { "epoch": 1.5688559322033897, "grad_norm": 1.5270977020263672, "learning_rate": 9.215704449152542e-06, "loss": 1.0066, "mean_token_accuracy": 0.7393020391464233, "num_tokens": 4771542.0, "step": 5924 }, { "epoch": 1.569385593220339, "grad_norm": 1.3367362022399902, "learning_rate": 9.215439618644069e-06, "loss": 0.899, "mean_token_accuracy": 0.7733980193734169, "num_tokens": 4772957.0, "step": 5926 }, { "epoch": 1.569915254237288, "grad_norm": 1.5034557580947876, "learning_rate": 9.215174788135594e-06, "loss": 1.4216, "mean_token_accuracy": 0.6971366815268993, "num_tokens": 4774480.0, "step": 5928 }, { "epoch": 1.5704449152542372, "grad_norm": 1.1948106288909912, "learning_rate": 9.214909957627119e-06, "loss": 1.1915, "mean_token_accuracy": 0.7261747941374779, "num_tokens": 4776098.0, "step": 5930 }, { "epoch": 1.5709745762711864, "grad_norm": 1.7187443971633911, "learning_rate": 9.214645127118644e-06, "loss": 1.4302, "mean_token_accuracy": 0.6790416687726974, "num_tokens": 4777762.0, "step": 5932 }, { "epoch": 1.5715042372881356, "grad_norm": 2.0597174167633057, "learning_rate": 9.21438029661017e-06, "loss": 1.4127, "mean_token_accuracy": 0.6967996209859848, "num_tokens": 4779372.0, "step": 5934 }, { "epoch": 1.5720338983050848, "grad_norm": 1.6942553520202637, "learning_rate": 9.214115466101695e-06, "loss": 1.4132, "mean_token_accuracy": 0.6832864955067635, "num_tokens": 4780994.0, "step": 5936 }, { "epoch": 1.5725635593220337, "grad_norm": 1.6619575023651123, "learning_rate": 9.213850635593222e-06, "loss": 1.4427, "mean_token_accuracy": 0.6678834855556488, "num_tokens": 4782603.0, "step": 5938 }, { "epoch": 1.5730932203389831, "grad_norm": 1.3824076652526855, "learning_rate": 9.213585805084747e-06, "loss": 1.2202, "mean_token_accuracy": 0.711508721113205, "num_tokens": 4784251.0, "step": 5940 }, { "epoch": 1.573622881355932, "grad_norm": 1.3791484832763672, "learning_rate": 9.213320974576272e-06, "loss": 1.4326, "mean_token_accuracy": 0.6910076141357422, "num_tokens": 4786186.0, "step": 5942 }, { "epoch": 1.5741525423728815, "grad_norm": 1.9212367534637451, "learning_rate": 9.213056144067798e-06, "loss": 1.2729, "mean_token_accuracy": 0.7113937512040138, "num_tokens": 4787725.0, "step": 5944 }, { "epoch": 1.5746822033898304, "grad_norm": 1.6959631443023682, "learning_rate": 9.212791313559323e-06, "loss": 1.2289, "mean_token_accuracy": 0.7121476382017136, "num_tokens": 4789539.0, "step": 5946 }, { "epoch": 1.5752118644067796, "grad_norm": 1.5486876964569092, "learning_rate": 9.212526483050848e-06, "loss": 1.4015, "mean_token_accuracy": 0.6984347924590111, "num_tokens": 4791129.0, "step": 5948 }, { "epoch": 1.5757415254237288, "grad_norm": 2.112271547317505, "learning_rate": 9.212261652542373e-06, "loss": 1.6463, "mean_token_accuracy": 0.6447065658867359, "num_tokens": 4792698.0, "step": 5950 }, { "epoch": 1.576271186440678, "grad_norm": 1.9385464191436768, "learning_rate": 9.2119968220339e-06, "loss": 1.5565, "mean_token_accuracy": 0.6641016155481339, "num_tokens": 4794264.0, "step": 5952 }, { "epoch": 1.5768008474576272, "grad_norm": 1.74380362033844, "learning_rate": 9.211731991525425e-06, "loss": 1.5619, "mean_token_accuracy": 0.6551576927304268, "num_tokens": 4795754.0, "step": 5954 }, { "epoch": 1.5773305084745761, "grad_norm": 1.73235023021698, "learning_rate": 9.21146716101695e-06, "loss": 1.4762, "mean_token_accuracy": 0.6999996900558472, "num_tokens": 4797179.0, "step": 5956 }, { "epoch": 1.5778601694915255, "grad_norm": 1.629770278930664, "learning_rate": 9.211202330508474e-06, "loss": 1.0627, "mean_token_accuracy": 0.7493651397526264, "num_tokens": 4798777.0, "step": 5958 }, { "epoch": 1.5783898305084745, "grad_norm": 1.6705427169799805, "learning_rate": 9.210937500000001e-06, "loss": 1.2733, "mean_token_accuracy": 0.7153357416391373, "num_tokens": 4801252.0, "step": 5960 }, { "epoch": 1.5789194915254239, "grad_norm": 1.4571093320846558, "learning_rate": 9.210672669491526e-06, "loss": 1.4107, "mean_token_accuracy": 0.6809120699763298, "num_tokens": 4802868.0, "step": 5962 }, { "epoch": 1.5794491525423728, "grad_norm": 1.4007320404052734, "learning_rate": 9.210407838983053e-06, "loss": 1.2346, "mean_token_accuracy": 0.7270300760865211, "num_tokens": 4804486.0, "step": 5964 }, { "epoch": 1.579978813559322, "grad_norm": 1.2833278179168701, "learning_rate": 9.210143008474578e-06, "loss": 1.1975, "mean_token_accuracy": 0.6967112571001053, "num_tokens": 4806023.0, "step": 5966 }, { "epoch": 1.5805084745762712, "grad_norm": 1.802822470664978, "learning_rate": 9.209878177966103e-06, "loss": 1.4117, "mean_token_accuracy": 0.697545126080513, "num_tokens": 4807613.0, "step": 5968 }, { "epoch": 1.5810381355932204, "grad_norm": 1.4984098672866821, "learning_rate": 9.209613347457627e-06, "loss": 1.25, "mean_token_accuracy": 0.7101231552660465, "num_tokens": 4809267.0, "step": 5970 }, { "epoch": 1.5815677966101696, "grad_norm": 1.8145815134048462, "learning_rate": 9.209348516949154e-06, "loss": 1.5222, "mean_token_accuracy": 0.662606880068779, "num_tokens": 4810724.0, "step": 5972 }, { "epoch": 1.5820974576271185, "grad_norm": 1.7811896800994873, "learning_rate": 9.209083686440679e-06, "loss": 0.9581, "mean_token_accuracy": 0.7687723338603973, "num_tokens": 4812136.0, "step": 5974 }, { "epoch": 1.582627118644068, "grad_norm": 1.6028271913528442, "learning_rate": 9.208818855932204e-06, "loss": 1.5628, "mean_token_accuracy": 0.6579759158194065, "num_tokens": 4813715.0, "step": 5976 }, { "epoch": 1.5831567796610169, "grad_norm": 1.3883720636367798, "learning_rate": 9.208554025423729e-06, "loss": 1.0675, "mean_token_accuracy": 0.7346952706575394, "num_tokens": 4815377.0, "step": 5978 }, { "epoch": 1.5836864406779663, "grad_norm": 1.2262414693832397, "learning_rate": 9.208289194915255e-06, "loss": 0.849, "mean_token_accuracy": 0.7648626044392586, "num_tokens": 4817624.0, "step": 5980 }, { "epoch": 1.5842161016949152, "grad_norm": 1.6898024082183838, "learning_rate": 9.20802436440678e-06, "loss": 1.2964, "mean_token_accuracy": 0.6953293085098267, "num_tokens": 4819269.0, "step": 5982 }, { "epoch": 1.5847457627118644, "grad_norm": 1.4497672319412231, "learning_rate": 9.207759533898305e-06, "loss": 1.1237, "mean_token_accuracy": 0.7321931943297386, "num_tokens": 4821015.0, "step": 5984 }, { "epoch": 1.5852754237288136, "grad_norm": 1.490647554397583, "learning_rate": 9.20749470338983e-06, "loss": 1.0638, "mean_token_accuracy": 0.7435379922389984, "num_tokens": 4822408.0, "step": 5986 }, { "epoch": 1.5858050847457628, "grad_norm": 1.3229176998138428, "learning_rate": 9.207229872881357e-06, "loss": 1.3163, "mean_token_accuracy": 0.7004328705370426, "num_tokens": 4824168.0, "step": 5988 }, { "epoch": 1.586334745762712, "grad_norm": 1.298750400543213, "learning_rate": 9.206965042372882e-06, "loss": 1.3454, "mean_token_accuracy": 0.6974105834960938, "num_tokens": 4826140.0, "step": 5990 }, { "epoch": 1.586864406779661, "grad_norm": 1.545967936515808, "learning_rate": 9.206700211864408e-06, "loss": 0.8279, "mean_token_accuracy": 0.797598347067833, "num_tokens": 4827537.0, "step": 5992 }, { "epoch": 1.5873940677966103, "grad_norm": 1.5315537452697754, "learning_rate": 9.206435381355933e-06, "loss": 1.5557, "mean_token_accuracy": 0.6540053710341454, "num_tokens": 4829293.0, "step": 5994 }, { "epoch": 1.5879237288135593, "grad_norm": 1.4791622161865234, "learning_rate": 9.206170550847458e-06, "loss": 1.2061, "mean_token_accuracy": 0.7199132442474365, "num_tokens": 4830853.0, "step": 5996 }, { "epoch": 1.5884533898305084, "grad_norm": 1.7309364080429077, "learning_rate": 9.205905720338983e-06, "loss": 1.3979, "mean_token_accuracy": 0.6926534101366997, "num_tokens": 4832268.0, "step": 5998 }, { "epoch": 1.5889830508474576, "grad_norm": 1.5560246706008911, "learning_rate": 9.20564088983051e-06, "loss": 1.3731, "step": 6000 }, { "epoch": 1.5889830508474576, "eval_loss": 1.3176281452178955, "eval_mean_token_accuracy": 0.6993492541762142, "eval_num_tokens": 4833889.0, "eval_runtime": 48.7999, "eval_samples_per_second": 6.311, "eval_steps_per_second": 6.311, "step": 6000 }, { "epoch": 1.5895127118644068, "grad_norm": 2.02890944480896, "learning_rate": 9.205376059322035e-06, "loss": 1.4184, "mean_token_accuracy": 0.6915656551718712, "num_tokens": 4835517.0, "step": 6002 }, { "epoch": 1.590042372881356, "grad_norm": 1.75142502784729, "learning_rate": 9.20511122881356e-06, "loss": 1.5153, "mean_token_accuracy": 0.6607195883989334, "num_tokens": 4837144.0, "step": 6004 }, { "epoch": 1.590572033898305, "grad_norm": 1.9024109840393066, "learning_rate": 9.204846398305085e-06, "loss": 1.7413, "mean_token_accuracy": 0.5997967347502708, "num_tokens": 4838697.0, "step": 6006 }, { "epoch": 1.5911016949152543, "grad_norm": 1.6024196147918701, "learning_rate": 9.204581567796611e-06, "loss": 1.3014, "mean_token_accuracy": 0.7105267345905304, "num_tokens": 4840100.0, "step": 6008 }, { "epoch": 1.5916313559322033, "grad_norm": 1.279100775718689, "learning_rate": 9.204316737288136e-06, "loss": 0.9399, "mean_token_accuracy": 0.7745695561170578, "num_tokens": 4841706.0, "step": 6010 }, { "epoch": 1.5921610169491527, "grad_norm": 1.9080122709274292, "learning_rate": 9.204051906779661e-06, "loss": 1.052, "mean_token_accuracy": 0.7193062528967857, "num_tokens": 4844059.0, "step": 6012 }, { "epoch": 1.5926906779661016, "grad_norm": 1.3056561946868896, "learning_rate": 9.203787076271186e-06, "loss": 1.4456, "mean_token_accuracy": 0.6906970143318176, "num_tokens": 4845452.0, "step": 6014 }, { "epoch": 1.5932203389830508, "grad_norm": 1.5618363618850708, "learning_rate": 9.203522245762713e-06, "loss": 1.4229, "mean_token_accuracy": 0.6767306290566921, "num_tokens": 4847258.0, "step": 6016 }, { "epoch": 1.59375, "grad_norm": 2.0653116703033447, "learning_rate": 9.203257415254238e-06, "loss": 1.2627, "mean_token_accuracy": 0.724895067512989, "num_tokens": 4848608.0, "step": 6018 }, { "epoch": 1.5942796610169492, "grad_norm": 1.554030418395996, "learning_rate": 9.202992584745764e-06, "loss": 1.3133, "mean_token_accuracy": 0.7250455841422081, "num_tokens": 4850251.0, "step": 6020 }, { "epoch": 1.5948093220338984, "grad_norm": 1.7055630683898926, "learning_rate": 9.20272775423729e-06, "loss": 0.9791, "mean_token_accuracy": 0.7496231198310852, "num_tokens": 4851709.0, "step": 6022 }, { "epoch": 1.5953389830508473, "grad_norm": 1.6652714014053345, "learning_rate": 9.202462923728814e-06, "loss": 1.4577, "mean_token_accuracy": 0.6503535509109497, "num_tokens": 4853390.0, "step": 6024 }, { "epoch": 1.5958686440677967, "grad_norm": 1.572054147720337, "learning_rate": 9.202198093220339e-06, "loss": 1.2751, "mean_token_accuracy": 0.6967690661549568, "num_tokens": 4854940.0, "step": 6026 }, { "epoch": 1.5963983050847457, "grad_norm": 1.4819179773330688, "learning_rate": 9.201933262711866e-06, "loss": 1.0293, "mean_token_accuracy": 0.7625196799635887, "num_tokens": 4856474.0, "step": 6028 }, { "epoch": 1.596927966101695, "grad_norm": 1.6070261001586914, "learning_rate": 9.20166843220339e-06, "loss": 1.1989, "mean_token_accuracy": 0.7143897898495197, "num_tokens": 4858106.0, "step": 6030 }, { "epoch": 1.597457627118644, "grad_norm": 1.4917564392089844, "learning_rate": 9.201403601694916e-06, "loss": 1.2411, "mean_token_accuracy": 0.7102661356329918, "num_tokens": 4859658.0, "step": 6032 }, { "epoch": 1.5979872881355932, "grad_norm": 1.5021963119506836, "learning_rate": 9.201138771186442e-06, "loss": 1.5584, "mean_token_accuracy": 0.6761854030191898, "num_tokens": 4861680.0, "step": 6034 }, { "epoch": 1.5985169491525424, "grad_norm": 1.5224876403808594, "learning_rate": 9.200873940677967e-06, "loss": 1.4862, "mean_token_accuracy": 0.6637970581650734, "num_tokens": 4863518.0, "step": 6036 }, { "epoch": 1.5990466101694916, "grad_norm": 1.5341218709945679, "learning_rate": 9.200609110169492e-06, "loss": 1.2247, "mean_token_accuracy": 0.7089535295963287, "num_tokens": 4865045.0, "step": 6038 }, { "epoch": 1.5995762711864407, "grad_norm": 1.5420727729797363, "learning_rate": 9.200344279661017e-06, "loss": 1.1734, "mean_token_accuracy": 0.697243258357048, "num_tokens": 4866474.0, "step": 6040 }, { "epoch": 1.6001059322033897, "grad_norm": 1.822769284248352, "learning_rate": 9.200079449152544e-06, "loss": 1.2484, "mean_token_accuracy": 0.7078445106744766, "num_tokens": 4868040.0, "step": 6042 }, { "epoch": 1.600635593220339, "grad_norm": 1.6073321104049683, "learning_rate": 9.199814618644068e-06, "loss": 1.5198, "mean_token_accuracy": 0.6567173860967159, "num_tokens": 4869620.0, "step": 6044 }, { "epoch": 1.601165254237288, "grad_norm": 1.7028354406356812, "learning_rate": 9.199549788135595e-06, "loss": 1.5044, "mean_token_accuracy": 0.6499039232730865, "num_tokens": 4871226.0, "step": 6046 }, { "epoch": 1.6016949152542372, "grad_norm": 1.702235221862793, "learning_rate": 9.19928495762712e-06, "loss": 1.5302, "mean_token_accuracy": 0.6548959240317345, "num_tokens": 4872727.0, "step": 6048 }, { "epoch": 1.6022245762711864, "grad_norm": 1.6885497570037842, "learning_rate": 9.199020127118645e-06, "loss": 1.4838, "mean_token_accuracy": 0.6797542870044708, "num_tokens": 4874346.0, "step": 6050 }, { "epoch": 1.6027542372881356, "grad_norm": 1.7567991018295288, "learning_rate": 9.19875529661017e-06, "loss": 1.0312, "mean_token_accuracy": 0.7473221644759178, "num_tokens": 4876092.0, "step": 6052 }, { "epoch": 1.6032838983050848, "grad_norm": 1.5438101291656494, "learning_rate": 9.198490466101697e-06, "loss": 1.2157, "mean_token_accuracy": 0.7237057238817215, "num_tokens": 4877846.0, "step": 6054 }, { "epoch": 1.6038135593220337, "grad_norm": 1.5768640041351318, "learning_rate": 9.198225635593221e-06, "loss": 1.2789, "mean_token_accuracy": 0.7461013197898865, "num_tokens": 4879168.0, "step": 6056 }, { "epoch": 1.6043432203389831, "grad_norm": 1.7047683000564575, "learning_rate": 9.197960805084746e-06, "loss": 1.5073, "mean_token_accuracy": 0.6715851649641991, "num_tokens": 4880776.0, "step": 6058 }, { "epoch": 1.604872881355932, "grad_norm": 1.4372146129608154, "learning_rate": 9.197695974576271e-06, "loss": 1.6801, "mean_token_accuracy": 0.6547995209693909, "num_tokens": 4882330.0, "step": 6060 }, { "epoch": 1.6054025423728815, "grad_norm": 1.8418992757797241, "learning_rate": 9.197431144067798e-06, "loss": 1.5694, "mean_token_accuracy": 0.668797567486763, "num_tokens": 4883841.0, "step": 6062 }, { "epoch": 1.6059322033898304, "grad_norm": 1.6093566417694092, "learning_rate": 9.197166313559323e-06, "loss": 1.1945, "mean_token_accuracy": 0.7291547283530235, "num_tokens": 4885400.0, "step": 6064 }, { "epoch": 1.6064618644067796, "grad_norm": 1.758481740951538, "learning_rate": 9.196901483050848e-06, "loss": 1.2051, "mean_token_accuracy": 0.7154102101922035, "num_tokens": 4886993.0, "step": 6066 }, { "epoch": 1.6069915254237288, "grad_norm": 1.3641256093978882, "learning_rate": 9.196636652542373e-06, "loss": 1.2787, "mean_token_accuracy": 0.7127728909254074, "num_tokens": 4888723.0, "step": 6068 }, { "epoch": 1.607521186440678, "grad_norm": 1.4287258386611938, "learning_rate": 9.1963718220339e-06, "loss": 1.1968, "mean_token_accuracy": 0.7176511734724045, "num_tokens": 4890450.0, "step": 6070 }, { "epoch": 1.6080508474576272, "grad_norm": 1.6288456916809082, "learning_rate": 9.196106991525424e-06, "loss": 1.5581, "mean_token_accuracy": 0.6350398659706116, "num_tokens": 4892071.0, "step": 6072 }, { "epoch": 1.6085805084745761, "grad_norm": 1.6094510555267334, "learning_rate": 9.195842161016951e-06, "loss": 1.6519, "mean_token_accuracy": 0.6520341597497463, "num_tokens": 4893894.0, "step": 6074 }, { "epoch": 1.6091101694915255, "grad_norm": 1.7566437721252441, "learning_rate": 9.195577330508476e-06, "loss": 1.4707, "mean_token_accuracy": 0.6847673431038857, "num_tokens": 4895592.0, "step": 6076 }, { "epoch": 1.6096398305084745, "grad_norm": 1.7119956016540527, "learning_rate": 9.1953125e-06, "loss": 1.4309, "mean_token_accuracy": 0.6705470681190491, "num_tokens": 4897078.0, "step": 6078 }, { "epoch": 1.6101694915254239, "grad_norm": 1.5777301788330078, "learning_rate": 9.195047669491526e-06, "loss": 0.9726, "mean_token_accuracy": 0.747036837041378, "num_tokens": 4898612.0, "step": 6080 }, { "epoch": 1.6106991525423728, "grad_norm": 1.5037105083465576, "learning_rate": 9.194782838983052e-06, "loss": 1.1299, "mean_token_accuracy": 0.7352149188518524, "num_tokens": 4900026.0, "step": 6082 }, { "epoch": 1.611228813559322, "grad_norm": 1.6877351999282837, "learning_rate": 9.194518008474577e-06, "loss": 1.3253, "mean_token_accuracy": 0.6858278661966324, "num_tokens": 4901647.0, "step": 6084 }, { "epoch": 1.6117584745762712, "grad_norm": 1.5421183109283447, "learning_rate": 9.194253177966102e-06, "loss": 1.2752, "mean_token_accuracy": 0.7047963812947273, "num_tokens": 4902987.0, "step": 6086 }, { "epoch": 1.6122881355932204, "grad_norm": 1.5047656297683716, "learning_rate": 9.193988347457627e-06, "loss": 1.165, "mean_token_accuracy": 0.7307403609156609, "num_tokens": 4904279.0, "step": 6088 }, { "epoch": 1.6128177966101696, "grad_norm": 1.2775782346725464, "learning_rate": 9.193723516949154e-06, "loss": 1.6598, "mean_token_accuracy": 0.5853173211216927, "num_tokens": 4906941.0, "step": 6090 }, { "epoch": 1.6133474576271185, "grad_norm": 1.5970321893692017, "learning_rate": 9.193458686440679e-06, "loss": 1.3994, "mean_token_accuracy": 0.6550223156809807, "num_tokens": 4908477.0, "step": 6092 }, { "epoch": 1.613877118644068, "grad_norm": 2.014875650405884, "learning_rate": 9.193193855932204e-06, "loss": 1.599, "mean_token_accuracy": 0.6373291835188866, "num_tokens": 4909902.0, "step": 6094 }, { "epoch": 1.6144067796610169, "grad_norm": 1.6844439506530762, "learning_rate": 9.192929025423728e-06, "loss": 1.2009, "mean_token_accuracy": 0.7169523760676384, "num_tokens": 4911614.0, "step": 6096 }, { "epoch": 1.6149364406779663, "grad_norm": 2.4492502212524414, "learning_rate": 9.192664194915255e-06, "loss": 1.6505, "mean_token_accuracy": 0.6701070070266724, "num_tokens": 4912859.0, "step": 6098 }, { "epoch": 1.6154661016949152, "grad_norm": 1.8753752708435059, "learning_rate": 9.19239936440678e-06, "loss": 1.3445, "mean_token_accuracy": 0.680817537009716, "num_tokens": 4914729.0, "step": 6100 }, { "epoch": 1.6159957627118644, "grad_norm": 1.7778536081314087, "learning_rate": 9.192134533898307e-06, "loss": 1.389, "mean_token_accuracy": 0.6812112852931023, "num_tokens": 4916399.0, "step": 6102 }, { "epoch": 1.6165254237288136, "grad_norm": 1.6633142232894897, "learning_rate": 9.191869703389832e-06, "loss": 1.3738, "mean_token_accuracy": 0.7294364497065544, "num_tokens": 4917719.0, "step": 6104 }, { "epoch": 1.6170550847457628, "grad_norm": 1.6791220903396606, "learning_rate": 9.191604872881357e-06, "loss": 1.2668, "mean_token_accuracy": 0.7196221873164177, "num_tokens": 4919197.0, "step": 6106 }, { "epoch": 1.617584745762712, "grad_norm": 1.353939414024353, "learning_rate": 9.191340042372881e-06, "loss": 1.5823, "mean_token_accuracy": 0.6413134858012199, "num_tokens": 4921126.0, "step": 6108 }, { "epoch": 1.618114406779661, "grad_norm": 1.7105131149291992, "learning_rate": 9.191075211864408e-06, "loss": 1.6295, "mean_token_accuracy": 0.6538231298327446, "num_tokens": 4922843.0, "step": 6110 }, { "epoch": 1.6186440677966103, "grad_norm": 1.4344154596328735, "learning_rate": 9.190810381355933e-06, "loss": 0.9504, "mean_token_accuracy": 0.7849168330430984, "num_tokens": 4924251.0, "step": 6112 }, { "epoch": 1.6191737288135593, "grad_norm": 1.8168030977249146, "learning_rate": 9.190545550847458e-06, "loss": 1.2745, "mean_token_accuracy": 0.7251601666212082, "num_tokens": 4926009.0, "step": 6114 }, { "epoch": 1.6197033898305084, "grad_norm": 1.5185377597808838, "learning_rate": 9.190280720338985e-06, "loss": 1.1759, "mean_token_accuracy": 0.7257948331534863, "num_tokens": 4927691.0, "step": 6116 }, { "epoch": 1.6202330508474576, "grad_norm": 1.6832125186920166, "learning_rate": 9.19001588983051e-06, "loss": 1.5288, "mean_token_accuracy": 0.6715108156204224, "num_tokens": 4929097.0, "step": 6118 }, { "epoch": 1.6207627118644068, "grad_norm": 1.5586177110671997, "learning_rate": 9.189751059322034e-06, "loss": 1.1421, "mean_token_accuracy": 0.7505735009908676, "num_tokens": 4930500.0, "step": 6120 }, { "epoch": 1.621292372881356, "grad_norm": 1.6302849054336548, "learning_rate": 9.18948622881356e-06, "loss": 1.4983, "mean_token_accuracy": 0.6710459217429161, "num_tokens": 4932089.0, "step": 6122 }, { "epoch": 1.621822033898305, "grad_norm": 1.9365568161010742, "learning_rate": 9.189221398305086e-06, "loss": 1.1797, "mean_token_accuracy": 0.7217025831341743, "num_tokens": 4933602.0, "step": 6124 }, { "epoch": 1.6223516949152543, "grad_norm": 1.3743672370910645, "learning_rate": 9.188956567796611e-06, "loss": 0.7957, "mean_token_accuracy": 0.7920208647847176, "num_tokens": 4935067.0, "step": 6126 }, { "epoch": 1.6228813559322033, "grad_norm": 1.9626836776733398, "learning_rate": 9.188691737288138e-06, "loss": 1.3255, "mean_token_accuracy": 0.6960393749177456, "num_tokens": 4936548.0, "step": 6128 }, { "epoch": 1.6234110169491527, "grad_norm": 1.6128554344177246, "learning_rate": 9.188426906779662e-06, "loss": 1.2749, "mean_token_accuracy": 0.7022073939442635, "num_tokens": 4938061.0, "step": 6130 }, { "epoch": 1.6239406779661016, "grad_norm": 1.4944889545440674, "learning_rate": 9.188162076271187e-06, "loss": 1.1338, "mean_token_accuracy": 0.7185666486620903, "num_tokens": 4939725.0, "step": 6132 }, { "epoch": 1.6244703389830508, "grad_norm": 1.8568650484085083, "learning_rate": 9.187897245762712e-06, "loss": 1.2709, "mean_token_accuracy": 0.6959889307618141, "num_tokens": 4941383.0, "step": 6134 }, { "epoch": 1.625, "grad_norm": 1.4388607740402222, "learning_rate": 9.187632415254239e-06, "loss": 1.0869, "mean_token_accuracy": 0.7565033063292503, "num_tokens": 4942948.0, "step": 6136 }, { "epoch": 1.6255296610169492, "grad_norm": 1.2723623514175415, "learning_rate": 9.187367584745764e-06, "loss": 1.4584, "mean_token_accuracy": 0.6927084363996983, "num_tokens": 4944839.0, "step": 6138 }, { "epoch": 1.6260593220338984, "grad_norm": 1.660827875137329, "learning_rate": 9.187102754237289e-06, "loss": 1.2131, "mean_token_accuracy": 0.7232341170310974, "num_tokens": 4946423.0, "step": 6140 }, { "epoch": 1.6265889830508473, "grad_norm": 1.5608999729156494, "learning_rate": 9.186837923728814e-06, "loss": 1.6123, "mean_token_accuracy": 0.6479317769408226, "num_tokens": 4948096.0, "step": 6142 }, { "epoch": 1.6271186440677967, "grad_norm": 1.4605883359909058, "learning_rate": 9.18657309322034e-06, "loss": 1.0401, "mean_token_accuracy": 0.7412856668233871, "num_tokens": 4949605.0, "step": 6144 }, { "epoch": 1.6276483050847457, "grad_norm": 1.9210747480392456, "learning_rate": 9.186308262711865e-06, "loss": 1.6088, "mean_token_accuracy": 0.6575752794742584, "num_tokens": 4951203.0, "step": 6146 }, { "epoch": 1.628177966101695, "grad_norm": 1.5965360403060913, "learning_rate": 9.18604343220339e-06, "loss": 1.3559, "mean_token_accuracy": 0.6749341711401939, "num_tokens": 4952957.0, "step": 6148 }, { "epoch": 1.628707627118644, "grad_norm": 1.721600890159607, "learning_rate": 9.185778601694915e-06, "loss": 1.5003, "mean_token_accuracy": 0.6623788699507713, "num_tokens": 4954514.0, "step": 6150 }, { "epoch": 1.6292372881355932, "grad_norm": 1.5080881118774414, "learning_rate": 9.185513771186442e-06, "loss": 1.497, "mean_token_accuracy": 0.6902179047465324, "num_tokens": 4956041.0, "step": 6152 }, { "epoch": 1.6297669491525424, "grad_norm": 1.6425089836120605, "learning_rate": 9.185248940677967e-06, "loss": 1.1627, "mean_token_accuracy": 0.7518087066709995, "num_tokens": 4957720.0, "step": 6154 }, { "epoch": 1.6302966101694916, "grad_norm": 1.7256872653961182, "learning_rate": 9.184984110169493e-06, "loss": 1.1166, "mean_token_accuracy": 0.7425692975521088, "num_tokens": 4959190.0, "step": 6156 }, { "epoch": 1.6308262711864407, "grad_norm": 1.577414870262146, "learning_rate": 9.184719279661018e-06, "loss": 1.1245, "mean_token_accuracy": 0.7132248878479004, "num_tokens": 4960946.0, "step": 6158 }, { "epoch": 1.6313559322033897, "grad_norm": 1.613924264907837, "learning_rate": 9.184454449152543e-06, "loss": 1.3612, "mean_token_accuracy": 0.71122045814991, "num_tokens": 4962397.0, "step": 6160 }, { "epoch": 1.631885593220339, "grad_norm": 1.6127985715866089, "learning_rate": 9.184189618644068e-06, "loss": 1.224, "mean_token_accuracy": 0.7199060022830963, "num_tokens": 4964027.0, "step": 6162 }, { "epoch": 1.632415254237288, "grad_norm": 1.7335058450698853, "learning_rate": 9.183924788135595e-06, "loss": 1.3506, "mean_token_accuracy": 0.6920731663703918, "num_tokens": 4965886.0, "step": 6164 }, { "epoch": 1.6329449152542372, "grad_norm": 1.7089219093322754, "learning_rate": 9.18365995762712e-06, "loss": 1.4418, "mean_token_accuracy": 0.6997049003839493, "num_tokens": 4967651.0, "step": 6166 }, { "epoch": 1.6334745762711864, "grad_norm": 1.1396825313568115, "learning_rate": 9.183395127118645e-06, "loss": 1.5183, "mean_token_accuracy": 0.6516091302037239, "num_tokens": 4970174.0, "step": 6168 }, { "epoch": 1.6340042372881356, "grad_norm": 1.9586517810821533, "learning_rate": 9.18313029661017e-06, "loss": 1.404, "mean_token_accuracy": 0.6710131391882896, "num_tokens": 4971613.0, "step": 6170 }, { "epoch": 1.6345338983050848, "grad_norm": 1.9023826122283936, "learning_rate": 9.182865466101696e-06, "loss": 1.1056, "mean_token_accuracy": 0.7179487943649292, "num_tokens": 4972926.0, "step": 6172 }, { "epoch": 1.6350635593220337, "grad_norm": 1.573426365852356, "learning_rate": 9.182600635593221e-06, "loss": 1.409, "mean_token_accuracy": 0.6879361644387245, "num_tokens": 4974614.0, "step": 6174 }, { "epoch": 1.6355932203389831, "grad_norm": 1.2681365013122559, "learning_rate": 9.182335805084746e-06, "loss": 0.729, "mean_token_accuracy": 0.8054801300168037, "num_tokens": 4976176.0, "step": 6176 }, { "epoch": 1.636122881355932, "grad_norm": 1.8412123918533325, "learning_rate": 9.182070974576271e-06, "loss": 1.2731, "mean_token_accuracy": 0.695757769048214, "num_tokens": 4977676.0, "step": 6178 }, { "epoch": 1.6366525423728815, "grad_norm": 1.5862493515014648, "learning_rate": 9.181806144067798e-06, "loss": 1.4882, "mean_token_accuracy": 0.6793697848916054, "num_tokens": 4979437.0, "step": 6180 }, { "epoch": 1.6371822033898304, "grad_norm": 1.3330605030059814, "learning_rate": 9.181541313559322e-06, "loss": 1.2845, "mean_token_accuracy": 0.6984110549092293, "num_tokens": 4981406.0, "step": 6182 }, { "epoch": 1.6377118644067796, "grad_norm": 1.722135066986084, "learning_rate": 9.181276483050849e-06, "loss": 1.585, "mean_token_accuracy": 0.655475590378046, "num_tokens": 4983009.0, "step": 6184 }, { "epoch": 1.6382415254237288, "grad_norm": 1.701948881149292, "learning_rate": 9.181011652542372e-06, "loss": 1.2934, "mean_token_accuracy": 0.7104171589016914, "num_tokens": 4984703.0, "step": 6186 }, { "epoch": 1.638771186440678, "grad_norm": 1.8315757513046265, "learning_rate": 9.180746822033899e-06, "loss": 1.5053, "mean_token_accuracy": 0.6820231899619102, "num_tokens": 4986072.0, "step": 6188 }, { "epoch": 1.6393008474576272, "grad_norm": 1.597528338432312, "learning_rate": 9.180481991525424e-06, "loss": 1.7035, "mean_token_accuracy": 0.6257113888859749, "num_tokens": 4987834.0, "step": 6190 }, { "epoch": 1.6398305084745761, "grad_norm": 1.5862480401992798, "learning_rate": 9.18021716101695e-06, "loss": 1.6845, "mean_token_accuracy": 0.6433534547686577, "num_tokens": 4989516.0, "step": 6192 }, { "epoch": 1.6403601694915255, "grad_norm": 1.635435700416565, "learning_rate": 9.179952330508475e-06, "loss": 1.0171, "mean_token_accuracy": 0.7449576929211617, "num_tokens": 4991119.0, "step": 6194 }, { "epoch": 1.6408898305084745, "grad_norm": 1.6245856285095215, "learning_rate": 9.1796875e-06, "loss": 1.7265, "mean_token_accuracy": 0.64525006711483, "num_tokens": 4992719.0, "step": 6196 }, { "epoch": 1.6414194915254239, "grad_norm": 2.0295774936676025, "learning_rate": 9.179422669491527e-06, "loss": 1.5489, "mean_token_accuracy": 0.7012237086892128, "num_tokens": 4994319.0, "step": 6198 }, { "epoch": 1.6419491525423728, "grad_norm": 1.8308218717575073, "learning_rate": 9.179157838983052e-06, "loss": 1.6178, "mean_token_accuracy": 0.6616715490818024, "num_tokens": 4995772.0, "step": 6200 }, { "epoch": 1.642478813559322, "grad_norm": 1.5453156232833862, "learning_rate": 9.178893008474577e-06, "loss": 1.2749, "mean_token_accuracy": 0.7195152416825294, "num_tokens": 4997050.0, "step": 6202 }, { "epoch": 1.6430084745762712, "grad_norm": 1.7599551677703857, "learning_rate": 9.178628177966102e-06, "loss": 1.336, "mean_token_accuracy": 0.6922643259167671, "num_tokens": 4998446.0, "step": 6204 }, { "epoch": 1.6435381355932204, "grad_norm": 1.4145317077636719, "learning_rate": 9.178363347457628e-06, "loss": 1.2269, "mean_token_accuracy": 0.7128052599728107, "num_tokens": 5000025.0, "step": 6206 }, { "epoch": 1.6440677966101696, "grad_norm": 1.8667110204696655, "learning_rate": 9.178098516949153e-06, "loss": 1.274, "mean_token_accuracy": 0.7098240554332733, "num_tokens": 5001517.0, "step": 6208 }, { "epoch": 1.6445974576271185, "grad_norm": 1.7704437971115112, "learning_rate": 9.17783368644068e-06, "loss": 1.3967, "mean_token_accuracy": 0.6975041590631008, "num_tokens": 5002937.0, "step": 6210 }, { "epoch": 1.645127118644068, "grad_norm": 1.3555450439453125, "learning_rate": 9.177568855932205e-06, "loss": 1.2533, "mean_token_accuracy": 0.7258699685335159, "num_tokens": 5004573.0, "step": 6212 }, { "epoch": 1.6456567796610169, "grad_norm": 1.4871540069580078, "learning_rate": 9.17730402542373e-06, "loss": 1.2561, "mean_token_accuracy": 0.6963680237531662, "num_tokens": 5006479.0, "step": 6214 }, { "epoch": 1.6461864406779663, "grad_norm": 1.7062585353851318, "learning_rate": 9.177039194915255e-06, "loss": 1.4478, "mean_token_accuracy": 0.6993290409445763, "num_tokens": 5008098.0, "step": 6216 }, { "epoch": 1.6467161016949152, "grad_norm": 1.6503281593322754, "learning_rate": 9.176774364406781e-06, "loss": 1.5863, "mean_token_accuracy": 0.6688390821218491, "num_tokens": 5009961.0, "step": 6218 }, { "epoch": 1.6472457627118644, "grad_norm": 1.8237015008926392, "learning_rate": 9.176509533898306e-06, "loss": 1.6086, "mean_token_accuracy": 0.63998594135046, "num_tokens": 5011642.0, "step": 6220 }, { "epoch": 1.6477754237288136, "grad_norm": 1.7326289415359497, "learning_rate": 9.176244703389831e-06, "loss": 1.4515, "mean_token_accuracy": 0.691846676170826, "num_tokens": 5013377.0, "step": 6222 }, { "epoch": 1.6483050847457628, "grad_norm": 1.6150753498077393, "learning_rate": 9.175979872881356e-06, "loss": 1.1728, "mean_token_accuracy": 0.7016376033425331, "num_tokens": 5015155.0, "step": 6224 }, { "epoch": 1.648834745762712, "grad_norm": 1.8314564228057861, "learning_rate": 9.175715042372883e-06, "loss": 1.381, "mean_token_accuracy": 0.6987780034542084, "num_tokens": 5016763.0, "step": 6226 }, { "epoch": 1.649364406779661, "grad_norm": 1.5140337944030762, "learning_rate": 9.175450211864408e-06, "loss": 1.0145, "mean_token_accuracy": 0.758922629058361, "num_tokens": 5018326.0, "step": 6228 }, { "epoch": 1.6498940677966103, "grad_norm": 1.0609581470489502, "learning_rate": 9.175185381355933e-06, "loss": 1.2139, "mean_token_accuracy": 0.6987856179475784, "num_tokens": 5020679.0, "step": 6230 }, { "epoch": 1.6504237288135593, "grad_norm": 1.558985948562622, "learning_rate": 9.174920550847458e-06, "loss": 1.2654, "mean_token_accuracy": 0.6935751587152481, "num_tokens": 5022388.0, "step": 6232 }, { "epoch": 1.6509533898305084, "grad_norm": 1.5357160568237305, "learning_rate": 9.174655720338984e-06, "loss": 1.117, "mean_token_accuracy": 0.7598686814308167, "num_tokens": 5023873.0, "step": 6234 }, { "epoch": 1.6514830508474576, "grad_norm": 1.1653319597244263, "learning_rate": 9.174390889830509e-06, "loss": 1.0553, "mean_token_accuracy": 0.7619708701968193, "num_tokens": 5025781.0, "step": 6236 }, { "epoch": 1.6520127118644068, "grad_norm": 1.8969866037368774, "learning_rate": 9.174126059322036e-06, "loss": 1.5222, "mean_token_accuracy": 0.6593065485358238, "num_tokens": 5027218.0, "step": 6238 }, { "epoch": 1.652542372881356, "grad_norm": 1.402408242225647, "learning_rate": 9.173861228813559e-06, "loss": 1.161, "mean_token_accuracy": 0.7100326418876648, "num_tokens": 5028818.0, "step": 6240 }, { "epoch": 1.653072033898305, "grad_norm": 1.7241290807724, "learning_rate": 9.173596398305086e-06, "loss": 0.9307, "mean_token_accuracy": 0.7765909805893898, "num_tokens": 5030260.0, "step": 6242 }, { "epoch": 1.6536016949152543, "grad_norm": 1.251288652420044, "learning_rate": 9.17333156779661e-06, "loss": 1.5356, "mean_token_accuracy": 0.6617387309670448, "num_tokens": 5032236.0, "step": 6244 }, { "epoch": 1.6541313559322033, "grad_norm": 1.6239464282989502, "learning_rate": 9.173066737288137e-06, "loss": 0.8907, "mean_token_accuracy": 0.7826599776744843, "num_tokens": 5033698.0, "step": 6246 }, { "epoch": 1.6546610169491527, "grad_norm": 1.3974465131759644, "learning_rate": 9.172801906779662e-06, "loss": 1.7964, "mean_token_accuracy": 0.6002227962017059, "num_tokens": 5035819.0, "step": 6248 }, { "epoch": 1.6551906779661016, "grad_norm": 1.5374395847320557, "learning_rate": 9.172537076271187e-06, "loss": 1.4859, "step": 6250 }, { "epoch": 1.6551906779661016, "eval_loss": 1.317041039466858, "eval_mean_token_accuracy": 0.6996878331357782, "eval_num_tokens": 5037314.0, "eval_runtime": 48.8911, "eval_samples_per_second": 6.3, "eval_steps_per_second": 6.3, "step": 6250 }, { "epoch": 1.6557203389830508, "grad_norm": 1.3403031826019287, "learning_rate": 9.172272245762712e-06, "loss": 1.0924, "mean_token_accuracy": 0.6905650720000267, "num_tokens": 5038655.0, "step": 6252 }, { "epoch": 1.65625, "grad_norm": 1.3125486373901367, "learning_rate": 9.172007415254239e-06, "loss": 1.2166, "mean_token_accuracy": 0.721409298479557, "num_tokens": 5040408.0, "step": 6254 }, { "epoch": 1.6567796610169492, "grad_norm": 1.8124620914459229, "learning_rate": 9.171742584745763e-06, "loss": 1.3056, "mean_token_accuracy": 0.7131531834602356, "num_tokens": 5041925.0, "step": 6256 }, { "epoch": 1.6573093220338984, "grad_norm": 1.7371461391448975, "learning_rate": 9.171477754237288e-06, "loss": 1.3814, "mean_token_accuracy": 0.6890811994671822, "num_tokens": 5043486.0, "step": 6258 }, { "epoch": 1.6578389830508473, "grad_norm": 1.744136095046997, "learning_rate": 9.171212923728813e-06, "loss": 1.4358, "mean_token_accuracy": 0.6864831149578094, "num_tokens": 5044981.0, "step": 6260 }, { "epoch": 1.6583686440677967, "grad_norm": 1.6298725605010986, "learning_rate": 9.17094809322034e-06, "loss": 0.8113, "mean_token_accuracy": 0.8143189698457718, "num_tokens": 5046206.0, "step": 6262 }, { "epoch": 1.6588983050847457, "grad_norm": 1.036329746246338, "learning_rate": 9.170683262711865e-06, "loss": 0.9236, "mean_token_accuracy": 0.7916361019015312, "num_tokens": 5047774.0, "step": 6264 }, { "epoch": 1.659427966101695, "grad_norm": 1.4876890182495117, "learning_rate": 9.170418432203392e-06, "loss": 1.2434, "mean_token_accuracy": 0.7048654332756996, "num_tokens": 5049636.0, "step": 6266 }, { "epoch": 1.659957627118644, "grad_norm": 1.6874891519546509, "learning_rate": 9.170153601694915e-06, "loss": 1.0272, "mean_token_accuracy": 0.7621947675943375, "num_tokens": 5051076.0, "step": 6268 }, { "epoch": 1.6604872881355932, "grad_norm": 1.6336731910705566, "learning_rate": 9.169888771186441e-06, "loss": 1.5013, "mean_token_accuracy": 0.6599678471684456, "num_tokens": 5052941.0, "step": 6270 }, { "epoch": 1.6610169491525424, "grad_norm": 1.9543839693069458, "learning_rate": 9.169623940677966e-06, "loss": 1.3555, "mean_token_accuracy": 0.6869162544608116, "num_tokens": 5054163.0, "step": 6272 }, { "epoch": 1.6615466101694916, "grad_norm": 1.64411461353302, "learning_rate": 9.169359110169493e-06, "loss": 1.4394, "mean_token_accuracy": 0.6998415216803551, "num_tokens": 5055981.0, "step": 6274 }, { "epoch": 1.6620762711864407, "grad_norm": 1.4000486135482788, "learning_rate": 9.169094279661018e-06, "loss": 1.0564, "mean_token_accuracy": 0.7533652931451797, "num_tokens": 5057392.0, "step": 6276 }, { "epoch": 1.6626059322033897, "grad_norm": 1.3810817003250122, "learning_rate": 9.168829449152543e-06, "loss": 0.959, "mean_token_accuracy": 0.7795252203941345, "num_tokens": 5059092.0, "step": 6278 }, { "epoch": 1.663135593220339, "grad_norm": 1.5926824808120728, "learning_rate": 9.168564618644068e-06, "loss": 1.2265, "mean_token_accuracy": 0.7304432913661003, "num_tokens": 5060710.0, "step": 6280 }, { "epoch": 1.663665254237288, "grad_norm": 1.51869535446167, "learning_rate": 9.168299788135594e-06, "loss": 1.5775, "mean_token_accuracy": 0.6480984911322594, "num_tokens": 5062518.0, "step": 6282 }, { "epoch": 1.6641949152542372, "grad_norm": 1.653404712677002, "learning_rate": 9.16803495762712e-06, "loss": 1.4857, "mean_token_accuracy": 0.6551055610179901, "num_tokens": 5064108.0, "step": 6284 }, { "epoch": 1.6647245762711864, "grad_norm": 1.6465107202529907, "learning_rate": 9.167770127118644e-06, "loss": 1.0765, "mean_token_accuracy": 0.7335151061415672, "num_tokens": 5065651.0, "step": 6286 }, { "epoch": 1.6652542372881356, "grad_norm": 1.5453100204467773, "learning_rate": 9.16750529661017e-06, "loss": 1.1865, "mean_token_accuracy": 0.725636325776577, "num_tokens": 5067152.0, "step": 6288 }, { "epoch": 1.6657838983050848, "grad_norm": 1.6466673612594604, "learning_rate": 9.167240466101696e-06, "loss": 1.4456, "mean_token_accuracy": 0.6639216989278793, "num_tokens": 5068633.0, "step": 6290 }, { "epoch": 1.6663135593220337, "grad_norm": 1.4718761444091797, "learning_rate": 9.166975635593222e-06, "loss": 1.2396, "mean_token_accuracy": 0.7411638759076595, "num_tokens": 5070238.0, "step": 6292 }, { "epoch": 1.6668432203389831, "grad_norm": 1.7380297183990479, "learning_rate": 9.166710805084746e-06, "loss": 1.1546, "mean_token_accuracy": 0.7196813300251961, "num_tokens": 5072066.0, "step": 6294 }, { "epoch": 1.667372881355932, "grad_norm": 1.4274197816848755, "learning_rate": 9.166445974576272e-06, "loss": 1.2866, "mean_token_accuracy": 0.7104419022798538, "num_tokens": 5073564.0, "step": 6296 }, { "epoch": 1.6679025423728815, "grad_norm": 1.7582383155822754, "learning_rate": 9.166181144067797e-06, "loss": 1.4085, "mean_token_accuracy": 0.6817518621683121, "num_tokens": 5075240.0, "step": 6298 }, { "epoch": 1.6684322033898304, "grad_norm": 1.6273325681686401, "learning_rate": 9.165916313559324e-06, "loss": 1.2133, "mean_token_accuracy": 0.7023016512393951, "num_tokens": 5076851.0, "step": 6300 }, { "epoch": 1.6689618644067796, "grad_norm": 1.344359040260315, "learning_rate": 9.165651483050849e-06, "loss": 1.0814, "mean_token_accuracy": 0.7404837161302567, "num_tokens": 5078450.0, "step": 6302 }, { "epoch": 1.6694915254237288, "grad_norm": 1.526167392730713, "learning_rate": 9.165386652542374e-06, "loss": 1.5764, "mean_token_accuracy": 0.663090068846941, "num_tokens": 5080209.0, "step": 6304 }, { "epoch": 1.670021186440678, "grad_norm": 1.3842391967773438, "learning_rate": 9.165121822033899e-06, "loss": 1.1221, "mean_token_accuracy": 0.7380570620298386, "num_tokens": 5082073.0, "step": 6306 }, { "epoch": 1.6705508474576272, "grad_norm": 1.6817442178726196, "learning_rate": 9.164856991525425e-06, "loss": 1.6987, "mean_token_accuracy": 0.5995561331510544, "num_tokens": 5083698.0, "step": 6308 }, { "epoch": 1.6710805084745761, "grad_norm": 1.99541175365448, "learning_rate": 9.16459216101695e-06, "loss": 1.5621, "mean_token_accuracy": 0.6576838418841362, "num_tokens": 5085105.0, "step": 6310 }, { "epoch": 1.6716101694915255, "grad_norm": 1.8371161222457886, "learning_rate": 9.164327330508475e-06, "loss": 1.4425, "mean_token_accuracy": 0.6556607708334923, "num_tokens": 5086784.0, "step": 6312 }, { "epoch": 1.6721398305084745, "grad_norm": 1.7532458305358887, "learning_rate": 9.1640625e-06, "loss": 1.1625, "mean_token_accuracy": 0.7313051372766495, "num_tokens": 5088258.0, "step": 6314 }, { "epoch": 1.6726694915254239, "grad_norm": 2.1096601486206055, "learning_rate": 9.163797669491527e-06, "loss": 1.5333, "mean_token_accuracy": 0.6614001989364624, "num_tokens": 5089907.0, "step": 6316 }, { "epoch": 1.6731991525423728, "grad_norm": 1.9129221439361572, "learning_rate": 9.163532838983052e-06, "loss": 1.357, "mean_token_accuracy": 0.7038718611001968, "num_tokens": 5091288.0, "step": 6318 }, { "epoch": 1.673728813559322, "grad_norm": 1.615119218826294, "learning_rate": 9.163268008474578e-06, "loss": 0.8906, "mean_token_accuracy": 0.7916830778121948, "num_tokens": 5092449.0, "step": 6320 }, { "epoch": 1.6742584745762712, "grad_norm": 1.8883930444717407, "learning_rate": 9.163003177966101e-06, "loss": 1.3526, "mean_token_accuracy": 0.6935962587594986, "num_tokens": 5093873.0, "step": 6322 }, { "epoch": 1.6747881355932204, "grad_norm": 2.41261887550354, "learning_rate": 9.162738347457628e-06, "loss": 1.0933, "mean_token_accuracy": 0.7378095760941505, "num_tokens": 5095250.0, "step": 6324 }, { "epoch": 1.6753177966101696, "grad_norm": 1.6070032119750977, "learning_rate": 9.162473516949153e-06, "loss": 1.0045, "mean_token_accuracy": 0.7521642744541168, "num_tokens": 5096704.0, "step": 6326 }, { "epoch": 1.6758474576271185, "grad_norm": 1.6415594816207886, "learning_rate": 9.16220868644068e-06, "loss": 1.0343, "mean_token_accuracy": 0.7600444182753563, "num_tokens": 5098056.0, "step": 6328 }, { "epoch": 1.676377118644068, "grad_norm": 1.640093445777893, "learning_rate": 9.161943855932204e-06, "loss": 1.1489, "mean_token_accuracy": 0.7331652417778969, "num_tokens": 5099379.0, "step": 6330 }, { "epoch": 1.6769067796610169, "grad_norm": 1.468959093093872, "learning_rate": 9.16167902542373e-06, "loss": 1.1432, "mean_token_accuracy": 0.7316413372755051, "num_tokens": 5100975.0, "step": 6332 }, { "epoch": 1.6774364406779663, "grad_norm": 2.0082268714904785, "learning_rate": 9.161414194915254e-06, "loss": 1.4114, "mean_token_accuracy": 0.6806704849004745, "num_tokens": 5102560.0, "step": 6334 }, { "epoch": 1.6779661016949152, "grad_norm": 1.6293283700942993, "learning_rate": 9.161149364406781e-06, "loss": 1.6726, "mean_token_accuracy": 0.6249843761324883, "num_tokens": 5104174.0, "step": 6336 }, { "epoch": 1.6784957627118644, "grad_norm": 1.552018165588379, "learning_rate": 9.160884533898306e-06, "loss": 1.3242, "mean_token_accuracy": 0.716795951128006, "num_tokens": 5105905.0, "step": 6338 }, { "epoch": 1.6790254237288136, "grad_norm": 1.6582765579223633, "learning_rate": 9.16061970338983e-06, "loss": 1.1979, "mean_token_accuracy": 0.7390137314796448, "num_tokens": 5107626.0, "step": 6340 }, { "epoch": 1.6795550847457628, "grad_norm": 1.6738113164901733, "learning_rate": 9.160354872881356e-06, "loss": 1.6097, "mean_token_accuracy": 0.6512332931160927, "num_tokens": 5109115.0, "step": 6342 }, { "epoch": 1.680084745762712, "grad_norm": 1.8642734289169312, "learning_rate": 9.160090042372882e-06, "loss": 1.3601, "mean_token_accuracy": 0.6973319053649902, "num_tokens": 5110814.0, "step": 6344 }, { "epoch": 1.680614406779661, "grad_norm": 1.9300627708435059, "learning_rate": 9.159825211864407e-06, "loss": 1.6224, "mean_token_accuracy": 0.6684745475649834, "num_tokens": 5112239.0, "step": 6346 }, { "epoch": 1.6811440677966103, "grad_norm": 1.4555013179779053, "learning_rate": 9.159560381355932e-06, "loss": 1.2932, "mean_token_accuracy": 0.7387852519750595, "num_tokens": 5113776.0, "step": 6348 }, { "epoch": 1.6816737288135593, "grad_norm": 1.7832679748535156, "learning_rate": 9.159295550847457e-06, "loss": 1.7229, "mean_token_accuracy": 0.6474725753068924, "num_tokens": 5115300.0, "step": 6350 }, { "epoch": 1.6822033898305084, "grad_norm": 1.4015339612960815, "learning_rate": 9.159030720338984e-06, "loss": 1.723, "mean_token_accuracy": 0.6192510724067688, "num_tokens": 5117392.0, "step": 6352 }, { "epoch": 1.6827330508474576, "grad_norm": 1.946135401725769, "learning_rate": 9.158765889830509e-06, "loss": 1.8315, "mean_token_accuracy": 0.6224934384226799, "num_tokens": 5118981.0, "step": 6354 }, { "epoch": 1.6832627118644068, "grad_norm": 1.3293668031692505, "learning_rate": 9.158501059322035e-06, "loss": 1.1544, "mean_token_accuracy": 0.7407304719090462, "num_tokens": 5120567.0, "step": 6356 }, { "epoch": 1.683792372881356, "grad_norm": 1.4945459365844727, "learning_rate": 9.15823622881356e-06, "loss": 1.5218, "mean_token_accuracy": 0.6679820045828819, "num_tokens": 5122509.0, "step": 6358 }, { "epoch": 1.684322033898305, "grad_norm": 1.621355414390564, "learning_rate": 9.157971398305085e-06, "loss": 1.1881, "mean_token_accuracy": 0.698062501847744, "num_tokens": 5124202.0, "step": 6360 }, { "epoch": 1.6848516949152543, "grad_norm": 1.7199068069458008, "learning_rate": 9.15770656779661e-06, "loss": 1.3362, "mean_token_accuracy": 0.6972900256514549, "num_tokens": 5125744.0, "step": 6362 }, { "epoch": 1.6853813559322033, "grad_norm": 1.6456167697906494, "learning_rate": 9.157441737288137e-06, "loss": 1.3732, "mean_token_accuracy": 0.689166970551014, "num_tokens": 5127338.0, "step": 6364 }, { "epoch": 1.6859110169491527, "grad_norm": 1.1990654468536377, "learning_rate": 9.157176906779662e-06, "loss": 1.1075, "mean_token_accuracy": 0.7385615408420563, "num_tokens": 5129295.0, "step": 6366 }, { "epoch": 1.6864406779661016, "grad_norm": 1.6357070207595825, "learning_rate": 9.156912076271187e-06, "loss": 1.4223, "mean_token_accuracy": 0.7038724049925804, "num_tokens": 5130891.0, "step": 6368 }, { "epoch": 1.6869703389830508, "grad_norm": 1.5743155479431152, "learning_rate": 9.156647245762713e-06, "loss": 1.0752, "mean_token_accuracy": 0.7210886478424072, "num_tokens": 5132380.0, "step": 6370 }, { "epoch": 1.6875, "grad_norm": 1.7338967323303223, "learning_rate": 9.156382415254238e-06, "loss": 1.3361, "mean_token_accuracy": 0.6758042126893997, "num_tokens": 5133857.0, "step": 6372 }, { "epoch": 1.6880296610169492, "grad_norm": 1.6129764318466187, "learning_rate": 9.156117584745765e-06, "loss": 1.1807, "mean_token_accuracy": 0.7218145877122879, "num_tokens": 5135198.0, "step": 6374 }, { "epoch": 1.6885593220338984, "grad_norm": 1.7642430067062378, "learning_rate": 9.155852754237288e-06, "loss": 1.4713, "mean_token_accuracy": 0.6817417480051517, "num_tokens": 5136796.0, "step": 6376 }, { "epoch": 1.6890889830508473, "grad_norm": 1.5326955318450928, "learning_rate": 9.155587923728815e-06, "loss": 1.0657, "mean_token_accuracy": 0.745066337287426, "num_tokens": 5138264.0, "step": 6378 }, { "epoch": 1.6896186440677967, "grad_norm": 1.8205727338790894, "learning_rate": 9.15532309322034e-06, "loss": 1.3697, "mean_token_accuracy": 0.7168178781867027, "num_tokens": 5139780.0, "step": 6380 }, { "epoch": 1.6901483050847457, "grad_norm": 1.743777871131897, "learning_rate": 9.155058262711866e-06, "loss": 1.883, "mean_token_accuracy": 0.5952965319156647, "num_tokens": 5141586.0, "step": 6382 }, { "epoch": 1.690677966101695, "grad_norm": 1.6061581373214722, "learning_rate": 9.154793432203391e-06, "loss": 1.5203, "mean_token_accuracy": 0.6692943647503853, "num_tokens": 5143130.0, "step": 6384 }, { "epoch": 1.691207627118644, "grad_norm": 1.2608697414398193, "learning_rate": 9.154528601694916e-06, "loss": 0.9602, "mean_token_accuracy": 0.7591960728168488, "num_tokens": 5144781.0, "step": 6386 }, { "epoch": 1.6917372881355932, "grad_norm": 1.9201768636703491, "learning_rate": 9.154263771186441e-06, "loss": 1.1727, "mean_token_accuracy": 0.73624137789011, "num_tokens": 5146207.0, "step": 6388 }, { "epoch": 1.6922669491525424, "grad_norm": 1.4540883302688599, "learning_rate": 9.153998940677968e-06, "loss": 1.1529, "mean_token_accuracy": 0.7509015947580338, "num_tokens": 5147864.0, "step": 6390 }, { "epoch": 1.6927966101694916, "grad_norm": 1.5543841123580933, "learning_rate": 9.153734110169493e-06, "loss": 1.6491, "mean_token_accuracy": 0.662809744477272, "num_tokens": 5149496.0, "step": 6392 }, { "epoch": 1.6933262711864407, "grad_norm": 1.363770604133606, "learning_rate": 9.153469279661017e-06, "loss": 1.1002, "mean_token_accuracy": 0.7310436517000198, "num_tokens": 5150936.0, "step": 6394 }, { "epoch": 1.6938559322033897, "grad_norm": 1.8249114751815796, "learning_rate": 9.153204449152542e-06, "loss": 1.3537, "mean_token_accuracy": 0.6944941729307175, "num_tokens": 5152521.0, "step": 6396 }, { "epoch": 1.694385593220339, "grad_norm": 1.4496939182281494, "learning_rate": 9.152939618644069e-06, "loss": 1.1938, "mean_token_accuracy": 0.7302176430821419, "num_tokens": 5153959.0, "step": 6398 }, { "epoch": 1.694915254237288, "grad_norm": 1.4289312362670898, "learning_rate": 9.152674788135594e-06, "loss": 1.1991, "mean_token_accuracy": 0.7251539751887321, "num_tokens": 5155415.0, "step": 6400 }, { "epoch": 1.6954449152542372, "grad_norm": 1.2265160083770752, "learning_rate": 9.152409957627119e-06, "loss": 0.8683, "mean_token_accuracy": 0.7750329002737999, "num_tokens": 5156820.0, "step": 6402 }, { "epoch": 1.6959745762711864, "grad_norm": 1.7042877674102783, "learning_rate": 9.152145127118644e-06, "loss": 1.4549, "mean_token_accuracy": 0.6842729896306992, "num_tokens": 5158433.0, "step": 6404 }, { "epoch": 1.6965042372881356, "grad_norm": 1.766451358795166, "learning_rate": 9.15188029661017e-06, "loss": 1.5477, "mean_token_accuracy": 0.6637315154075623, "num_tokens": 5159969.0, "step": 6406 }, { "epoch": 1.6970338983050848, "grad_norm": 1.486629605293274, "learning_rate": 9.151615466101695e-06, "loss": 1.2182, "mean_token_accuracy": 0.7105119451880455, "num_tokens": 5161627.0, "step": 6408 }, { "epoch": 1.6975635593220337, "grad_norm": 2.0521702766418457, "learning_rate": 9.151350635593222e-06, "loss": 1.8341, "mean_token_accuracy": 0.6348080188035965, "num_tokens": 5163180.0, "step": 6410 }, { "epoch": 1.6980932203389831, "grad_norm": 1.9089546203613281, "learning_rate": 9.151085805084747e-06, "loss": 1.3184, "mean_token_accuracy": 0.7055917009711266, "num_tokens": 5164937.0, "step": 6412 }, { "epoch": 1.698622881355932, "grad_norm": 1.6566267013549805, "learning_rate": 9.150820974576272e-06, "loss": 1.5205, "mean_token_accuracy": 0.6776097789406776, "num_tokens": 5166790.0, "step": 6414 }, { "epoch": 1.6991525423728815, "grad_norm": 1.7993276119232178, "learning_rate": 9.150556144067797e-06, "loss": 1.5658, "mean_token_accuracy": 0.6704016774892807, "num_tokens": 5168258.0, "step": 6416 }, { "epoch": 1.6996822033898304, "grad_norm": 1.6610536575317383, "learning_rate": 9.150291313559323e-06, "loss": 1.4732, "mean_token_accuracy": 0.680350910872221, "num_tokens": 5169732.0, "step": 6418 }, { "epoch": 1.7002118644067796, "grad_norm": 1.548911452293396, "learning_rate": 9.150026483050848e-06, "loss": 1.5614, "mean_token_accuracy": 0.6409818232059479, "num_tokens": 5171292.0, "step": 6420 }, { "epoch": 1.7007415254237288, "grad_norm": 1.7324919700622559, "learning_rate": 9.149761652542373e-06, "loss": 1.6513, "mean_token_accuracy": 0.6147007495164871, "num_tokens": 5172910.0, "step": 6422 }, { "epoch": 1.701271186440678, "grad_norm": 1.4374698400497437, "learning_rate": 9.149496822033898e-06, "loss": 1.2477, "mean_token_accuracy": 0.7390192821621895, "num_tokens": 5174445.0, "step": 6424 }, { "epoch": 1.7018008474576272, "grad_norm": 1.345784306526184, "learning_rate": 9.149231991525425e-06, "loss": 1.0083, "mean_token_accuracy": 0.7620126605033875, "num_tokens": 5175899.0, "step": 6426 }, { "epoch": 1.7023305084745761, "grad_norm": 1.4510401487350464, "learning_rate": 9.14896716101695e-06, "loss": 1.4355, "mean_token_accuracy": 0.7054065018892288, "num_tokens": 5177716.0, "step": 6428 }, { "epoch": 1.7028601694915255, "grad_norm": 1.2212623357772827, "learning_rate": 9.148702330508475e-06, "loss": 1.1114, "mean_token_accuracy": 0.7524727582931519, "num_tokens": 5179147.0, "step": 6430 }, { "epoch": 1.7033898305084745, "grad_norm": 1.440169095993042, "learning_rate": 9.1484375e-06, "loss": 1.4502, "mean_token_accuracy": 0.6696625128388405, "num_tokens": 5180954.0, "step": 6432 }, { "epoch": 1.7039194915254239, "grad_norm": 1.5193430185317993, "learning_rate": 9.148172669491526e-06, "loss": 1.2412, "mean_token_accuracy": 0.716497965157032, "num_tokens": 5182405.0, "step": 6434 }, { "epoch": 1.7044491525423728, "grad_norm": 1.6839165687561035, "learning_rate": 9.147907838983051e-06, "loss": 1.2758, "mean_token_accuracy": 0.7244735062122345, "num_tokens": 5184059.0, "step": 6436 }, { "epoch": 1.704978813559322, "grad_norm": 2.075375556945801, "learning_rate": 9.147643008474578e-06, "loss": 1.6832, "mean_token_accuracy": 0.6261340118944645, "num_tokens": 5185647.0, "step": 6438 }, { "epoch": 1.7055084745762712, "grad_norm": 1.681449055671692, "learning_rate": 9.147378177966103e-06, "loss": 1.0648, "mean_token_accuracy": 0.7444479614496231, "num_tokens": 5187231.0, "step": 6440 }, { "epoch": 1.7060381355932204, "grad_norm": 1.2840687036514282, "learning_rate": 9.147113347457628e-06, "loss": 1.3739, "mean_token_accuracy": 0.6729383915662766, "num_tokens": 5188830.0, "step": 6442 }, { "epoch": 1.7065677966101696, "grad_norm": 1.5874871015548706, "learning_rate": 9.146848516949153e-06, "loss": 1.4122, "mean_token_accuracy": 0.6751940846443176, "num_tokens": 5190870.0, "step": 6444 }, { "epoch": 1.7070974576271185, "grad_norm": 1.8303149938583374, "learning_rate": 9.14658368644068e-06, "loss": 1.1977, "mean_token_accuracy": 0.7329652085900307, "num_tokens": 5192078.0, "step": 6446 }, { "epoch": 1.707627118644068, "grad_norm": 1.7567757368087769, "learning_rate": 9.146318855932204e-06, "loss": 1.4983, "mean_token_accuracy": 0.6536573767662048, "num_tokens": 5193894.0, "step": 6448 }, { "epoch": 1.7081567796610169, "grad_norm": 2.0094258785247803, "learning_rate": 9.146054025423729e-06, "loss": 1.4213, "mean_token_accuracy": 0.6897540464997292, "num_tokens": 5195287.0, "step": 6450 }, { "epoch": 1.7086864406779663, "grad_norm": 1.2553528547286987, "learning_rate": 9.145789194915256e-06, "loss": 1.1038, "mean_token_accuracy": 0.7428829222917557, "num_tokens": 5196747.0, "step": 6452 }, { "epoch": 1.7092161016949152, "grad_norm": 1.5758172273635864, "learning_rate": 9.14552436440678e-06, "loss": 1.3608, "mean_token_accuracy": 0.7056540548801422, "num_tokens": 5198087.0, "step": 6454 }, { "epoch": 1.7097457627118644, "grad_norm": 1.768619418144226, "learning_rate": 9.145259533898306e-06, "loss": 1.4917, "mean_token_accuracy": 0.6774669364094734, "num_tokens": 5199726.0, "step": 6456 }, { "epoch": 1.7102754237288136, "grad_norm": 1.3126640319824219, "learning_rate": 9.14499470338983e-06, "loss": 1.4607, "mean_token_accuracy": 0.6921239569783211, "num_tokens": 5201387.0, "step": 6458 }, { "epoch": 1.7108050847457628, "grad_norm": 1.9475204944610596, "learning_rate": 9.144729872881357e-06, "loss": 1.3146, "mean_token_accuracy": 0.7240774482488632, "num_tokens": 5202922.0, "step": 6460 }, { "epoch": 1.711334745762712, "grad_norm": 1.7078526020050049, "learning_rate": 9.144465042372882e-06, "loss": 1.5769, "mean_token_accuracy": 0.6728699803352356, "num_tokens": 5204638.0, "step": 6462 }, { "epoch": 1.711864406779661, "grad_norm": 1.5295599699020386, "learning_rate": 9.144200211864409e-06, "loss": 1.383, "mean_token_accuracy": 0.6943216696381569, "num_tokens": 5206294.0, "step": 6464 }, { "epoch": 1.7123940677966103, "grad_norm": 1.2786014080047607, "learning_rate": 9.143935381355934e-06, "loss": 1.3792, "mean_token_accuracy": 0.6965581178665161, "num_tokens": 5208146.0, "step": 6466 }, { "epoch": 1.7129237288135593, "grad_norm": 1.6866772174835205, "learning_rate": 9.143670550847458e-06, "loss": 1.4374, "mean_token_accuracy": 0.6874054670333862, "num_tokens": 5209621.0, "step": 6468 }, { "epoch": 1.7134533898305084, "grad_norm": 1.4672585725784302, "learning_rate": 9.143405720338983e-06, "loss": 1.0887, "mean_token_accuracy": 0.7325920164585114, "num_tokens": 5211208.0, "step": 6470 }, { "epoch": 1.7139830508474576, "grad_norm": 1.3343753814697266, "learning_rate": 9.14314088983051e-06, "loss": 0.9747, "mean_token_accuracy": 0.7614685371518135, "num_tokens": 5212777.0, "step": 6472 }, { "epoch": 1.7145127118644068, "grad_norm": 1.3468748331069946, "learning_rate": 9.142876059322035e-06, "loss": 0.829, "mean_token_accuracy": 0.7826456129550934, "num_tokens": 5214301.0, "step": 6474 }, { "epoch": 1.715042372881356, "grad_norm": 1.5987522602081299, "learning_rate": 9.14261122881356e-06, "loss": 1.2692, "mean_token_accuracy": 0.7391042560338974, "num_tokens": 5215795.0, "step": 6476 }, { "epoch": 1.715572033898305, "grad_norm": 1.7283035516738892, "learning_rate": 9.142346398305085e-06, "loss": 1.5099, "mean_token_accuracy": 0.6352418512105942, "num_tokens": 5217450.0, "step": 6478 }, { "epoch": 1.7161016949152543, "grad_norm": 1.5121506452560425, "learning_rate": 9.142081567796611e-06, "loss": 1.0731, "mean_token_accuracy": 0.7460025325417519, "num_tokens": 5218951.0, "step": 6480 }, { "epoch": 1.7166313559322033, "grad_norm": 1.7908843755722046, "learning_rate": 9.141816737288136e-06, "loss": 1.3697, "mean_token_accuracy": 0.6763341650366783, "num_tokens": 5220590.0, "step": 6482 }, { "epoch": 1.7171610169491527, "grad_norm": 1.8785672187805176, "learning_rate": 9.141551906779661e-06, "loss": 1.2049, "mean_token_accuracy": 0.7151436284184456, "num_tokens": 5221946.0, "step": 6484 }, { "epoch": 1.7176906779661016, "grad_norm": 1.7683089971542358, "learning_rate": 9.141287076271186e-06, "loss": 1.5268, "mean_token_accuracy": 0.645138967782259, "num_tokens": 5223411.0, "step": 6486 }, { "epoch": 1.7182203389830508, "grad_norm": 1.3192347288131714, "learning_rate": 9.141022245762713e-06, "loss": 0.7215, "mean_token_accuracy": 0.7988718152046204, "num_tokens": 5224985.0, "step": 6488 }, { "epoch": 1.71875, "grad_norm": 1.9743950366973877, "learning_rate": 9.140757415254238e-06, "loss": 1.5438, "mean_token_accuracy": 0.6595615074038506, "num_tokens": 5226426.0, "step": 6490 }, { "epoch": 1.7192796610169492, "grad_norm": 1.8837093114852905, "learning_rate": 9.140492584745764e-06, "loss": 1.1381, "mean_token_accuracy": 0.7321324124932289, "num_tokens": 5227788.0, "step": 6492 }, { "epoch": 1.7198093220338984, "grad_norm": 1.9704614877700806, "learning_rate": 9.14022775423729e-06, "loss": 1.4919, "mean_token_accuracy": 0.664509080350399, "num_tokens": 5229282.0, "step": 6494 }, { "epoch": 1.7203389830508473, "grad_norm": 1.7172123193740845, "learning_rate": 9.139962923728814e-06, "loss": 1.2705, "mean_token_accuracy": 0.719779334962368, "num_tokens": 5230842.0, "step": 6496 }, { "epoch": 1.7208686440677967, "grad_norm": 1.809144377708435, "learning_rate": 9.13969809322034e-06, "loss": 1.5381, "mean_token_accuracy": 0.6412015184760094, "num_tokens": 5232422.0, "step": 6498 }, { "epoch": 1.7213983050847457, "grad_norm": 1.7315216064453125, "learning_rate": 9.139433262711866e-06, "loss": 1.2876, "step": 6500 }, { "epoch": 1.7213983050847457, "eval_loss": 1.3145488500595093, "eval_mean_token_accuracy": 0.6996550571608853, "eval_num_tokens": 5234026.0, "eval_runtime": 48.4845, "eval_samples_per_second": 6.353, "eval_steps_per_second": 6.353, "step": 6500 }, { "epoch": 1.721927966101695, "grad_norm": 1.5957714319229126, "learning_rate": 9.13916843220339e-06, "loss": 1.7833, "mean_token_accuracy": 0.67134053632617, "num_tokens": 5236069.0, "step": 6502 }, { "epoch": 1.722457627118644, "grad_norm": 1.4892696142196655, "learning_rate": 9.138903601694916e-06, "loss": 1.3629, "mean_token_accuracy": 0.6811873689293861, "num_tokens": 5237705.0, "step": 6504 }, { "epoch": 1.7229872881355932, "grad_norm": 1.8466527462005615, "learning_rate": 9.13863877118644e-06, "loss": 1.334, "mean_token_accuracy": 0.6944746486842632, "num_tokens": 5239316.0, "step": 6506 }, { "epoch": 1.7235169491525424, "grad_norm": 1.8691833019256592, "learning_rate": 9.138373940677967e-06, "loss": 1.6788, "mean_token_accuracy": 0.6496964544057846, "num_tokens": 5240728.0, "step": 6508 }, { "epoch": 1.7240466101694916, "grad_norm": 1.5062764883041382, "learning_rate": 9.138109110169492e-06, "loss": 1.375, "mean_token_accuracy": 0.6974556222558022, "num_tokens": 5242194.0, "step": 6510 }, { "epoch": 1.7245762711864407, "grad_norm": 1.8472851514816284, "learning_rate": 9.137844279661017e-06, "loss": 1.2141, "mean_token_accuracy": 0.7229363322257996, "num_tokens": 5243501.0, "step": 6512 }, { "epoch": 1.7251059322033897, "grad_norm": 1.4096966981887817, "learning_rate": 9.137579449152542e-06, "loss": 1.2704, "mean_token_accuracy": 0.7297151796519756, "num_tokens": 5245090.0, "step": 6514 }, { "epoch": 1.725635593220339, "grad_norm": 1.4262975454330444, "learning_rate": 9.137314618644069e-06, "loss": 1.0323, "mean_token_accuracy": 0.7466704919934273, "num_tokens": 5246715.0, "step": 6516 }, { "epoch": 1.726165254237288, "grad_norm": 1.6956027746200562, "learning_rate": 9.137049788135594e-06, "loss": 1.2052, "mean_token_accuracy": 0.7162231132388115, "num_tokens": 5248404.0, "step": 6518 }, { "epoch": 1.7266949152542372, "grad_norm": 1.7528868913650513, "learning_rate": 9.13678495762712e-06, "loss": 1.7246, "mean_token_accuracy": 0.6335904449224472, "num_tokens": 5250125.0, "step": 6520 }, { "epoch": 1.7272245762711864, "grad_norm": 1.4423085451126099, "learning_rate": 9.136520127118645e-06, "loss": 1.1894, "mean_token_accuracy": 0.7178661525249481, "num_tokens": 5251499.0, "step": 6522 }, { "epoch": 1.7277542372881356, "grad_norm": 1.501230001449585, "learning_rate": 9.13625529661017e-06, "loss": 1.0556, "mean_token_accuracy": 0.7364345341920853, "num_tokens": 5252925.0, "step": 6524 }, { "epoch": 1.7282838983050848, "grad_norm": 1.6104414463043213, "learning_rate": 9.135990466101695e-06, "loss": 1.8122, "mean_token_accuracy": 0.6207946389913559, "num_tokens": 5254743.0, "step": 6526 }, { "epoch": 1.7288135593220337, "grad_norm": 2.0349714756011963, "learning_rate": 9.135725635593222e-06, "loss": 0.8004, "mean_token_accuracy": 0.7932778224349022, "num_tokens": 5256057.0, "step": 6528 }, { "epoch": 1.7293432203389831, "grad_norm": 1.4782304763793945, "learning_rate": 9.135460805084747e-06, "loss": 1.2067, "mean_token_accuracy": 0.7005766406655312, "num_tokens": 5257730.0, "step": 6530 }, { "epoch": 1.729872881355932, "grad_norm": 1.8579405546188354, "learning_rate": 9.135195974576271e-06, "loss": 1.6891, "mean_token_accuracy": 0.6267448142170906, "num_tokens": 5259176.0, "step": 6532 }, { "epoch": 1.7304025423728815, "grad_norm": 2.0027849674224854, "learning_rate": 9.134931144067798e-06, "loss": 1.4049, "mean_token_accuracy": 0.6649906486272812, "num_tokens": 5260592.0, "step": 6534 }, { "epoch": 1.7309322033898304, "grad_norm": 1.9164925813674927, "learning_rate": 9.134666313559323e-06, "loss": 1.3265, "mean_token_accuracy": 0.7082846909761429, "num_tokens": 5261846.0, "step": 6536 }, { "epoch": 1.7314618644067796, "grad_norm": 1.6051191091537476, "learning_rate": 9.134401483050848e-06, "loss": 1.6176, "mean_token_accuracy": 0.6540259122848511, "num_tokens": 5263376.0, "step": 6538 }, { "epoch": 1.7319915254237288, "grad_norm": 1.5977323055267334, "learning_rate": 9.134136652542373e-06, "loss": 1.0261, "mean_token_accuracy": 0.7692795246839523, "num_tokens": 5264917.0, "step": 6540 }, { "epoch": 1.732521186440678, "grad_norm": 1.4526331424713135, "learning_rate": 9.1338718220339e-06, "loss": 0.9385, "mean_token_accuracy": 0.7579108774662018, "num_tokens": 5266822.0, "step": 6542 }, { "epoch": 1.7330508474576272, "grad_norm": 1.4692010879516602, "learning_rate": 9.133606991525424e-06, "loss": 1.6008, "mean_token_accuracy": 0.6413181945681572, "num_tokens": 5268843.0, "step": 6544 }, { "epoch": 1.7335805084745761, "grad_norm": 1.3413536548614502, "learning_rate": 9.133342161016951e-06, "loss": 1.2338, "mean_token_accuracy": 0.7145485244691372, "num_tokens": 5270676.0, "step": 6546 }, { "epoch": 1.7341101694915255, "grad_norm": 1.872215747833252, "learning_rate": 9.133077330508476e-06, "loss": 1.2555, "mean_token_accuracy": 0.7142828777432442, "num_tokens": 5272081.0, "step": 6548 }, { "epoch": 1.7346398305084745, "grad_norm": 1.6313103437423706, "learning_rate": 9.132812500000001e-06, "loss": 1.1418, "mean_token_accuracy": 0.7210762873291969, "num_tokens": 5273481.0, "step": 6550 }, { "epoch": 1.7351694915254239, "grad_norm": 1.7016957998275757, "learning_rate": 9.132547669491526e-06, "loss": 1.5453, "mean_token_accuracy": 0.6604606881737709, "num_tokens": 5275178.0, "step": 6552 }, { "epoch": 1.7356991525423728, "grad_norm": 1.7853398323059082, "learning_rate": 9.132282838983052e-06, "loss": 1.3445, "mean_token_accuracy": 0.6979877129197121, "num_tokens": 5276778.0, "step": 6554 }, { "epoch": 1.736228813559322, "grad_norm": 1.2333176136016846, "learning_rate": 9.132018008474577e-06, "loss": 0.8926, "mean_token_accuracy": 0.7850839272141457, "num_tokens": 5278593.0, "step": 6556 }, { "epoch": 1.7367584745762712, "grad_norm": 1.4583828449249268, "learning_rate": 9.131753177966102e-06, "loss": 1.1189, "mean_token_accuracy": 0.7490474581718445, "num_tokens": 5280345.0, "step": 6558 }, { "epoch": 1.7372881355932204, "grad_norm": 2.0664634704589844, "learning_rate": 9.131488347457627e-06, "loss": 1.3929, "mean_token_accuracy": 0.7166487947106361, "num_tokens": 5281645.0, "step": 6560 }, { "epoch": 1.7378177966101696, "grad_norm": 1.288979172706604, "learning_rate": 9.131223516949154e-06, "loss": 1.2166, "mean_token_accuracy": 0.712526723742485, "num_tokens": 5283475.0, "step": 6562 }, { "epoch": 1.7383474576271185, "grad_norm": 1.6957474946975708, "learning_rate": 9.130958686440679e-06, "loss": 1.5757, "mean_token_accuracy": 0.685169655829668, "num_tokens": 5285109.0, "step": 6564 }, { "epoch": 1.738877118644068, "grad_norm": 1.5835936069488525, "learning_rate": 9.130693855932204e-06, "loss": 1.3447, "mean_token_accuracy": 0.7085881009697914, "num_tokens": 5286655.0, "step": 6566 }, { "epoch": 1.7394067796610169, "grad_norm": 1.7037829160690308, "learning_rate": 9.130429025423729e-06, "loss": 0.9644, "mean_token_accuracy": 0.7637228891253471, "num_tokens": 5288185.0, "step": 6568 }, { "epoch": 1.7399364406779663, "grad_norm": 1.8686202764511108, "learning_rate": 9.130164194915255e-06, "loss": 1.4081, "mean_token_accuracy": 0.6964668333530426, "num_tokens": 5289456.0, "step": 6570 }, { "epoch": 1.7404661016949152, "grad_norm": 1.366650938987732, "learning_rate": 9.12989936440678e-06, "loss": 1.0785, "mean_token_accuracy": 0.7310544885694981, "num_tokens": 5291154.0, "step": 6572 }, { "epoch": 1.7409957627118644, "grad_norm": 1.533363938331604, "learning_rate": 9.129634533898307e-06, "loss": 1.616, "mean_token_accuracy": 0.644300252199173, "num_tokens": 5292850.0, "step": 6574 }, { "epoch": 1.7415254237288136, "grad_norm": 1.1191370487213135, "learning_rate": 9.129369703389832e-06, "loss": 1.2968, "mean_token_accuracy": 0.7103104442358017, "num_tokens": 5294644.0, "step": 6576 }, { "epoch": 1.7420550847457628, "grad_norm": 1.5050806999206543, "learning_rate": 9.129104872881357e-06, "loss": 1.2153, "mean_token_accuracy": 0.7350491732358932, "num_tokens": 5296131.0, "step": 6578 }, { "epoch": 1.742584745762712, "grad_norm": 1.798883318901062, "learning_rate": 9.128840042372882e-06, "loss": 1.775, "mean_token_accuracy": 0.636204220354557, "num_tokens": 5297668.0, "step": 6580 }, { "epoch": 1.743114406779661, "grad_norm": 2.038222551345825, "learning_rate": 9.128575211864408e-06, "loss": 1.3055, "mean_token_accuracy": 0.7037275061011314, "num_tokens": 5298980.0, "step": 6582 }, { "epoch": 1.7436440677966103, "grad_norm": 1.4914782047271729, "learning_rate": 9.128310381355933e-06, "loss": 1.1539, "mean_token_accuracy": 0.7243223264813423, "num_tokens": 5300612.0, "step": 6584 }, { "epoch": 1.7441737288135593, "grad_norm": 1.8350683450698853, "learning_rate": 9.128045550847458e-06, "loss": 1.7368, "mean_token_accuracy": 0.6082389242947102, "num_tokens": 5302277.0, "step": 6586 }, { "epoch": 1.7447033898305084, "grad_norm": 1.4680176973342896, "learning_rate": 9.127780720338983e-06, "loss": 1.2076, "mean_token_accuracy": 0.6982128471136093, "num_tokens": 5303913.0, "step": 6588 }, { "epoch": 1.7452330508474576, "grad_norm": 2.090395927429199, "learning_rate": 9.12751588983051e-06, "loss": 1.3228, "mean_token_accuracy": 0.7038610652089119, "num_tokens": 5305542.0, "step": 6590 }, { "epoch": 1.7457627118644068, "grad_norm": 1.6252754926681519, "learning_rate": 9.127251059322035e-06, "loss": 1.1628, "mean_token_accuracy": 0.7143980041146278, "num_tokens": 5307156.0, "step": 6592 }, { "epoch": 1.746292372881356, "grad_norm": 1.3874636888504028, "learning_rate": 9.12698622881356e-06, "loss": 1.4655, "mean_token_accuracy": 0.6786540970206261, "num_tokens": 5308895.0, "step": 6594 }, { "epoch": 1.746822033898305, "grad_norm": 1.63796865940094, "learning_rate": 9.126721398305084e-06, "loss": 1.7092, "mean_token_accuracy": 0.6272636204957962, "num_tokens": 5310622.0, "step": 6596 }, { "epoch": 1.7473516949152543, "grad_norm": 2.2133922576904297, "learning_rate": 9.126456567796611e-06, "loss": 1.5967, "mean_token_accuracy": 0.6513154059648514, "num_tokens": 5312114.0, "step": 6598 }, { "epoch": 1.7478813559322033, "grad_norm": 1.7841613292694092, "learning_rate": 9.126191737288136e-06, "loss": 1.2918, "mean_token_accuracy": 0.7083096951246262, "num_tokens": 5313698.0, "step": 6600 }, { "epoch": 1.7484110169491527, "grad_norm": 1.751270055770874, "learning_rate": 9.125926906779663e-06, "loss": 1.6374, "mean_token_accuracy": 0.6235169172286987, "num_tokens": 5315345.0, "step": 6602 }, { "epoch": 1.7489406779661016, "grad_norm": 1.5881739854812622, "learning_rate": 9.125662076271188e-06, "loss": 1.0601, "mean_token_accuracy": 0.7501050382852554, "num_tokens": 5316893.0, "step": 6604 }, { "epoch": 1.7494703389830508, "grad_norm": 1.8423584699630737, "learning_rate": 9.125397245762712e-06, "loss": 1.4968, "mean_token_accuracy": 0.6790401302278042, "num_tokens": 5318593.0, "step": 6606 }, { "epoch": 1.75, "grad_norm": 1.4802095890045166, "learning_rate": 9.125132415254237e-06, "loss": 1.0881, "mean_token_accuracy": 0.74635149538517, "num_tokens": 5320202.0, "step": 6608 }, { "epoch": 1.7505296610169492, "grad_norm": 1.7278388738632202, "learning_rate": 9.124867584745764e-06, "loss": 1.5886, "mean_token_accuracy": 0.663570299744606, "num_tokens": 5321787.0, "step": 6610 }, { "epoch": 1.7510593220338984, "grad_norm": 2.011455535888672, "learning_rate": 9.124602754237289e-06, "loss": 1.3392, "mean_token_accuracy": 0.6987279728055, "num_tokens": 5323097.0, "step": 6612 }, { "epoch": 1.7515889830508473, "grad_norm": 1.6778887510299683, "learning_rate": 9.124337923728814e-06, "loss": 1.5721, "mean_token_accuracy": 0.6498831063508987, "num_tokens": 5324542.0, "step": 6614 }, { "epoch": 1.7521186440677967, "grad_norm": 1.2701293230056763, "learning_rate": 9.124073093220339e-06, "loss": 1.276, "mean_token_accuracy": 0.7341665476560593, "num_tokens": 5326251.0, "step": 6616 }, { "epoch": 1.7526483050847457, "grad_norm": 1.4459308385849, "learning_rate": 9.123808262711865e-06, "loss": 1.2945, "mean_token_accuracy": 0.7075933888554573, "num_tokens": 5328139.0, "step": 6618 }, { "epoch": 1.753177966101695, "grad_norm": 1.5181105136871338, "learning_rate": 9.12354343220339e-06, "loss": 1.3437, "mean_token_accuracy": 0.7132008075714111, "num_tokens": 5329573.0, "step": 6620 }, { "epoch": 1.753707627118644, "grad_norm": 1.487628698348999, "learning_rate": 9.123278601694915e-06, "loss": 1.4273, "mean_token_accuracy": 0.6846785098314285, "num_tokens": 5331548.0, "step": 6622 }, { "epoch": 1.7542372881355932, "grad_norm": 1.4962892532348633, "learning_rate": 9.123013771186442e-06, "loss": 1.4154, "mean_token_accuracy": 0.6709600314497948, "num_tokens": 5333154.0, "step": 6624 }, { "epoch": 1.7547669491525424, "grad_norm": 1.5248829126358032, "learning_rate": 9.122748940677967e-06, "loss": 1.275, "mean_token_accuracy": 0.70554418861866, "num_tokens": 5334676.0, "step": 6626 }, { "epoch": 1.7552966101694916, "grad_norm": 1.5996971130371094, "learning_rate": 9.122484110169493e-06, "loss": 1.6404, "mean_token_accuracy": 0.6347054615616798, "num_tokens": 5336262.0, "step": 6628 }, { "epoch": 1.7558262711864407, "grad_norm": 1.5509281158447266, "learning_rate": 9.122219279661018e-06, "loss": 1.1053, "mean_token_accuracy": 0.7532645240426064, "num_tokens": 5337797.0, "step": 6630 }, { "epoch": 1.7563559322033897, "grad_norm": 1.4661204814910889, "learning_rate": 9.121954449152543e-06, "loss": 1.4455, "mean_token_accuracy": 0.6795387417078018, "num_tokens": 5339383.0, "step": 6632 }, { "epoch": 1.756885593220339, "grad_norm": 1.5808124542236328, "learning_rate": 9.121689618644068e-06, "loss": 1.3381, "mean_token_accuracy": 0.6955201104283333, "num_tokens": 5340777.0, "step": 6634 }, { "epoch": 1.757415254237288, "grad_norm": 1.603412389755249, "learning_rate": 9.121424788135595e-06, "loss": 1.3273, "mean_token_accuracy": 0.7108740210533142, "num_tokens": 5342269.0, "step": 6636 }, { "epoch": 1.7579449152542372, "grad_norm": 1.5467925071716309, "learning_rate": 9.12115995762712e-06, "loss": 1.2379, "mean_token_accuracy": 0.7082177326083183, "num_tokens": 5343842.0, "step": 6638 }, { "epoch": 1.7584745762711864, "grad_norm": 1.5502066612243652, "learning_rate": 9.120895127118645e-06, "loss": 1.6903, "mean_token_accuracy": 0.6252474710345268, "num_tokens": 5345536.0, "step": 6640 }, { "epoch": 1.7590042372881356, "grad_norm": 1.4080201387405396, "learning_rate": 9.12063029661017e-06, "loss": 0.9257, "mean_token_accuracy": 0.7773787081241608, "num_tokens": 5347061.0, "step": 6642 }, { "epoch": 1.7595338983050848, "grad_norm": 1.4805519580841064, "learning_rate": 9.120365466101696e-06, "loss": 1.2519, "mean_token_accuracy": 0.7237170040607452, "num_tokens": 5348809.0, "step": 6644 }, { "epoch": 1.7600635593220337, "grad_norm": 1.7599672079086304, "learning_rate": 9.120100635593221e-06, "loss": 1.6421, "mean_token_accuracy": 0.6445893160998821, "num_tokens": 5350230.0, "step": 6646 }, { "epoch": 1.7605932203389831, "grad_norm": 1.7904952764511108, "learning_rate": 9.119835805084746e-06, "loss": 1.5686, "mean_token_accuracy": 0.6340363100171089, "num_tokens": 5351695.0, "step": 6648 }, { "epoch": 1.761122881355932, "grad_norm": 1.5914113521575928, "learning_rate": 9.119570974576271e-06, "loss": 1.2422, "mean_token_accuracy": 0.7250097170472145, "num_tokens": 5353378.0, "step": 6650 }, { "epoch": 1.7616525423728815, "grad_norm": 1.6159688234329224, "learning_rate": 9.119306144067798e-06, "loss": 1.3465, "mean_token_accuracy": 0.6773818247020245, "num_tokens": 5354725.0, "step": 6652 }, { "epoch": 1.7621822033898304, "grad_norm": 1.6492167711257935, "learning_rate": 9.119041313559323e-06, "loss": 1.3636, "mean_token_accuracy": 0.6829788908362389, "num_tokens": 5356241.0, "step": 6654 }, { "epoch": 1.7627118644067796, "grad_norm": 1.5693795680999756, "learning_rate": 9.11877648305085e-06, "loss": 1.3141, "mean_token_accuracy": 0.6861448511481285, "num_tokens": 5358076.0, "step": 6656 }, { "epoch": 1.7632415254237288, "grad_norm": 1.3373041152954102, "learning_rate": 9.118511652542374e-06, "loss": 1.3784, "mean_token_accuracy": 0.685725923627615, "num_tokens": 5359709.0, "step": 6658 }, { "epoch": 1.763771186440678, "grad_norm": 1.9161853790283203, "learning_rate": 9.118246822033899e-06, "loss": 1.3211, "mean_token_accuracy": 0.7159452587366104, "num_tokens": 5361262.0, "step": 6660 }, { "epoch": 1.7643008474576272, "grad_norm": 1.757338523864746, "learning_rate": 9.117981991525424e-06, "loss": 1.0143, "mean_token_accuracy": 0.7787011750042439, "num_tokens": 5362725.0, "step": 6662 }, { "epoch": 1.7648305084745761, "grad_norm": 1.4727474451065063, "learning_rate": 9.11771716101695e-06, "loss": 1.096, "mean_token_accuracy": 0.7228538021445274, "num_tokens": 5364309.0, "step": 6664 }, { "epoch": 1.7653601694915255, "grad_norm": 1.3605406284332275, "learning_rate": 9.117452330508476e-06, "loss": 1.0219, "mean_token_accuracy": 0.7373860701918602, "num_tokens": 5365924.0, "step": 6666 }, { "epoch": 1.7658898305084745, "grad_norm": 1.5074589252471924, "learning_rate": 9.1171875e-06, "loss": 0.9607, "mean_token_accuracy": 0.7903016358613968, "num_tokens": 5367448.0, "step": 6668 }, { "epoch": 1.7664194915254239, "grad_norm": 1.638421654701233, "learning_rate": 9.116922669491525e-06, "loss": 1.6047, "mean_token_accuracy": 0.6671560704708099, "num_tokens": 5369009.0, "step": 6670 }, { "epoch": 1.7669491525423728, "grad_norm": 1.6069450378417969, "learning_rate": 9.116657838983052e-06, "loss": 1.154, "mean_token_accuracy": 0.7255538403987885, "num_tokens": 5370346.0, "step": 6672 }, { "epoch": 1.767478813559322, "grad_norm": 1.5566257238388062, "learning_rate": 9.116393008474577e-06, "loss": 1.094, "mean_token_accuracy": 0.7316945716738701, "num_tokens": 5372172.0, "step": 6674 }, { "epoch": 1.7680084745762712, "grad_norm": 1.7604613304138184, "learning_rate": 9.116128177966102e-06, "loss": 1.2629, "mean_token_accuracy": 0.7059484198689461, "num_tokens": 5373859.0, "step": 6676 }, { "epoch": 1.7685381355932204, "grad_norm": 1.7776321172714233, "learning_rate": 9.115863347457627e-06, "loss": 1.501, "mean_token_accuracy": 0.6197832860052586, "num_tokens": 5376313.0, "step": 6678 }, { "epoch": 1.7690677966101696, "grad_norm": 1.3751558065414429, "learning_rate": 9.115598516949153e-06, "loss": 1.1369, "mean_token_accuracy": 0.7541599050164223, "num_tokens": 5377961.0, "step": 6680 }, { "epoch": 1.7695974576271185, "grad_norm": 1.3543044328689575, "learning_rate": 9.115333686440678e-06, "loss": 1.4697, "mean_token_accuracy": 0.6605760715901852, "num_tokens": 5379457.0, "step": 6682 }, { "epoch": 1.770127118644068, "grad_norm": 1.3708852529525757, "learning_rate": 9.115068855932205e-06, "loss": 0.9594, "mean_token_accuracy": 0.7681813687086105, "num_tokens": 5380883.0, "step": 6684 }, { "epoch": 1.7706567796610169, "grad_norm": 1.964186668395996, "learning_rate": 9.114804025423728e-06, "loss": 1.5153, "mean_token_accuracy": 0.6526407077908516, "num_tokens": 5382473.0, "step": 6686 }, { "epoch": 1.7711864406779663, "grad_norm": 1.6816563606262207, "learning_rate": 9.114539194915255e-06, "loss": 1.5231, "mean_token_accuracy": 0.6718948110938072, "num_tokens": 5383826.0, "step": 6688 }, { "epoch": 1.7717161016949152, "grad_norm": 2.15806245803833, "learning_rate": 9.11427436440678e-06, "loss": 1.3871, "mean_token_accuracy": 0.6827029660344124, "num_tokens": 5385229.0, "step": 6690 }, { "epoch": 1.7722457627118644, "grad_norm": 1.7925283908843994, "learning_rate": 9.114009533898306e-06, "loss": 1.4597, "mean_token_accuracy": 0.6549268960952759, "num_tokens": 5386963.0, "step": 6692 }, { "epoch": 1.7727754237288136, "grad_norm": 1.6188522577285767, "learning_rate": 9.113744703389831e-06, "loss": 1.2962, "mean_token_accuracy": 0.709490530192852, "num_tokens": 5388441.0, "step": 6694 }, { "epoch": 1.7733050847457628, "grad_norm": 1.6976715326309204, "learning_rate": 9.113479872881356e-06, "loss": 1.0537, "mean_token_accuracy": 0.7472002282738686, "num_tokens": 5389919.0, "step": 6696 }, { "epoch": 1.773834745762712, "grad_norm": 1.740755558013916, "learning_rate": 9.113215042372881e-06, "loss": 1.5859, "mean_token_accuracy": 0.6738530956208706, "num_tokens": 5391423.0, "step": 6698 }, { "epoch": 1.774364406779661, "grad_norm": 1.6152915954589844, "learning_rate": 9.112950211864408e-06, "loss": 1.2684, "mean_token_accuracy": 0.6979676932096481, "num_tokens": 5392806.0, "step": 6700 }, { "epoch": 1.7748940677966103, "grad_norm": 1.859000563621521, "learning_rate": 9.112685381355933e-06, "loss": 1.2482, "mean_token_accuracy": 0.7095822617411613, "num_tokens": 5394531.0, "step": 6702 }, { "epoch": 1.7754237288135593, "grad_norm": 1.2684253454208374, "learning_rate": 9.112420550847458e-06, "loss": 1.0916, "mean_token_accuracy": 0.73911552131176, "num_tokens": 5396455.0, "step": 6704 }, { "epoch": 1.7759533898305084, "grad_norm": 1.493996500968933, "learning_rate": 9.112155720338984e-06, "loss": 0.8891, "mean_token_accuracy": 0.7674756422638893, "num_tokens": 5397840.0, "step": 6706 }, { "epoch": 1.7764830508474576, "grad_norm": 1.7544721364974976, "learning_rate": 9.11189088983051e-06, "loss": 1.5629, "mean_token_accuracy": 0.6487449742853642, "num_tokens": 5399259.0, "step": 6708 }, { "epoch": 1.7770127118644068, "grad_norm": 1.8895455598831177, "learning_rate": 9.111626059322036e-06, "loss": 1.847, "mean_token_accuracy": 0.6321107856929302, "num_tokens": 5400857.0, "step": 6710 }, { "epoch": 1.777542372881356, "grad_norm": 1.6048123836517334, "learning_rate": 9.11136122881356e-06, "loss": 1.2654, "mean_token_accuracy": 0.7138217389583588, "num_tokens": 5402943.0, "step": 6712 }, { "epoch": 1.778072033898305, "grad_norm": 1.3511290550231934, "learning_rate": 9.111096398305086e-06, "loss": 0.8915, "mean_token_accuracy": 0.7677613943815231, "num_tokens": 5404585.0, "step": 6714 }, { "epoch": 1.7786016949152543, "grad_norm": 1.861132264137268, "learning_rate": 9.11083156779661e-06, "loss": 1.3177, "mean_token_accuracy": 0.7091440409421921, "num_tokens": 5405907.0, "step": 6716 }, { "epoch": 1.7791313559322033, "grad_norm": 1.0423259735107422, "learning_rate": 9.110566737288137e-06, "loss": 0.9488, "mean_token_accuracy": 0.7728962935507298, "num_tokens": 5407755.0, "step": 6718 }, { "epoch": 1.7796610169491527, "grad_norm": 1.66131591796875, "learning_rate": 9.110301906779662e-06, "loss": 1.2071, "mean_token_accuracy": 0.7295856848359108, "num_tokens": 5409020.0, "step": 6720 }, { "epoch": 1.7801906779661016, "grad_norm": 1.3884644508361816, "learning_rate": 9.110037076271187e-06, "loss": 1.3375, "mean_token_accuracy": 0.706524271517992, "num_tokens": 5410795.0, "step": 6722 }, { "epoch": 1.7807203389830508, "grad_norm": 1.6531813144683838, "learning_rate": 9.109772245762712e-06, "loss": 1.6772, "mean_token_accuracy": 0.6445883437991142, "num_tokens": 5412526.0, "step": 6724 }, { "epoch": 1.78125, "grad_norm": 1.7695708274841309, "learning_rate": 9.109507415254239e-06, "loss": 1.6701, "mean_token_accuracy": 0.6389095783233643, "num_tokens": 5414092.0, "step": 6726 }, { "epoch": 1.7817796610169492, "grad_norm": 1.8254107236862183, "learning_rate": 9.109242584745764e-06, "loss": 1.3097, "mean_token_accuracy": 0.7061361372470856, "num_tokens": 5415616.0, "step": 6728 }, { "epoch": 1.7823093220338984, "grad_norm": 1.5973920822143555, "learning_rate": 9.108977754237289e-06, "loss": 1.4835, "mean_token_accuracy": 0.6762059703469276, "num_tokens": 5417256.0, "step": 6730 }, { "epoch": 1.7828389830508473, "grad_norm": 1.6753172874450684, "learning_rate": 9.108712923728814e-06, "loss": 1.1022, "mean_token_accuracy": 0.7561335787177086, "num_tokens": 5418502.0, "step": 6732 }, { "epoch": 1.7833686440677967, "grad_norm": 1.5114736557006836, "learning_rate": 9.10844809322034e-06, "loss": 1.068, "mean_token_accuracy": 0.7412331476807594, "num_tokens": 5420253.0, "step": 6734 }, { "epoch": 1.7838983050847457, "grad_norm": 1.7852603197097778, "learning_rate": 9.108183262711865e-06, "loss": 1.612, "mean_token_accuracy": 0.686830647289753, "num_tokens": 5421738.0, "step": 6736 }, { "epoch": 1.784427966101695, "grad_norm": 1.458666443824768, "learning_rate": 9.107918432203392e-06, "loss": 0.8733, "mean_token_accuracy": 0.7826774939894676, "num_tokens": 5423108.0, "step": 6738 }, { "epoch": 1.784957627118644, "grad_norm": 1.9506155252456665, "learning_rate": 9.107653601694915e-06, "loss": 1.6156, "mean_token_accuracy": 0.6402513459324837, "num_tokens": 5424659.0, "step": 6740 }, { "epoch": 1.7854872881355932, "grad_norm": 1.3285083770751953, "learning_rate": 9.107388771186442e-06, "loss": 1.0838, "mean_token_accuracy": 0.7663859874010086, "num_tokens": 5426111.0, "step": 6742 }, { "epoch": 1.7860169491525424, "grad_norm": 1.5057487487792969, "learning_rate": 9.107123940677966e-06, "loss": 0.9825, "mean_token_accuracy": 0.7810887917876244, "num_tokens": 5427574.0, "step": 6744 }, { "epoch": 1.7865466101694916, "grad_norm": 1.6119983196258545, "learning_rate": 9.106859110169493e-06, "loss": 1.4043, "mean_token_accuracy": 0.6905403658747673, "num_tokens": 5429062.0, "step": 6746 }, { "epoch": 1.7870762711864407, "grad_norm": 1.7507388591766357, "learning_rate": 9.106594279661018e-06, "loss": 1.4568, "mean_token_accuracy": 0.6739524826407433, "num_tokens": 5430634.0, "step": 6748 }, { "epoch": 1.7876059322033897, "grad_norm": 1.6840229034423828, "learning_rate": 9.106329449152543e-06, "loss": 1.5295, "step": 6750 }, { "epoch": 1.7876059322033897, "eval_loss": 1.314324140548706, "eval_mean_token_accuracy": 0.7001062058783197, "eval_num_tokens": 5432441.0, "eval_runtime": 48.7991, "eval_samples_per_second": 6.312, "eval_steps_per_second": 6.312, "step": 6750 }, { "epoch": 1.788135593220339, "grad_norm": 1.6082825660705566, "learning_rate": 9.106064618644068e-06, "loss": 1.4568, "mean_token_accuracy": 0.6658873315900564, "num_tokens": 5434455.0, "step": 6752 }, { "epoch": 1.788665254237288, "grad_norm": 1.6136236190795898, "learning_rate": 9.105799788135595e-06, "loss": 1.142, "mean_token_accuracy": 0.7324178665876389, "num_tokens": 5436248.0, "step": 6754 }, { "epoch": 1.7891949152542372, "grad_norm": 1.9093068838119507, "learning_rate": 9.10553495762712e-06, "loss": 1.5227, "mean_token_accuracy": 0.6827840358018875, "num_tokens": 5437756.0, "step": 6756 }, { "epoch": 1.7897245762711864, "grad_norm": 1.797563076019287, "learning_rate": 9.105270127118644e-06, "loss": 1.189, "mean_token_accuracy": 0.7057675197720528, "num_tokens": 5439254.0, "step": 6758 }, { "epoch": 1.7902542372881356, "grad_norm": 1.488743543624878, "learning_rate": 9.10500529661017e-06, "loss": 1.5986, "mean_token_accuracy": 0.6603640019893646, "num_tokens": 5440880.0, "step": 6760 }, { "epoch": 1.7907838983050848, "grad_norm": 1.2791821956634521, "learning_rate": 9.104740466101696e-06, "loss": 1.1875, "mean_token_accuracy": 0.7169779762625694, "num_tokens": 5442579.0, "step": 6762 }, { "epoch": 1.7913135593220337, "grad_norm": 1.2718942165374756, "learning_rate": 9.104475635593221e-06, "loss": 1.12, "mean_token_accuracy": 0.7315373346209526, "num_tokens": 5444143.0, "step": 6764 }, { "epoch": 1.7918432203389831, "grad_norm": 1.4582569599151611, "learning_rate": 9.104210805084747e-06, "loss": 1.0892, "mean_token_accuracy": 0.7287270054221153, "num_tokens": 5445669.0, "step": 6766 }, { "epoch": 1.792372881355932, "grad_norm": 1.9190725088119507, "learning_rate": 9.10394597457627e-06, "loss": 1.6913, "mean_token_accuracy": 0.6475846841931343, "num_tokens": 5447153.0, "step": 6768 }, { "epoch": 1.7929025423728815, "grad_norm": 1.40975821018219, "learning_rate": 9.103681144067797e-06, "loss": 1.0995, "mean_token_accuracy": 0.7384446784853935, "num_tokens": 5448519.0, "step": 6770 }, { "epoch": 1.7934322033898304, "grad_norm": 1.3627787828445435, "learning_rate": 9.103416313559322e-06, "loss": 1.319, "mean_token_accuracy": 0.7017509117722511, "num_tokens": 5450089.0, "step": 6772 }, { "epoch": 1.7939618644067796, "grad_norm": 1.752833366394043, "learning_rate": 9.103151483050849e-06, "loss": 1.5562, "mean_token_accuracy": 0.6497797295451164, "num_tokens": 5451665.0, "step": 6774 }, { "epoch": 1.7944915254237288, "grad_norm": 1.567548394203186, "learning_rate": 9.102886652542374e-06, "loss": 1.3463, "mean_token_accuracy": 0.7123103067278862, "num_tokens": 5453259.0, "step": 6776 }, { "epoch": 1.795021186440678, "grad_norm": 1.3664226531982422, "learning_rate": 9.102621822033899e-06, "loss": 1.4493, "mean_token_accuracy": 0.6456907689571381, "num_tokens": 5455111.0, "step": 6778 }, { "epoch": 1.7955508474576272, "grad_norm": 1.425809383392334, "learning_rate": 9.102356991525424e-06, "loss": 1.0452, "mean_token_accuracy": 0.7492536827921867, "num_tokens": 5457108.0, "step": 6780 }, { "epoch": 1.7960805084745761, "grad_norm": 1.7160215377807617, "learning_rate": 9.10209216101695e-06, "loss": 1.3506, "mean_token_accuracy": 0.713304303586483, "num_tokens": 5458591.0, "step": 6782 }, { "epoch": 1.7966101694915255, "grad_norm": 1.921156883239746, "learning_rate": 9.101827330508475e-06, "loss": 1.5147, "mean_token_accuracy": 0.6679084748029709, "num_tokens": 5460181.0, "step": 6784 }, { "epoch": 1.7971398305084745, "grad_norm": 1.8386491537094116, "learning_rate": 9.1015625e-06, "loss": 1.5955, "mean_token_accuracy": 0.6719868332147598, "num_tokens": 5461576.0, "step": 6786 }, { "epoch": 1.7976694915254239, "grad_norm": 1.6808600425720215, "learning_rate": 9.101297669491527e-06, "loss": 1.6196, "mean_token_accuracy": 0.656184509396553, "num_tokens": 5463117.0, "step": 6788 }, { "epoch": 1.7981991525423728, "grad_norm": 1.506178379058838, "learning_rate": 9.101032838983052e-06, "loss": 1.3694, "mean_token_accuracy": 0.7055156901478767, "num_tokens": 5464533.0, "step": 6790 }, { "epoch": 1.798728813559322, "grad_norm": 1.5274920463562012, "learning_rate": 9.100768008474578e-06, "loss": 1.2434, "mean_token_accuracy": 0.7004460841417313, "num_tokens": 5466249.0, "step": 6792 }, { "epoch": 1.7992584745762712, "grad_norm": 1.5839866399765015, "learning_rate": 9.100503177966102e-06, "loss": 1.3424, "mean_token_accuracy": 0.7067376375198364, "num_tokens": 5467956.0, "step": 6794 }, { "epoch": 1.7997881355932204, "grad_norm": 1.7517722845077515, "learning_rate": 9.100238347457628e-06, "loss": 1.3863, "mean_token_accuracy": 0.6943300366401672, "num_tokens": 5469552.0, "step": 6796 }, { "epoch": 1.8003177966101696, "grad_norm": 1.8535970449447632, "learning_rate": 9.099973516949153e-06, "loss": 1.1882, "mean_token_accuracy": 0.7375738024711609, "num_tokens": 5470927.0, "step": 6798 }, { "epoch": 1.8008474576271185, "grad_norm": 1.545111894607544, "learning_rate": 9.09970868644068e-06, "loss": 1.3503, "mean_token_accuracy": 0.6880002096295357, "num_tokens": 5472600.0, "step": 6800 }, { "epoch": 1.801377118644068, "grad_norm": 1.5837780237197876, "learning_rate": 9.099443855932205e-06, "loss": 1.3524, "mean_token_accuracy": 0.7126856409013271, "num_tokens": 5474129.0, "step": 6802 }, { "epoch": 1.8019067796610169, "grad_norm": 1.6244382858276367, "learning_rate": 9.09917902542373e-06, "loss": 1.1479, "mean_token_accuracy": 0.7212411314249039, "num_tokens": 5475404.0, "step": 6804 }, { "epoch": 1.8024364406779663, "grad_norm": 1.6627758741378784, "learning_rate": 9.098914194915255e-06, "loss": 1.22, "mean_token_accuracy": 0.7112591713666916, "num_tokens": 5477039.0, "step": 6806 }, { "epoch": 1.8029661016949152, "grad_norm": 1.3707785606384277, "learning_rate": 9.098649364406781e-06, "loss": 1.1955, "mean_token_accuracy": 0.7211624532938004, "num_tokens": 5478659.0, "step": 6808 }, { "epoch": 1.8034957627118644, "grad_norm": 2.190652370452881, "learning_rate": 9.098384533898306e-06, "loss": 1.5007, "mean_token_accuracy": 0.7161206901073456, "num_tokens": 5479951.0, "step": 6810 }, { "epoch": 1.8040254237288136, "grad_norm": 1.872832179069519, "learning_rate": 9.098119703389831e-06, "loss": 1.4135, "mean_token_accuracy": 0.6882544681429863, "num_tokens": 5481602.0, "step": 6812 }, { "epoch": 1.8045550847457628, "grad_norm": 1.1572301387786865, "learning_rate": 9.097854872881356e-06, "loss": 1.4001, "mean_token_accuracy": 0.691037006676197, "num_tokens": 5483499.0, "step": 6814 }, { "epoch": 1.805084745762712, "grad_norm": 1.5539872646331787, "learning_rate": 9.097590042372883e-06, "loss": 1.1474, "mean_token_accuracy": 0.7404263466596603, "num_tokens": 5485078.0, "step": 6816 }, { "epoch": 1.805614406779661, "grad_norm": 1.5675475597381592, "learning_rate": 9.097325211864407e-06, "loss": 0.9598, "mean_token_accuracy": 0.7501399964094162, "num_tokens": 5486731.0, "step": 6818 }, { "epoch": 1.8061440677966103, "grad_norm": 1.947131872177124, "learning_rate": 9.097060381355934e-06, "loss": 1.6319, "mean_token_accuracy": 0.6403877511620522, "num_tokens": 5488338.0, "step": 6820 }, { "epoch": 1.8066737288135593, "grad_norm": 1.5492538213729858, "learning_rate": 9.096795550847457e-06, "loss": 1.3497, "mean_token_accuracy": 0.6953045688569546, "num_tokens": 5489891.0, "step": 6822 }, { "epoch": 1.8072033898305084, "grad_norm": 1.4045671224594116, "learning_rate": 9.096530720338984e-06, "loss": 1.3555, "mean_token_accuracy": 0.6979015842080116, "num_tokens": 5491476.0, "step": 6824 }, { "epoch": 1.8077330508474576, "grad_norm": 1.7316726446151733, "learning_rate": 9.096265889830509e-06, "loss": 1.6057, "mean_token_accuracy": 0.6432054676115513, "num_tokens": 5493244.0, "step": 6826 }, { "epoch": 1.8082627118644068, "grad_norm": 1.586249589920044, "learning_rate": 9.096001059322036e-06, "loss": 1.6234, "mean_token_accuracy": 0.6294875368475914, "num_tokens": 5495063.0, "step": 6828 }, { "epoch": 1.808792372881356, "grad_norm": 2.066178798675537, "learning_rate": 9.09573622881356e-06, "loss": 1.2786, "mean_token_accuracy": 0.7051237523555756, "num_tokens": 5496424.0, "step": 6830 }, { "epoch": 1.809322033898305, "grad_norm": 1.6757670640945435, "learning_rate": 9.095471398305085e-06, "loss": 1.5034, "mean_token_accuracy": 0.6432120278477669, "num_tokens": 5498073.0, "step": 6832 }, { "epoch": 1.8098516949152543, "grad_norm": 1.7773079872131348, "learning_rate": 9.09520656779661e-06, "loss": 1.4397, "mean_token_accuracy": 0.6850721016526222, "num_tokens": 5499502.0, "step": 6834 }, { "epoch": 1.8103813559322033, "grad_norm": 1.9305593967437744, "learning_rate": 9.094941737288137e-06, "loss": 1.2054, "mean_token_accuracy": 0.7303856015205383, "num_tokens": 5501045.0, "step": 6836 }, { "epoch": 1.8109110169491527, "grad_norm": 1.1856664419174194, "learning_rate": 9.094676906779662e-06, "loss": 1.2698, "mean_token_accuracy": 0.7205808535218239, "num_tokens": 5503306.0, "step": 6838 }, { "epoch": 1.8114406779661016, "grad_norm": 2.6513803005218506, "learning_rate": 9.094412076271187e-06, "loss": 1.6084, "mean_token_accuracy": 0.6358400285243988, "num_tokens": 5504565.0, "step": 6840 }, { "epoch": 1.8119703389830508, "grad_norm": 1.5970574617385864, "learning_rate": 9.094147245762712e-06, "loss": 1.3611, "mean_token_accuracy": 0.6768734902143478, "num_tokens": 5506079.0, "step": 6842 }, { "epoch": 1.8125, "grad_norm": 1.3453381061553955, "learning_rate": 9.093882415254238e-06, "loss": 1.1539, "mean_token_accuracy": 0.707453615963459, "num_tokens": 5508059.0, "step": 6844 }, { "epoch": 1.8130296610169492, "grad_norm": 1.4845621585845947, "learning_rate": 9.093617584745763e-06, "loss": 1.4866, "mean_token_accuracy": 0.6835177801549435, "num_tokens": 5509757.0, "step": 6846 }, { "epoch": 1.8135593220338984, "grad_norm": 1.242018461227417, "learning_rate": 9.093352754237288e-06, "loss": 1.0517, "mean_token_accuracy": 0.7332083135843277, "num_tokens": 5511448.0, "step": 6848 }, { "epoch": 1.8140889830508473, "grad_norm": 1.5778266191482544, "learning_rate": 9.093087923728813e-06, "loss": 1.3233, "mean_token_accuracy": 0.6993255540728569, "num_tokens": 5513084.0, "step": 6850 }, { "epoch": 1.8146186440677967, "grad_norm": 1.9310582876205444, "learning_rate": 9.09282309322034e-06, "loss": 1.6373, "mean_token_accuracy": 0.6560299694538116, "num_tokens": 5514576.0, "step": 6852 }, { "epoch": 1.8151483050847457, "grad_norm": 1.5075798034667969, "learning_rate": 9.092558262711865e-06, "loss": 1.137, "mean_token_accuracy": 0.7156312391161919, "num_tokens": 5516429.0, "step": 6854 }, { "epoch": 1.815677966101695, "grad_norm": 1.799898624420166, "learning_rate": 9.092293432203391e-06, "loss": 1.6309, "mean_token_accuracy": 0.6398084536194801, "num_tokens": 5517890.0, "step": 6856 }, { "epoch": 1.816207627118644, "grad_norm": 1.1185977458953857, "learning_rate": 9.092028601694916e-06, "loss": 1.2214, "mean_token_accuracy": 0.7184348851442337, "num_tokens": 5520355.0, "step": 6858 }, { "epoch": 1.8167372881355932, "grad_norm": 1.910335659980774, "learning_rate": 9.091763771186441e-06, "loss": 1.1306, "mean_token_accuracy": 0.7401856556534767, "num_tokens": 5521811.0, "step": 6860 }, { "epoch": 1.8172669491525424, "grad_norm": 1.5791486501693726, "learning_rate": 9.091498940677966e-06, "loss": 1.2443, "mean_token_accuracy": 0.7104068771004677, "num_tokens": 5523413.0, "step": 6862 }, { "epoch": 1.8177966101694916, "grad_norm": 1.9155410528182983, "learning_rate": 9.091234110169493e-06, "loss": 1.564, "mean_token_accuracy": 0.6691959649324417, "num_tokens": 5524787.0, "step": 6864 }, { "epoch": 1.8183262711864407, "grad_norm": 1.8737953901290894, "learning_rate": 9.090969279661018e-06, "loss": 1.591, "mean_token_accuracy": 0.6520008370280266, "num_tokens": 5526391.0, "step": 6866 }, { "epoch": 1.8188559322033897, "grad_norm": 1.2910453081130981, "learning_rate": 9.090704449152543e-06, "loss": 1.2021, "mean_token_accuracy": 0.734244205057621, "num_tokens": 5527805.0, "step": 6868 }, { "epoch": 1.819385593220339, "grad_norm": 1.792844295501709, "learning_rate": 9.090439618644068e-06, "loss": 1.5729, "mean_token_accuracy": 0.6591081619262695, "num_tokens": 5529276.0, "step": 6870 }, { "epoch": 1.819915254237288, "grad_norm": 1.680607557296753, "learning_rate": 9.090174788135594e-06, "loss": 1.2583, "mean_token_accuracy": 0.707405686378479, "num_tokens": 5530918.0, "step": 6872 }, { "epoch": 1.8204449152542372, "grad_norm": 1.5183547735214233, "learning_rate": 9.08990995762712e-06, "loss": 1.4638, "mean_token_accuracy": 0.6743801236152649, "num_tokens": 5532471.0, "step": 6874 }, { "epoch": 1.8209745762711864, "grad_norm": 1.3908275365829468, "learning_rate": 9.089645127118644e-06, "loss": 1.0509, "mean_token_accuracy": 0.7270056530833244, "num_tokens": 5534198.0, "step": 6876 }, { "epoch": 1.8215042372881356, "grad_norm": 1.5117299556732178, "learning_rate": 9.08938029661017e-06, "loss": 1.2619, "mean_token_accuracy": 0.6936850100755692, "num_tokens": 5535813.0, "step": 6878 }, { "epoch": 1.8220338983050848, "grad_norm": 1.6187225580215454, "learning_rate": 9.089115466101696e-06, "loss": 1.3051, "mean_token_accuracy": 0.7042249217629433, "num_tokens": 5537506.0, "step": 6880 }, { "epoch": 1.8225635593220337, "grad_norm": 1.6724209785461426, "learning_rate": 9.088850635593222e-06, "loss": 1.0849, "mean_token_accuracy": 0.7332053110003471, "num_tokens": 5539047.0, "step": 6882 }, { "epoch": 1.8230932203389831, "grad_norm": 1.587050199508667, "learning_rate": 9.088585805084747e-06, "loss": 1.2725, "mean_token_accuracy": 0.6886167451739311, "num_tokens": 5540518.0, "step": 6884 }, { "epoch": 1.823622881355932, "grad_norm": 1.4849684238433838, "learning_rate": 9.088320974576272e-06, "loss": 1.495, "mean_token_accuracy": 0.6555045805871487, "num_tokens": 5542429.0, "step": 6886 }, { "epoch": 1.8241525423728815, "grad_norm": 1.6865737438201904, "learning_rate": 9.088056144067797e-06, "loss": 1.4783, "mean_token_accuracy": 0.6540272682905197, "num_tokens": 5543977.0, "step": 6888 }, { "epoch": 1.8246822033898304, "grad_norm": 1.729699969291687, "learning_rate": 9.087791313559324e-06, "loss": 1.0398, "mean_token_accuracy": 0.7300195395946503, "num_tokens": 5545465.0, "step": 6890 }, { "epoch": 1.8252118644067796, "grad_norm": 1.4125241041183472, "learning_rate": 9.087526483050849e-06, "loss": 1.0901, "mean_token_accuracy": 0.755342073738575, "num_tokens": 5547035.0, "step": 6892 }, { "epoch": 1.8257415254237288, "grad_norm": 1.8226056098937988, "learning_rate": 9.087261652542373e-06, "loss": 1.2533, "mean_token_accuracy": 0.6853447258472443, "num_tokens": 5548612.0, "step": 6894 }, { "epoch": 1.826271186440678, "grad_norm": 1.8215845823287964, "learning_rate": 9.086996822033898e-06, "loss": 1.5505, "mean_token_accuracy": 0.6786178350448608, "num_tokens": 5550011.0, "step": 6896 }, { "epoch": 1.8268008474576272, "grad_norm": 1.8040517568588257, "learning_rate": 9.086731991525425e-06, "loss": 0.902, "mean_token_accuracy": 0.7779790759086609, "num_tokens": 5551425.0, "step": 6898 }, { "epoch": 1.8273305084745761, "grad_norm": 1.6880638599395752, "learning_rate": 9.08646716101695e-06, "loss": 1.3138, "mean_token_accuracy": 0.6838629469275475, "num_tokens": 5553085.0, "step": 6900 }, { "epoch": 1.8278601694915255, "grad_norm": 1.5860165357589722, "learning_rate": 9.086202330508475e-06, "loss": 1.0756, "mean_token_accuracy": 0.7365376725792885, "num_tokens": 5554271.0, "step": 6902 }, { "epoch": 1.8283898305084745, "grad_norm": 1.889123797416687, "learning_rate": 9.0859375e-06, "loss": 1.3881, "mean_token_accuracy": 0.7127976268529892, "num_tokens": 5555702.0, "step": 6904 }, { "epoch": 1.8289194915254239, "grad_norm": 1.669599175453186, "learning_rate": 9.085672669491526e-06, "loss": 1.2822, "mean_token_accuracy": 0.726009912788868, "num_tokens": 5557671.0, "step": 6906 }, { "epoch": 1.8294491525423728, "grad_norm": 1.4356591701507568, "learning_rate": 9.085407838983051e-06, "loss": 1.0655, "mean_token_accuracy": 0.7385055720806122, "num_tokens": 5559305.0, "step": 6908 }, { "epoch": 1.829978813559322, "grad_norm": 1.5469468832015991, "learning_rate": 9.085143008474578e-06, "loss": 1.8293, "mean_token_accuracy": 0.5923226401209831, "num_tokens": 5561002.0, "step": 6910 }, { "epoch": 1.8305084745762712, "grad_norm": 1.5913915634155273, "learning_rate": 9.084878177966103e-06, "loss": 1.1605, "mean_token_accuracy": 0.734638474881649, "num_tokens": 5562439.0, "step": 6912 }, { "epoch": 1.8310381355932204, "grad_norm": 1.818041443824768, "learning_rate": 9.084613347457628e-06, "loss": 1.519, "mean_token_accuracy": 0.6662858091294765, "num_tokens": 5563984.0, "step": 6914 }, { "epoch": 1.8315677966101696, "grad_norm": 1.590887188911438, "learning_rate": 9.084348516949153e-06, "loss": 1.1643, "mean_token_accuracy": 0.7470946311950684, "num_tokens": 5565366.0, "step": 6916 }, { "epoch": 1.8320974576271185, "grad_norm": 1.2412080764770508, "learning_rate": 9.08408368644068e-06, "loss": 1.0041, "mean_token_accuracy": 0.7461922466754913, "num_tokens": 5567175.0, "step": 6918 }, { "epoch": 1.832627118644068, "grad_norm": 1.3892775774002075, "learning_rate": 9.083818855932204e-06, "loss": 1.0838, "mean_token_accuracy": 0.724794402718544, "num_tokens": 5568716.0, "step": 6920 }, { "epoch": 1.8331567796610169, "grad_norm": 2.0449564456939697, "learning_rate": 9.08355402542373e-06, "loss": 1.5763, "mean_token_accuracy": 0.6709667891263962, "num_tokens": 5570128.0, "step": 6922 }, { "epoch": 1.8336864406779663, "grad_norm": 1.827139139175415, "learning_rate": 9.083289194915254e-06, "loss": 1.6614, "mean_token_accuracy": 0.6479788720607758, "num_tokens": 5571788.0, "step": 6924 }, { "epoch": 1.8342161016949152, "grad_norm": 1.3304096460342407, "learning_rate": 9.08302436440678e-06, "loss": 1.3921, "mean_token_accuracy": 0.6886019334197044, "num_tokens": 5573744.0, "step": 6926 }, { "epoch": 1.8347457627118644, "grad_norm": 1.6073559522628784, "learning_rate": 9.082759533898306e-06, "loss": 1.2854, "mean_token_accuracy": 0.6604223884642124, "num_tokens": 5576328.0, "step": 6928 }, { "epoch": 1.8352754237288136, "grad_norm": 1.6790771484375, "learning_rate": 9.08249470338983e-06, "loss": 0.9622, "mean_token_accuracy": 0.7429182417690754, "num_tokens": 5577801.0, "step": 6930 }, { "epoch": 1.8358050847457628, "grad_norm": 1.8343935012817383, "learning_rate": 9.082229872881356e-06, "loss": 1.403, "mean_token_accuracy": 0.6949217915534973, "num_tokens": 5579246.0, "step": 6932 }, { "epoch": 1.836334745762712, "grad_norm": 1.3309345245361328, "learning_rate": 9.081965042372882e-06, "loss": 1.4837, "mean_token_accuracy": 0.6607310846447945, "num_tokens": 5581041.0, "step": 6934 }, { "epoch": 1.836864406779661, "grad_norm": 1.5159214735031128, "learning_rate": 9.081700211864407e-06, "loss": 1.4378, "mean_token_accuracy": 0.6799951791763306, "num_tokens": 5582845.0, "step": 6936 }, { "epoch": 1.8373940677966103, "grad_norm": 1.8801509141921997, "learning_rate": 9.081435381355934e-06, "loss": 1.4523, "mean_token_accuracy": 0.6863446235656738, "num_tokens": 5584342.0, "step": 6938 }, { "epoch": 1.8379237288135593, "grad_norm": 1.3903110027313232, "learning_rate": 9.081170550847459e-06, "loss": 0.8791, "mean_token_accuracy": 0.7747137248516083, "num_tokens": 5585651.0, "step": 6940 }, { "epoch": 1.8384533898305084, "grad_norm": 1.7252622842788696, "learning_rate": 9.080905720338984e-06, "loss": 1.8826, "mean_token_accuracy": 0.6037546433508396, "num_tokens": 5587191.0, "step": 6942 }, { "epoch": 1.8389830508474576, "grad_norm": 1.6935142278671265, "learning_rate": 9.080640889830509e-06, "loss": 1.0424, "mean_token_accuracy": 0.7507185637950897, "num_tokens": 5588515.0, "step": 6944 }, { "epoch": 1.8395127118644068, "grad_norm": 2.058833599090576, "learning_rate": 9.080376059322035e-06, "loss": 1.3111, "mean_token_accuracy": 0.716299258172512, "num_tokens": 5590119.0, "step": 6946 }, { "epoch": 1.840042372881356, "grad_norm": 1.6584078073501587, "learning_rate": 9.08011122881356e-06, "loss": 1.0675, "mean_token_accuracy": 0.7586538940668106, "num_tokens": 5591614.0, "step": 6948 }, { "epoch": 1.840572033898305, "grad_norm": 1.3972656726837158, "learning_rate": 9.079846398305085e-06, "loss": 1.4525, "mean_token_accuracy": 0.6607425846159458, "num_tokens": 5593409.0, "step": 6950 }, { "epoch": 1.8411016949152543, "grad_norm": 1.563751220703125, "learning_rate": 9.07958156779661e-06, "loss": 1.3789, "mean_token_accuracy": 0.6757482960820198, "num_tokens": 5595264.0, "step": 6952 }, { "epoch": 1.8416313559322033, "grad_norm": 1.2713648080825806, "learning_rate": 9.079316737288137e-06, "loss": 1.0291, "mean_token_accuracy": 0.7622454762458801, "num_tokens": 5597059.0, "step": 6954 }, { "epoch": 1.8421610169491527, "grad_norm": 1.5745575428009033, "learning_rate": 9.079051906779661e-06, "loss": 1.1427, "mean_token_accuracy": 0.7297241315245628, "num_tokens": 5598698.0, "step": 6956 }, { "epoch": 1.8426906779661016, "grad_norm": 1.1636956930160522, "learning_rate": 9.078787076271186e-06, "loss": 0.9613, "mean_token_accuracy": 0.7413768619298935, "num_tokens": 5600382.0, "step": 6958 }, { "epoch": 1.8432203389830508, "grad_norm": 1.6863480806350708, "learning_rate": 9.078522245762713e-06, "loss": 1.0268, "mean_token_accuracy": 0.7530750632286072, "num_tokens": 5602042.0, "step": 6960 }, { "epoch": 1.84375, "grad_norm": 1.5083808898925781, "learning_rate": 9.078257415254238e-06, "loss": 1.7333, "mean_token_accuracy": 0.6051489003002644, "num_tokens": 5603867.0, "step": 6962 }, { "epoch": 1.8442796610169492, "grad_norm": 1.1209070682525635, "learning_rate": 9.077992584745765e-06, "loss": 0.8387, "mean_token_accuracy": 0.7798356339335442, "num_tokens": 5605995.0, "step": 6964 }, { "epoch": 1.8448093220338984, "grad_norm": 1.9547138214111328, "learning_rate": 9.07772775423729e-06, "loss": 1.9135, "mean_token_accuracy": 0.6204229258000851, "num_tokens": 5607563.0, "step": 6966 }, { "epoch": 1.8453389830508473, "grad_norm": 1.3147953748703003, "learning_rate": 9.077462923728814e-06, "loss": 1.2773, "mean_token_accuracy": 0.7140476256608963, "num_tokens": 5609332.0, "step": 6968 }, { "epoch": 1.8458686440677967, "grad_norm": 1.4858553409576416, "learning_rate": 9.07719809322034e-06, "loss": 1.0269, "mean_token_accuracy": 0.7729184776544571, "num_tokens": 5610867.0, "step": 6970 }, { "epoch": 1.8463983050847457, "grad_norm": 1.5391756296157837, "learning_rate": 9.076933262711866e-06, "loss": 1.5884, "mean_token_accuracy": 0.6740216165781021, "num_tokens": 5612687.0, "step": 6972 }, { "epoch": 1.846927966101695, "grad_norm": 1.9676352739334106, "learning_rate": 9.076668432203391e-06, "loss": 1.472, "mean_token_accuracy": 0.6679452508687973, "num_tokens": 5614411.0, "step": 6974 }, { "epoch": 1.847457627118644, "grad_norm": 1.8609141111373901, "learning_rate": 9.076403601694916e-06, "loss": 1.7185, "mean_token_accuracy": 0.6285541877150536, "num_tokens": 5616231.0, "step": 6976 }, { "epoch": 1.8479872881355932, "grad_norm": 1.1775212287902832, "learning_rate": 9.07613877118644e-06, "loss": 0.8234, "mean_token_accuracy": 0.7885456457734108, "num_tokens": 5617837.0, "step": 6978 }, { "epoch": 1.8485169491525424, "grad_norm": 1.6883207559585571, "learning_rate": 9.075873940677967e-06, "loss": 1.5774, "mean_token_accuracy": 0.6303320452570915, "num_tokens": 5619486.0, "step": 6980 }, { "epoch": 1.8490466101694916, "grad_norm": 1.4148883819580078, "learning_rate": 9.075609110169492e-06, "loss": 1.3874, "mean_token_accuracy": 0.6777274906635284, "num_tokens": 5621053.0, "step": 6982 }, { "epoch": 1.8495762711864407, "grad_norm": 1.6507166624069214, "learning_rate": 9.075344279661017e-06, "loss": 1.5398, "mean_token_accuracy": 0.6814097091555595, "num_tokens": 5622400.0, "step": 6984 }, { "epoch": 1.8501059322033897, "grad_norm": 1.7553582191467285, "learning_rate": 9.075079449152542e-06, "loss": 1.3762, "mean_token_accuracy": 0.6736228168010712, "num_tokens": 5624055.0, "step": 6986 }, { "epoch": 1.850635593220339, "grad_norm": 1.8098561763763428, "learning_rate": 9.074814618644069e-06, "loss": 1.5126, "mean_token_accuracy": 0.6630753055214882, "num_tokens": 5625690.0, "step": 6988 }, { "epoch": 1.851165254237288, "grad_norm": 1.4515693187713623, "learning_rate": 9.074549788135594e-06, "loss": 1.3234, "mean_token_accuracy": 0.6908864080905914, "num_tokens": 5627312.0, "step": 6990 }, { "epoch": 1.8516949152542372, "grad_norm": 1.5496923923492432, "learning_rate": 9.07428495762712e-06, "loss": 1.2948, "mean_token_accuracy": 0.6965868026018143, "num_tokens": 5628847.0, "step": 6992 }, { "epoch": 1.8522245762711864, "grad_norm": 1.6524903774261475, "learning_rate": 9.074020127118645e-06, "loss": 1.4296, "mean_token_accuracy": 0.6680499874055386, "num_tokens": 5630518.0, "step": 6994 }, { "epoch": 1.8527542372881356, "grad_norm": 1.953811764717102, "learning_rate": 9.07375529661017e-06, "loss": 1.3504, "mean_token_accuracy": 0.6849886402487755, "num_tokens": 5631849.0, "step": 6996 }, { "epoch": 1.8532838983050848, "grad_norm": 1.8055404424667358, "learning_rate": 9.073490466101695e-06, "loss": 1.9349, "mean_token_accuracy": 0.5791692212224007, "num_tokens": 5633756.0, "step": 6998 }, { "epoch": 1.8538135593220337, "grad_norm": 1.579379677772522, "learning_rate": 9.073225635593222e-06, "loss": 1.3446, "step": 7000 }, { "epoch": 1.8538135593220337, "eval_loss": 1.3134313821792603, "eval_mean_token_accuracy": 0.7002581786606219, "eval_num_tokens": 5635378.0, "eval_runtime": 48.3611, "eval_samples_per_second": 6.369, "eval_steps_per_second": 6.369, "step": 7000 }, { "epoch": 1.8543432203389831, "grad_norm": 1.3855226039886475, "learning_rate": 9.072960805084747e-06, "loss": 1.2142, "mean_token_accuracy": 0.702954888343811, "num_tokens": 5636943.0, "step": 7002 }, { "epoch": 1.854872881355932, "grad_norm": 1.358729600906372, "learning_rate": 9.072695974576272e-06, "loss": 0.8785, "mean_token_accuracy": 0.7972686439752579, "num_tokens": 5638476.0, "step": 7004 }, { "epoch": 1.8554025423728815, "grad_norm": 1.7572345733642578, "learning_rate": 9.072431144067797e-06, "loss": 1.2443, "mean_token_accuracy": 0.7303402870893478, "num_tokens": 5639907.0, "step": 7006 }, { "epoch": 1.8559322033898304, "grad_norm": 1.566251277923584, "learning_rate": 9.072166313559323e-06, "loss": 1.4231, "mean_token_accuracy": 0.721201729029417, "num_tokens": 5641316.0, "step": 7008 }, { "epoch": 1.8564618644067796, "grad_norm": 1.6001192331314087, "learning_rate": 9.071901483050848e-06, "loss": 1.3454, "mean_token_accuracy": 0.7028109207749367, "num_tokens": 5642878.0, "step": 7010 }, { "epoch": 1.8569915254237288, "grad_norm": 1.5982704162597656, "learning_rate": 9.071636652542373e-06, "loss": 1.5483, "mean_token_accuracy": 0.6390077546238899, "num_tokens": 5644694.0, "step": 7012 }, { "epoch": 1.857521186440678, "grad_norm": 1.6764034032821655, "learning_rate": 9.071371822033898e-06, "loss": 1.1291, "mean_token_accuracy": 0.7379333935678005, "num_tokens": 5646216.0, "step": 7014 }, { "epoch": 1.8580508474576272, "grad_norm": 1.702406883239746, "learning_rate": 9.071106991525425e-06, "loss": 1.4706, "mean_token_accuracy": 0.6626339852809906, "num_tokens": 5647952.0, "step": 7016 }, { "epoch": 1.8585805084745761, "grad_norm": 1.895059585571289, "learning_rate": 9.07084216101695e-06, "loss": 1.6032, "mean_token_accuracy": 0.654455691576004, "num_tokens": 5649436.0, "step": 7018 }, { "epoch": 1.8591101694915255, "grad_norm": 1.2778019905090332, "learning_rate": 9.070577330508476e-06, "loss": 1.1965, "mean_token_accuracy": 0.7271705716848373, "num_tokens": 5651360.0, "step": 7020 }, { "epoch": 1.8596398305084745, "grad_norm": 1.5294865369796753, "learning_rate": 9.070312500000001e-06, "loss": 1.1928, "mean_token_accuracy": 0.7308241501450539, "num_tokens": 5652740.0, "step": 7022 }, { "epoch": 1.8601694915254239, "grad_norm": 1.6189320087432861, "learning_rate": 9.070047669491526e-06, "loss": 1.6699, "mean_token_accuracy": 0.647515133023262, "num_tokens": 5654280.0, "step": 7024 }, { "epoch": 1.8606991525423728, "grad_norm": 1.6157705783843994, "learning_rate": 9.069782838983051e-06, "loss": 1.2654, "mean_token_accuracy": 0.7084082290530205, "num_tokens": 5655741.0, "step": 7026 }, { "epoch": 1.861228813559322, "grad_norm": 1.9173465967178345, "learning_rate": 9.069518008474578e-06, "loss": 1.4023, "mean_token_accuracy": 0.6610125675797462, "num_tokens": 5657240.0, "step": 7028 }, { "epoch": 1.8617584745762712, "grad_norm": 1.6119904518127441, "learning_rate": 9.069253177966103e-06, "loss": 1.1862, "mean_token_accuracy": 0.7280844449996948, "num_tokens": 5658636.0, "step": 7030 }, { "epoch": 1.8622881355932204, "grad_norm": 1.4249714612960815, "learning_rate": 9.068988347457627e-06, "loss": 1.1357, "mean_token_accuracy": 0.7458668202161789, "num_tokens": 5660210.0, "step": 7032 }, { "epoch": 1.8628177966101696, "grad_norm": 1.8073740005493164, "learning_rate": 9.068723516949152e-06, "loss": 1.7271, "mean_token_accuracy": 0.6422180086374283, "num_tokens": 5661835.0, "step": 7034 }, { "epoch": 1.8633474576271185, "grad_norm": 1.6613363027572632, "learning_rate": 9.068458686440679e-06, "loss": 1.5239, "mean_token_accuracy": 0.6512240245938301, "num_tokens": 5663361.0, "step": 7036 }, { "epoch": 1.863877118644068, "grad_norm": 1.251325011253357, "learning_rate": 9.068193855932204e-06, "loss": 0.9787, "mean_token_accuracy": 0.7563428580760956, "num_tokens": 5664924.0, "step": 7038 }, { "epoch": 1.8644067796610169, "grad_norm": 1.532326579093933, "learning_rate": 9.067929025423729e-06, "loss": 1.4122, "mean_token_accuracy": 0.6845449134707451, "num_tokens": 5666580.0, "step": 7040 }, { "epoch": 1.8649364406779663, "grad_norm": 1.3140225410461426, "learning_rate": 9.067664194915255e-06, "loss": 1.1498, "mean_token_accuracy": 0.7354604378342628, "num_tokens": 5668031.0, "step": 7042 }, { "epoch": 1.8654661016949152, "grad_norm": 1.4575238227844238, "learning_rate": 9.06739936440678e-06, "loss": 0.9834, "mean_token_accuracy": 0.755275309085846, "num_tokens": 5669591.0, "step": 7044 }, { "epoch": 1.8659957627118644, "grad_norm": 1.6918888092041016, "learning_rate": 9.067134533898307e-06, "loss": 1.7264, "mean_token_accuracy": 0.6272558644413948, "num_tokens": 5671064.0, "step": 7046 }, { "epoch": 1.8665254237288136, "grad_norm": 1.596646785736084, "learning_rate": 9.066869703389832e-06, "loss": 1.3168, "mean_token_accuracy": 0.7007053270936012, "num_tokens": 5672412.0, "step": 7048 }, { "epoch": 1.8670550847457628, "grad_norm": 1.5883913040161133, "learning_rate": 9.066604872881357e-06, "loss": 1.5905, "mean_token_accuracy": 0.6478186435997486, "num_tokens": 5674227.0, "step": 7050 }, { "epoch": 1.867584745762712, "grad_norm": 1.4397257566452026, "learning_rate": 9.066340042372882e-06, "loss": 1.0504, "mean_token_accuracy": 0.7367837503552437, "num_tokens": 5675825.0, "step": 7052 }, { "epoch": 1.868114406779661, "grad_norm": 1.8758128881454468, "learning_rate": 9.066075211864408e-06, "loss": 1.3874, "mean_token_accuracy": 0.6889802739024162, "num_tokens": 5677418.0, "step": 7054 }, { "epoch": 1.8686440677966103, "grad_norm": 1.4045734405517578, "learning_rate": 9.065810381355933e-06, "loss": 1.3478, "mean_token_accuracy": 0.6966215595602989, "num_tokens": 5679101.0, "step": 7056 }, { "epoch": 1.8691737288135593, "grad_norm": 1.8158167600631714, "learning_rate": 9.065545550847458e-06, "loss": 1.2048, "mean_token_accuracy": 0.7224232777953148, "num_tokens": 5680530.0, "step": 7058 }, { "epoch": 1.8697033898305084, "grad_norm": 1.5981435775756836, "learning_rate": 9.065280720338983e-06, "loss": 1.1493, "mean_token_accuracy": 0.7184822261333466, "num_tokens": 5682160.0, "step": 7060 }, { "epoch": 1.8702330508474576, "grad_norm": 1.5582247972488403, "learning_rate": 9.06501588983051e-06, "loss": 1.5037, "mean_token_accuracy": 0.6892072074115276, "num_tokens": 5683656.0, "step": 7062 }, { "epoch": 1.8707627118644068, "grad_norm": 1.8802638053894043, "learning_rate": 9.064751059322035e-06, "loss": 1.6705, "mean_token_accuracy": 0.6289894953370094, "num_tokens": 5685011.0, "step": 7064 }, { "epoch": 1.871292372881356, "grad_norm": 1.6572848558425903, "learning_rate": 9.06448622881356e-06, "loss": 1.4522, "mean_token_accuracy": 0.6827244311571121, "num_tokens": 5686740.0, "step": 7066 }, { "epoch": 1.871822033898305, "grad_norm": 1.578904151916504, "learning_rate": 9.064221398305085e-06, "loss": 1.4294, "mean_token_accuracy": 0.6779777258634567, "num_tokens": 5688337.0, "step": 7068 }, { "epoch": 1.8723516949152543, "grad_norm": 1.826063871383667, "learning_rate": 9.063956567796611e-06, "loss": 1.6901, "mean_token_accuracy": 0.6317310407757759, "num_tokens": 5689946.0, "step": 7070 }, { "epoch": 1.8728813559322033, "grad_norm": 1.437757134437561, "learning_rate": 9.063691737288136e-06, "loss": 1.4235, "mean_token_accuracy": 0.642868161201477, "num_tokens": 5691771.0, "step": 7072 }, { "epoch": 1.8734110169491527, "grad_norm": 2.035769462585449, "learning_rate": 9.063426906779663e-06, "loss": 1.5742, "mean_token_accuracy": 0.6703033074736595, "num_tokens": 5693008.0, "step": 7074 }, { "epoch": 1.8739406779661016, "grad_norm": 1.5123264789581299, "learning_rate": 9.063162076271188e-06, "loss": 1.1726, "mean_token_accuracy": 0.7234683856368065, "num_tokens": 5694627.0, "step": 7076 }, { "epoch": 1.8744703389830508, "grad_norm": 1.6510709524154663, "learning_rate": 9.062897245762713e-06, "loss": 1.3982, "mean_token_accuracy": 0.705161064863205, "num_tokens": 5696162.0, "step": 7078 }, { "epoch": 1.875, "grad_norm": 1.2133190631866455, "learning_rate": 9.062632415254238e-06, "loss": 0.9813, "mean_token_accuracy": 0.7591023445129395, "num_tokens": 5698059.0, "step": 7080 }, { "epoch": 1.8755296610169492, "grad_norm": 1.3152533769607544, "learning_rate": 9.062367584745764e-06, "loss": 1.2735, "mean_token_accuracy": 0.7017393559217453, "num_tokens": 5699699.0, "step": 7082 }, { "epoch": 1.8760593220338984, "grad_norm": 1.4788775444030762, "learning_rate": 9.062102754237289e-06, "loss": 0.9085, "mean_token_accuracy": 0.7877783551812172, "num_tokens": 5701217.0, "step": 7084 }, { "epoch": 1.8765889830508473, "grad_norm": 1.9106453657150269, "learning_rate": 9.061837923728814e-06, "loss": 1.0577, "mean_token_accuracy": 0.7448162510991096, "num_tokens": 5702556.0, "step": 7086 }, { "epoch": 1.8771186440677967, "grad_norm": 1.574099063873291, "learning_rate": 9.061573093220339e-06, "loss": 1.1106, "mean_token_accuracy": 0.7635921388864517, "num_tokens": 5704104.0, "step": 7088 }, { "epoch": 1.8776483050847457, "grad_norm": 1.507758378982544, "learning_rate": 9.061308262711866e-06, "loss": 0.7094, "mean_token_accuracy": 0.8193476721644402, "num_tokens": 5705487.0, "step": 7090 }, { "epoch": 1.878177966101695, "grad_norm": 1.662350058555603, "learning_rate": 9.06104343220339e-06, "loss": 1.2839, "mean_token_accuracy": 0.708502784371376, "num_tokens": 5707098.0, "step": 7092 }, { "epoch": 1.878707627118644, "grad_norm": 2.0006206035614014, "learning_rate": 9.060778601694915e-06, "loss": 1.8834, "mean_token_accuracy": 0.6098851710557938, "num_tokens": 5708574.0, "step": 7094 }, { "epoch": 1.8792372881355932, "grad_norm": 1.6957504749298096, "learning_rate": 9.06051377118644e-06, "loss": 1.3035, "mean_token_accuracy": 0.6882267966866493, "num_tokens": 5710290.0, "step": 7096 }, { "epoch": 1.8797669491525424, "grad_norm": 1.685475468635559, "learning_rate": 9.060248940677967e-06, "loss": 1.8219, "mean_token_accuracy": 0.5807065479457378, "num_tokens": 5711850.0, "step": 7098 }, { "epoch": 1.8802966101694916, "grad_norm": 1.711799144744873, "learning_rate": 9.059984110169492e-06, "loss": 1.1734, "mean_token_accuracy": 0.7018214166164398, "num_tokens": 5713239.0, "step": 7100 }, { "epoch": 1.8808262711864407, "grad_norm": 1.235708236694336, "learning_rate": 9.059719279661019e-06, "loss": 0.7944, "mean_token_accuracy": 0.799321174621582, "num_tokens": 5714820.0, "step": 7102 }, { "epoch": 1.8813559322033897, "grad_norm": 1.8343666791915894, "learning_rate": 9.059454449152544e-06, "loss": 1.3581, "mean_token_accuracy": 0.7156985029578209, "num_tokens": 5716360.0, "step": 7104 }, { "epoch": 1.881885593220339, "grad_norm": 1.6040165424346924, "learning_rate": 9.059189618644068e-06, "loss": 1.2443, "mean_token_accuracy": 0.7076909691095352, "num_tokens": 5718110.0, "step": 7106 }, { "epoch": 1.882415254237288, "grad_norm": 1.8788806200027466, "learning_rate": 9.058924788135593e-06, "loss": 1.2003, "mean_token_accuracy": 0.7232289090752602, "num_tokens": 5719574.0, "step": 7108 }, { "epoch": 1.8829449152542372, "grad_norm": 1.5227044820785522, "learning_rate": 9.05865995762712e-06, "loss": 1.1254, "mean_token_accuracy": 0.7380528599023819, "num_tokens": 5721102.0, "step": 7110 }, { "epoch": 1.8834745762711864, "grad_norm": 1.5830769538879395, "learning_rate": 9.058395127118645e-06, "loss": 1.3075, "mean_token_accuracy": 0.6962127834558487, "num_tokens": 5722922.0, "step": 7112 }, { "epoch": 1.8840042372881356, "grad_norm": 1.930368185043335, "learning_rate": 9.05813029661017e-06, "loss": 1.3897, "mean_token_accuracy": 0.6833531931042671, "num_tokens": 5724395.0, "step": 7114 }, { "epoch": 1.8845338983050848, "grad_norm": 1.6282659769058228, "learning_rate": 9.057865466101695e-06, "loss": 1.312, "mean_token_accuracy": 0.6945675015449524, "num_tokens": 5726312.0, "step": 7116 }, { "epoch": 1.8850635593220337, "grad_norm": 1.4060792922973633, "learning_rate": 9.057600635593221e-06, "loss": 1.2701, "mean_token_accuracy": 0.6725979670882225, "num_tokens": 5728506.0, "step": 7118 }, { "epoch": 1.8855932203389831, "grad_norm": 2.039478302001953, "learning_rate": 9.057335805084746e-06, "loss": 1.3741, "mean_token_accuracy": 0.7182293012738228, "num_tokens": 5729937.0, "step": 7120 }, { "epoch": 1.886122881355932, "grad_norm": 1.5309038162231445, "learning_rate": 9.057070974576271e-06, "loss": 1.4579, "mean_token_accuracy": 0.6612421572208405, "num_tokens": 5731719.0, "step": 7122 }, { "epoch": 1.8866525423728815, "grad_norm": 1.474812626838684, "learning_rate": 9.056806144067798e-06, "loss": 1.1378, "mean_token_accuracy": 0.7181640416383743, "num_tokens": 5733642.0, "step": 7124 }, { "epoch": 1.8871822033898304, "grad_norm": 1.4518115520477295, "learning_rate": 9.056541313559323e-06, "loss": 1.5913, "mean_token_accuracy": 0.6297040432691574, "num_tokens": 5735724.0, "step": 7126 }, { "epoch": 1.8877118644067796, "grad_norm": 1.642953872680664, "learning_rate": 9.05627648305085e-06, "loss": 1.2011, "mean_token_accuracy": 0.7170702442526817, "num_tokens": 5737141.0, "step": 7128 }, { "epoch": 1.8882415254237288, "grad_norm": 1.2252414226531982, "learning_rate": 9.056011652542374e-06, "loss": 1.0279, "mean_token_accuracy": 0.7457487285137177, "num_tokens": 5738960.0, "step": 7130 }, { "epoch": 1.888771186440678, "grad_norm": 1.7137502431869507, "learning_rate": 9.0557468220339e-06, "loss": 1.4447, "mean_token_accuracy": 0.6693085543811321, "num_tokens": 5740718.0, "step": 7132 }, { "epoch": 1.8893008474576272, "grad_norm": 1.52803373336792, "learning_rate": 9.055481991525424e-06, "loss": 1.1979, "mean_token_accuracy": 0.7315712571144104, "num_tokens": 5742072.0, "step": 7134 }, { "epoch": 1.8898305084745761, "grad_norm": 2.7395076751708984, "learning_rate": 9.055217161016951e-06, "loss": 1.5285, "mean_token_accuracy": 0.6772713661193848, "num_tokens": 5743433.0, "step": 7136 }, { "epoch": 1.8903601694915255, "grad_norm": 1.7604310512542725, "learning_rate": 9.054952330508476e-06, "loss": 1.2178, "mean_token_accuracy": 0.6924138814210892, "num_tokens": 5744833.0, "step": 7138 }, { "epoch": 1.8908898305084745, "grad_norm": 1.311018705368042, "learning_rate": 9.0546875e-06, "loss": 1.2118, "mean_token_accuracy": 0.7398838102817535, "num_tokens": 5746328.0, "step": 7140 }, { "epoch": 1.8914194915254239, "grad_norm": 1.7271898984909058, "learning_rate": 9.054422669491526e-06, "loss": 1.419, "mean_token_accuracy": 0.6951223164796829, "num_tokens": 5747677.0, "step": 7142 }, { "epoch": 1.8919491525423728, "grad_norm": 1.9151477813720703, "learning_rate": 9.054157838983052e-06, "loss": 1.624, "mean_token_accuracy": 0.6382759213447571, "num_tokens": 5749216.0, "step": 7144 }, { "epoch": 1.892478813559322, "grad_norm": 1.3492366075515747, "learning_rate": 9.053893008474577e-06, "loss": 1.2835, "mean_token_accuracy": 0.7112333700060844, "num_tokens": 5751095.0, "step": 7146 }, { "epoch": 1.8930084745762712, "grad_norm": 1.4980472326278687, "learning_rate": 9.053628177966102e-06, "loss": 1.237, "mean_token_accuracy": 0.709931343793869, "num_tokens": 5752762.0, "step": 7148 }, { "epoch": 1.8935381355932204, "grad_norm": 2.096837282180786, "learning_rate": 9.053363347457627e-06, "loss": 1.5373, "mean_token_accuracy": 0.6539532244205475, "num_tokens": 5754255.0, "step": 7150 }, { "epoch": 1.8940677966101696, "grad_norm": 1.8119279146194458, "learning_rate": 9.053098516949154e-06, "loss": 1.3352, "mean_token_accuracy": 0.6763590425252914, "num_tokens": 5755949.0, "step": 7152 }, { "epoch": 1.8945974576271185, "grad_norm": 1.6681228876113892, "learning_rate": 9.052833686440679e-06, "loss": 1.8585, "mean_token_accuracy": 0.6219748929142952, "num_tokens": 5757631.0, "step": 7154 }, { "epoch": 1.895127118644068, "grad_norm": 1.4502207040786743, "learning_rate": 9.052568855932205e-06, "loss": 1.0434, "mean_token_accuracy": 0.7201570272445679, "num_tokens": 5759146.0, "step": 7156 }, { "epoch": 1.8956567796610169, "grad_norm": 1.5035561323165894, "learning_rate": 9.05230402542373e-06, "loss": 1.1108, "mean_token_accuracy": 0.7498400658369064, "num_tokens": 5760670.0, "step": 7158 }, { "epoch": 1.8961864406779663, "grad_norm": 1.2111680507659912, "learning_rate": 9.052039194915255e-06, "loss": 0.7899, "mean_token_accuracy": 0.7925721034407616, "num_tokens": 5762411.0, "step": 7160 }, { "epoch": 1.8967161016949152, "grad_norm": 1.2281686067581177, "learning_rate": 9.05177436440678e-06, "loss": 1.1249, "mean_token_accuracy": 0.718913309276104, "num_tokens": 5764625.0, "step": 7162 }, { "epoch": 1.8972457627118644, "grad_norm": 1.721839427947998, "learning_rate": 9.051509533898307e-06, "loss": 1.4804, "mean_token_accuracy": 0.6671300455927849, "num_tokens": 5766007.0, "step": 7164 }, { "epoch": 1.8977754237288136, "grad_norm": 1.9804412126541138, "learning_rate": 9.051244703389832e-06, "loss": 1.2799, "mean_token_accuracy": 0.6966354325413704, "num_tokens": 5767507.0, "step": 7166 }, { "epoch": 1.8983050847457628, "grad_norm": 14.253013610839844, "learning_rate": 9.050979872881356e-06, "loss": 1.5911, "mean_token_accuracy": 0.6608220189809799, "num_tokens": 5769131.0, "step": 7168 }, { "epoch": 1.898834745762712, "grad_norm": 1.5742027759552002, "learning_rate": 9.050715042372881e-06, "loss": 1.3848, "mean_token_accuracy": 0.6840471178293228, "num_tokens": 5770768.0, "step": 7170 }, { "epoch": 1.899364406779661, "grad_norm": 1.6502898931503296, "learning_rate": 9.050450211864408e-06, "loss": 1.1583, "mean_token_accuracy": 0.7174621671438217, "num_tokens": 5772497.0, "step": 7172 }, { "epoch": 1.8998940677966103, "grad_norm": 1.776493787765503, "learning_rate": 9.050185381355933e-06, "loss": 1.3404, "mean_token_accuracy": 0.6876151189208031, "num_tokens": 5774083.0, "step": 7174 }, { "epoch": 1.9004237288135593, "grad_norm": 1.7591444253921509, "learning_rate": 9.049920550847458e-06, "loss": 1.6288, "mean_token_accuracy": 0.6742131188511848, "num_tokens": 5775650.0, "step": 7176 }, { "epoch": 1.9009533898305084, "grad_norm": 1.529980182647705, "learning_rate": 9.049655720338983e-06, "loss": 0.9531, "mean_token_accuracy": 0.7717616334557533, "num_tokens": 5777108.0, "step": 7178 }, { "epoch": 1.9014830508474576, "grad_norm": 1.686450481414795, "learning_rate": 9.04939088983051e-06, "loss": 1.6896, "mean_token_accuracy": 0.6201642453670502, "num_tokens": 5778759.0, "step": 7180 }, { "epoch": 1.9020127118644068, "grad_norm": 1.6741946935653687, "learning_rate": 9.049126059322034e-06, "loss": 1.4016, "mean_token_accuracy": 0.6959246285259724, "num_tokens": 5780215.0, "step": 7182 }, { "epoch": 1.902542372881356, "grad_norm": 1.6745586395263672, "learning_rate": 9.048861228813561e-06, "loss": 1.2299, "mean_token_accuracy": 0.7245704531669617, "num_tokens": 5781784.0, "step": 7184 }, { "epoch": 1.903072033898305, "grad_norm": 1.4922720193862915, "learning_rate": 9.048596398305084e-06, "loss": 1.2891, "mean_token_accuracy": 0.714314267039299, "num_tokens": 5783169.0, "step": 7186 }, { "epoch": 1.9036016949152543, "grad_norm": 1.2814122438430786, "learning_rate": 9.048331567796611e-06, "loss": 1.4097, "mean_token_accuracy": 0.6914474368095398, "num_tokens": 5785120.0, "step": 7188 }, { "epoch": 1.9041313559322033, "grad_norm": 1.9598345756530762, "learning_rate": 9.048066737288136e-06, "loss": 1.5537, "mean_token_accuracy": 0.6730374135077, "num_tokens": 5786520.0, "step": 7190 }, { "epoch": 1.9046610169491527, "grad_norm": 1.834059238433838, "learning_rate": 9.047801906779662e-06, "loss": 1.3859, "mean_token_accuracy": 0.6896252855658531, "num_tokens": 5788009.0, "step": 7192 }, { "epoch": 1.9051906779661016, "grad_norm": 1.9713374376296997, "learning_rate": 9.047537076271187e-06, "loss": 1.4405, "mean_token_accuracy": 0.6958571299910545, "num_tokens": 5789398.0, "step": 7194 }, { "epoch": 1.9057203389830508, "grad_norm": 1.0075976848602295, "learning_rate": 9.047272245762712e-06, "loss": 0.9606, "mean_token_accuracy": 0.7506486475467682, "num_tokens": 5791408.0, "step": 7196 }, { "epoch": 1.90625, "grad_norm": 1.2822961807250977, "learning_rate": 9.047007415254237e-06, "loss": 0.8252, "mean_token_accuracy": 0.8026715666055679, "num_tokens": 5793035.0, "step": 7198 }, { "epoch": 1.9067796610169492, "grad_norm": 1.3052043914794922, "learning_rate": 9.046742584745764e-06, "loss": 1.3823, "mean_token_accuracy": 0.6891150549054146, "num_tokens": 5794634.0, "step": 7200 }, { "epoch": 1.9073093220338984, "grad_norm": 1.274712085723877, "learning_rate": 9.046477754237289e-06, "loss": 0.8741, "mean_token_accuracy": 0.7849568724632263, "num_tokens": 5796129.0, "step": 7202 }, { "epoch": 1.9078389830508473, "grad_norm": 1.406428337097168, "learning_rate": 9.046212923728814e-06, "loss": 0.8933, "mean_token_accuracy": 0.7692480683326721, "num_tokens": 5797549.0, "step": 7204 }, { "epoch": 1.9083686440677967, "grad_norm": 1.7770659923553467, "learning_rate": 9.045948093220339e-06, "loss": 1.1863, "mean_token_accuracy": 0.7437794655561447, "num_tokens": 5798924.0, "step": 7206 }, { "epoch": 1.9088983050847457, "grad_norm": 1.960107684135437, "learning_rate": 9.045683262711865e-06, "loss": 1.6531, "mean_token_accuracy": 0.6383460685610771, "num_tokens": 5800601.0, "step": 7208 }, { "epoch": 1.909427966101695, "grad_norm": 1.4909287691116333, "learning_rate": 9.045418432203392e-06, "loss": 1.3577, "mean_token_accuracy": 0.6820268742740154, "num_tokens": 5802237.0, "step": 7210 }, { "epoch": 1.909957627118644, "grad_norm": 1.7533091306686401, "learning_rate": 9.045153601694917e-06, "loss": 1.4789, "mean_token_accuracy": 0.6613913848996162, "num_tokens": 5803907.0, "step": 7212 }, { "epoch": 1.9104872881355932, "grad_norm": 1.2655816078186035, "learning_rate": 9.044888771186442e-06, "loss": 0.8015, "mean_token_accuracy": 0.8128022104501724, "num_tokens": 5805298.0, "step": 7214 }, { "epoch": 1.9110169491525424, "grad_norm": 1.713434100151062, "learning_rate": 9.044623940677967e-06, "loss": 1.6504, "mean_token_accuracy": 0.6483498886227608, "num_tokens": 5806662.0, "step": 7216 }, { "epoch": 1.9115466101694916, "grad_norm": 1.9359838962554932, "learning_rate": 9.044359110169493e-06, "loss": 1.669, "mean_token_accuracy": 0.6318418905138969, "num_tokens": 5808306.0, "step": 7218 }, { "epoch": 1.9120762711864407, "grad_norm": 1.6395227909088135, "learning_rate": 9.044094279661018e-06, "loss": 1.5632, "mean_token_accuracy": 0.6404348909854889, "num_tokens": 5809677.0, "step": 7220 }, { "epoch": 1.9126059322033897, "grad_norm": 1.5054398775100708, "learning_rate": 9.043829449152543e-06, "loss": 1.3733, "mean_token_accuracy": 0.6854632794857025, "num_tokens": 5811333.0, "step": 7222 }, { "epoch": 1.913135593220339, "grad_norm": 1.5307523012161255, "learning_rate": 9.043564618644068e-06, "loss": 1.3679, "mean_token_accuracy": 0.6803345009684563, "num_tokens": 5812760.0, "step": 7224 }, { "epoch": 1.913665254237288, "grad_norm": 1.3686106204986572, "learning_rate": 9.043299788135595e-06, "loss": 1.0375, "mean_token_accuracy": 0.7412337735295296, "num_tokens": 5814501.0, "step": 7226 }, { "epoch": 1.9141949152542372, "grad_norm": 2.0104103088378906, "learning_rate": 9.04303495762712e-06, "loss": 1.2104, "mean_token_accuracy": 0.7043781131505966, "num_tokens": 5816018.0, "step": 7228 }, { "epoch": 1.9147245762711864, "grad_norm": 1.689733624458313, "learning_rate": 9.042770127118645e-06, "loss": 1.6743, "mean_token_accuracy": 0.6557523161172867, "num_tokens": 5817456.0, "step": 7230 }, { "epoch": 1.9152542372881356, "grad_norm": 1.5253369808197021, "learning_rate": 9.04250529661017e-06, "loss": 1.202, "mean_token_accuracy": 0.7220181599259377, "num_tokens": 5819135.0, "step": 7232 }, { "epoch": 1.9157838983050848, "grad_norm": 1.786194086074829, "learning_rate": 9.042240466101696e-06, "loss": 1.3056, "mean_token_accuracy": 0.6978886127471924, "num_tokens": 5821182.0, "step": 7234 }, { "epoch": 1.9163135593220337, "grad_norm": 1.7324960231781006, "learning_rate": 9.041975635593221e-06, "loss": 1.294, "mean_token_accuracy": 0.7278420627117157, "num_tokens": 5822790.0, "step": 7236 }, { "epoch": 1.9168432203389831, "grad_norm": 1.4214075803756714, "learning_rate": 9.041710805084748e-06, "loss": 0.9403, "mean_token_accuracy": 0.7559043988585472, "num_tokens": 5824418.0, "step": 7238 }, { "epoch": 1.917372881355932, "grad_norm": 1.7097622156143188, "learning_rate": 9.041445974576271e-06, "loss": 1.3551, "mean_token_accuracy": 0.6967418342828751, "num_tokens": 5826020.0, "step": 7240 }, { "epoch": 1.9179025423728815, "grad_norm": 1.4223495721817017, "learning_rate": 9.041181144067798e-06, "loss": 1.1998, "mean_token_accuracy": 0.7316791489720345, "num_tokens": 5827759.0, "step": 7242 }, { "epoch": 1.9184322033898304, "grad_norm": 1.4938076734542847, "learning_rate": 9.040916313559322e-06, "loss": 1.0433, "mean_token_accuracy": 0.7405308820307255, "num_tokens": 5829402.0, "step": 7244 }, { "epoch": 1.9189618644067796, "grad_norm": 2.501345634460449, "learning_rate": 9.040651483050849e-06, "loss": 1.2959, "mean_token_accuracy": 0.698896199464798, "num_tokens": 5831103.0, "step": 7246 }, { "epoch": 1.9194915254237288, "grad_norm": 1.3157764673233032, "learning_rate": 9.040386652542374e-06, "loss": 1.2824, "mean_token_accuracy": 0.6938411891460419, "num_tokens": 5832689.0, "step": 7248 }, { "epoch": 1.920021186440678, "grad_norm": 1.9058281183242798, "learning_rate": 9.040121822033899e-06, "loss": 0.9788, "step": 7250 }, { "epoch": 1.920021186440678, "eval_loss": 1.313592791557312, "eval_mean_token_accuracy": 0.7008796630354671, "eval_num_tokens": 5834068.0, "eval_runtime": 48.8679, "eval_samples_per_second": 6.303, "eval_steps_per_second": 6.303, "step": 7250 }, { "epoch": 1.9205508474576272, "grad_norm": 1.4067188501358032, "learning_rate": 9.039856991525424e-06, "loss": 1.2472, "mean_token_accuracy": 0.7482550069689751, "num_tokens": 5835775.0, "step": 7252 }, { "epoch": 1.9210805084745761, "grad_norm": 1.7460665702819824, "learning_rate": 9.03959216101695e-06, "loss": 1.2292, "mean_token_accuracy": 0.73247180134058, "num_tokens": 5837044.0, "step": 7254 }, { "epoch": 1.9216101694915255, "grad_norm": 1.4837623834609985, "learning_rate": 9.039327330508475e-06, "loss": 0.9964, "mean_token_accuracy": 0.7543095871806145, "num_tokens": 5838708.0, "step": 7256 }, { "epoch": 1.9221398305084745, "grad_norm": 1.3529973030090332, "learning_rate": 9.0390625e-06, "loss": 1.2501, "mean_token_accuracy": 0.7060338333249092, "num_tokens": 5840568.0, "step": 7258 }, { "epoch": 1.9226694915254239, "grad_norm": 1.663051724433899, "learning_rate": 9.038797669491525e-06, "loss": 1.6185, "mean_token_accuracy": 0.6633439436554909, "num_tokens": 5841939.0, "step": 7260 }, { "epoch": 1.9231991525423728, "grad_norm": 1.9273130893707275, "learning_rate": 9.038532838983052e-06, "loss": 1.3459, "mean_token_accuracy": 0.6946137547492981, "num_tokens": 5843592.0, "step": 7262 }, { "epoch": 1.923728813559322, "grad_norm": 1.4538955688476562, "learning_rate": 9.038268008474577e-06, "loss": 1.4014, "mean_token_accuracy": 0.7171857059001923, "num_tokens": 5845428.0, "step": 7264 }, { "epoch": 1.9242584745762712, "grad_norm": 1.6363120079040527, "learning_rate": 9.038003177966103e-06, "loss": 1.5391, "mean_token_accuracy": 0.6587506681680679, "num_tokens": 5846877.0, "step": 7266 }, { "epoch": 1.9247881355932204, "grad_norm": 1.4965442419052124, "learning_rate": 9.037738347457627e-06, "loss": 1.2925, "mean_token_accuracy": 0.6988686695694923, "num_tokens": 5848624.0, "step": 7268 }, { "epoch": 1.9253177966101696, "grad_norm": 1.700262188911438, "learning_rate": 9.037473516949153e-06, "loss": 1.0804, "mean_token_accuracy": 0.747570089995861, "num_tokens": 5850080.0, "step": 7270 }, { "epoch": 1.9258474576271185, "grad_norm": 1.355635643005371, "learning_rate": 9.037208686440678e-06, "loss": 1.2439, "mean_token_accuracy": 0.7200514450669289, "num_tokens": 5851558.0, "step": 7272 }, { "epoch": 1.926377118644068, "grad_norm": 1.1855504512786865, "learning_rate": 9.036943855932205e-06, "loss": 1.2996, "mean_token_accuracy": 0.7005456760525703, "num_tokens": 5853289.0, "step": 7274 }, { "epoch": 1.9269067796610169, "grad_norm": 1.512204885482788, "learning_rate": 9.03667902542373e-06, "loss": 1.179, "mean_token_accuracy": 0.7196986973285675, "num_tokens": 5854737.0, "step": 7276 }, { "epoch": 1.9274364406779663, "grad_norm": 1.3230863809585571, "learning_rate": 9.036414194915255e-06, "loss": 1.0052, "mean_token_accuracy": 0.7461301237344742, "num_tokens": 5856647.0, "step": 7278 }, { "epoch": 1.9279661016949152, "grad_norm": 1.7584081888198853, "learning_rate": 9.03614936440678e-06, "loss": 1.3595, "mean_token_accuracy": 0.6559435427188873, "num_tokens": 5858681.0, "step": 7280 }, { "epoch": 1.9284957627118644, "grad_norm": 1.6836668252944946, "learning_rate": 9.035884533898306e-06, "loss": 1.5171, "mean_token_accuracy": 0.6596908234059811, "num_tokens": 5860304.0, "step": 7282 }, { "epoch": 1.9290254237288136, "grad_norm": 1.6224219799041748, "learning_rate": 9.035619703389831e-06, "loss": 1.2661, "mean_token_accuracy": 0.7141239270567894, "num_tokens": 5861852.0, "step": 7284 }, { "epoch": 1.9295550847457628, "grad_norm": 1.4909884929656982, "learning_rate": 9.035354872881356e-06, "loss": 1.2864, "mean_token_accuracy": 0.7034536302089691, "num_tokens": 5863552.0, "step": 7286 }, { "epoch": 1.930084745762712, "grad_norm": 1.6223880052566528, "learning_rate": 9.035090042372881e-06, "loss": 1.5823, "mean_token_accuracy": 0.6203645169734955, "num_tokens": 5865331.0, "step": 7288 }, { "epoch": 1.930614406779661, "grad_norm": 1.5898557901382446, "learning_rate": 9.034825211864408e-06, "loss": 1.3149, "mean_token_accuracy": 0.6985098198056221, "num_tokens": 5866738.0, "step": 7290 }, { "epoch": 1.9311440677966103, "grad_norm": 1.72689688205719, "learning_rate": 9.034560381355934e-06, "loss": 1.3639, "mean_token_accuracy": 0.6748972162604332, "num_tokens": 5868331.0, "step": 7292 }, { "epoch": 1.9316737288135593, "grad_norm": 1.8290461301803589, "learning_rate": 9.034295550847458e-06, "loss": 1.2895, "mean_token_accuracy": 0.7219503745436668, "num_tokens": 5869757.0, "step": 7294 }, { "epoch": 1.9322033898305084, "grad_norm": 1.4360045194625854, "learning_rate": 9.034030720338984e-06, "loss": 1.1243, "mean_token_accuracy": 0.7234503105282784, "num_tokens": 5871195.0, "step": 7296 }, { "epoch": 1.9327330508474576, "grad_norm": 1.3147730827331543, "learning_rate": 9.033765889830509e-06, "loss": 1.1886, "mean_token_accuracy": 0.7176220566034317, "num_tokens": 5872813.0, "step": 7298 }, { "epoch": 1.9332627118644068, "grad_norm": 1.6515978574752808, "learning_rate": 9.033501059322036e-06, "loss": 1.6437, "mean_token_accuracy": 0.6598829180002213, "num_tokens": 5874610.0, "step": 7300 }, { "epoch": 1.933792372881356, "grad_norm": 1.471561074256897, "learning_rate": 9.03323622881356e-06, "loss": 0.9609, "mean_token_accuracy": 0.7558170929551125, "num_tokens": 5876109.0, "step": 7302 }, { "epoch": 1.934322033898305, "grad_norm": 1.4673993587493896, "learning_rate": 9.032971398305086e-06, "loss": 1.3411, "mean_token_accuracy": 0.6891463659703732, "num_tokens": 5877785.0, "step": 7304 }, { "epoch": 1.9348516949152543, "grad_norm": 1.802914023399353, "learning_rate": 9.03270656779661e-06, "loss": 1.5245, "mean_token_accuracy": 0.6532997116446495, "num_tokens": 5879415.0, "step": 7306 }, { "epoch": 1.9353813559322033, "grad_norm": 1.6645640134811401, "learning_rate": 9.032441737288137e-06, "loss": 1.2548, "mean_token_accuracy": 0.6939086318016052, "num_tokens": 5880971.0, "step": 7308 }, { "epoch": 1.9359110169491527, "grad_norm": 1.7401148080825806, "learning_rate": 9.032176906779662e-06, "loss": 1.5673, "mean_token_accuracy": 0.6245781183242798, "num_tokens": 5882525.0, "step": 7310 }, { "epoch": 1.9364406779661016, "grad_norm": 1.7974708080291748, "learning_rate": 9.031912076271187e-06, "loss": 0.9312, "mean_token_accuracy": 0.7659169882535934, "num_tokens": 5883801.0, "step": 7312 }, { "epoch": 1.9369703389830508, "grad_norm": 1.639539361000061, "learning_rate": 9.031647245762712e-06, "loss": 1.4836, "mean_token_accuracy": 0.6946133375167847, "num_tokens": 5885757.0, "step": 7314 }, { "epoch": 1.9375, "grad_norm": 2.3806703090667725, "learning_rate": 9.031382415254239e-06, "loss": 1.3131, "mean_token_accuracy": 0.6902054622769356, "num_tokens": 5887488.0, "step": 7316 }, { "epoch": 1.9380296610169492, "grad_norm": 2.0397143363952637, "learning_rate": 9.031117584745763e-06, "loss": 1.0317, "mean_token_accuracy": 0.7548928484320641, "num_tokens": 5888786.0, "step": 7318 }, { "epoch": 1.9385593220338984, "grad_norm": 1.4352084398269653, "learning_rate": 9.03085275423729e-06, "loss": 1.3631, "mean_token_accuracy": 0.7326938211917877, "num_tokens": 5890445.0, "step": 7320 }, { "epoch": 1.9390889830508473, "grad_norm": 1.478172779083252, "learning_rate": 9.030587923728813e-06, "loss": 1.1169, "mean_token_accuracy": 0.7413665503263474, "num_tokens": 5892092.0, "step": 7322 }, { "epoch": 1.9396186440677967, "grad_norm": 1.2328863143920898, "learning_rate": 9.03032309322034e-06, "loss": 0.9322, "mean_token_accuracy": 0.7749969810247421, "num_tokens": 5893556.0, "step": 7324 }, { "epoch": 1.9401483050847457, "grad_norm": 1.6998313665390015, "learning_rate": 9.030058262711865e-06, "loss": 1.113, "mean_token_accuracy": 0.7188244611024857, "num_tokens": 5895197.0, "step": 7326 }, { "epoch": 1.940677966101695, "grad_norm": 1.4787789583206177, "learning_rate": 9.029793432203391e-06, "loss": 1.011, "mean_token_accuracy": 0.7521925196051598, "num_tokens": 5896808.0, "step": 7328 }, { "epoch": 1.941207627118644, "grad_norm": 1.8635015487670898, "learning_rate": 9.029528601694916e-06, "loss": 1.2089, "mean_token_accuracy": 0.7269071042537689, "num_tokens": 5898229.0, "step": 7330 }, { "epoch": 1.9417372881355932, "grad_norm": 1.4487866163253784, "learning_rate": 9.029263771186441e-06, "loss": 1.1102, "mean_token_accuracy": 0.7297641560435295, "num_tokens": 5899708.0, "step": 7332 }, { "epoch": 1.9422669491525424, "grad_norm": 1.8141411542892456, "learning_rate": 9.028998940677966e-06, "loss": 1.4882, "mean_token_accuracy": 0.6849715188145638, "num_tokens": 5901134.0, "step": 7334 }, { "epoch": 1.9427966101694916, "grad_norm": 1.878014087677002, "learning_rate": 9.028734110169493e-06, "loss": 1.5418, "mean_token_accuracy": 0.6783953458070755, "num_tokens": 5902321.0, "step": 7336 }, { "epoch": 1.9433262711864407, "grad_norm": 1.4874736070632935, "learning_rate": 9.028469279661018e-06, "loss": 1.4873, "mean_token_accuracy": 0.6625063195824623, "num_tokens": 5903853.0, "step": 7338 }, { "epoch": 1.9438559322033897, "grad_norm": 2.1484534740448, "learning_rate": 9.028204449152543e-06, "loss": 1.1368, "mean_token_accuracy": 0.7391721904277802, "num_tokens": 5905071.0, "step": 7340 }, { "epoch": 1.944385593220339, "grad_norm": 1.69466233253479, "learning_rate": 9.027939618644068e-06, "loss": 1.5665, "mean_token_accuracy": 0.6635820902884007, "num_tokens": 5907097.0, "step": 7342 }, { "epoch": 1.944915254237288, "grad_norm": 1.345274567604065, "learning_rate": 9.027674788135594e-06, "loss": 1.0534, "mean_token_accuracy": 0.7376562654972076, "num_tokens": 5908600.0, "step": 7344 }, { "epoch": 1.9454449152542372, "grad_norm": 1.7619953155517578, "learning_rate": 9.02740995762712e-06, "loss": 1.7022, "mean_token_accuracy": 0.6556727103888988, "num_tokens": 5910126.0, "step": 7346 }, { "epoch": 1.9459745762711864, "grad_norm": 1.6408087015151978, "learning_rate": 9.027145127118644e-06, "loss": 1.4129, "mean_token_accuracy": 0.6834857240319252, "num_tokens": 5911663.0, "step": 7348 }, { "epoch": 1.9465042372881356, "grad_norm": 1.5070490837097168, "learning_rate": 9.026880296610169e-06, "loss": 1.2399, "mean_token_accuracy": 0.7128011733293533, "num_tokens": 5913392.0, "step": 7350 }, { "epoch": 1.9470338983050848, "grad_norm": 1.529338002204895, "learning_rate": 9.026615466101696e-06, "loss": 1.4048, "mean_token_accuracy": 0.6731192097067833, "num_tokens": 5915022.0, "step": 7352 }, { "epoch": 1.9475635593220337, "grad_norm": 1.8720040321350098, "learning_rate": 9.02635063559322e-06, "loss": 1.0441, "mean_token_accuracy": 0.74294313788414, "num_tokens": 5916457.0, "step": 7354 }, { "epoch": 1.9480932203389831, "grad_norm": 1.230405330657959, "learning_rate": 9.026085805084747e-06, "loss": 1.0871, "mean_token_accuracy": 0.7502029910683632, "num_tokens": 5918000.0, "step": 7356 }, { "epoch": 1.948622881355932, "grad_norm": 1.8689663410186768, "learning_rate": 9.025820974576272e-06, "loss": 1.7367, "mean_token_accuracy": 0.6164097189903259, "num_tokens": 5919323.0, "step": 7358 }, { "epoch": 1.9491525423728815, "grad_norm": 1.5270153284072876, "learning_rate": 9.025556144067797e-06, "loss": 1.0581, "mean_token_accuracy": 0.7676108628511429, "num_tokens": 5920837.0, "step": 7360 }, { "epoch": 1.9496822033898304, "grad_norm": 1.4382433891296387, "learning_rate": 9.025291313559322e-06, "loss": 1.2355, "mean_token_accuracy": 0.7337807789444923, "num_tokens": 5922239.0, "step": 7362 }, { "epoch": 1.9502118644067796, "grad_norm": 1.630661129951477, "learning_rate": 9.025026483050849e-06, "loss": 1.7197, "mean_token_accuracy": 0.6579703912138939, "num_tokens": 5923801.0, "step": 7364 }, { "epoch": 1.9507415254237288, "grad_norm": 1.4806116819381714, "learning_rate": 9.024761652542374e-06, "loss": 1.2665, "mean_token_accuracy": 0.7038647532463074, "num_tokens": 5925496.0, "step": 7366 }, { "epoch": 1.951271186440678, "grad_norm": 1.2455917596817017, "learning_rate": 9.024496822033899e-06, "loss": 1.4238, "mean_token_accuracy": 0.682115875184536, "num_tokens": 5927254.0, "step": 7368 }, { "epoch": 1.9518008474576272, "grad_norm": 1.646252155303955, "learning_rate": 9.024231991525423e-06, "loss": 1.4482, "mean_token_accuracy": 0.6547119989991188, "num_tokens": 5928981.0, "step": 7370 }, { "epoch": 1.9523305084745761, "grad_norm": 1.7008429765701294, "learning_rate": 9.02396716101695e-06, "loss": 1.3441, "mean_token_accuracy": 0.7050770781934261, "num_tokens": 5930652.0, "step": 7372 }, { "epoch": 1.9528601694915255, "grad_norm": 1.3469552993774414, "learning_rate": 9.023702330508475e-06, "loss": 1.3194, "mean_token_accuracy": 0.6697225570678711, "num_tokens": 5932366.0, "step": 7374 }, { "epoch": 1.9533898305084745, "grad_norm": 1.4056049585342407, "learning_rate": 9.0234375e-06, "loss": 1.2589, "mean_token_accuracy": 0.6960246115922928, "num_tokens": 5933886.0, "step": 7376 }, { "epoch": 1.9539194915254239, "grad_norm": 1.4715410470962524, "learning_rate": 9.023172669491527e-06, "loss": 1.3388, "mean_token_accuracy": 0.7251508831977844, "num_tokens": 5935828.0, "step": 7378 }, { "epoch": 1.9544491525423728, "grad_norm": 1.4832584857940674, "learning_rate": 9.022907838983052e-06, "loss": 1.5474, "mean_token_accuracy": 0.6541347727179527, "num_tokens": 5937491.0, "step": 7380 }, { "epoch": 1.954978813559322, "grad_norm": 1.9360796213150024, "learning_rate": 9.022643008474578e-06, "loss": 1.1468, "mean_token_accuracy": 0.7272114306688309, "num_tokens": 5939034.0, "step": 7382 }, { "epoch": 1.9555084745762712, "grad_norm": 1.451838731765747, "learning_rate": 9.022378177966103e-06, "loss": 1.3877, "mean_token_accuracy": 0.6866331771016121, "num_tokens": 5940811.0, "step": 7384 }, { "epoch": 1.9560381355932204, "grad_norm": 1.6755497455596924, "learning_rate": 9.022113347457628e-06, "loss": 1.2786, "mean_token_accuracy": 0.7031773552298546, "num_tokens": 5942363.0, "step": 7386 }, { "epoch": 1.9565677966101696, "grad_norm": 1.2672314643859863, "learning_rate": 9.021848516949153e-06, "loss": 0.8261, "mean_token_accuracy": 0.782929465174675, "num_tokens": 5943907.0, "step": 7388 }, { "epoch": 1.9570974576271185, "grad_norm": 1.4563401937484741, "learning_rate": 9.02158368644068e-06, "loss": 1.4294, "mean_token_accuracy": 0.6836043670773506, "num_tokens": 5945627.0, "step": 7390 }, { "epoch": 1.957627118644068, "grad_norm": 1.4879074096679688, "learning_rate": 9.021318855932204e-06, "loss": 1.7353, "mean_token_accuracy": 0.6328735649585724, "num_tokens": 5947467.0, "step": 7392 }, { "epoch": 1.9581567796610169, "grad_norm": 1.4089173078536987, "learning_rate": 9.02105402542373e-06, "loss": 1.0924, "mean_token_accuracy": 0.7113103792071342, "num_tokens": 5949390.0, "step": 7394 }, { "epoch": 1.9586864406779663, "grad_norm": 1.4457781314849854, "learning_rate": 9.020789194915254e-06, "loss": 1.3644, "mean_token_accuracy": 0.6860705092549324, "num_tokens": 5950961.0, "step": 7396 }, { "epoch": 1.9592161016949152, "grad_norm": 1.5396102666854858, "learning_rate": 9.020524364406781e-06, "loss": 1.4696, "mean_token_accuracy": 0.6672714203596115, "num_tokens": 5952763.0, "step": 7398 }, { "epoch": 1.9597457627118644, "grad_norm": 1.689473271369934, "learning_rate": 9.020259533898306e-06, "loss": 1.3725, "mean_token_accuracy": 0.7215461730957031, "num_tokens": 5954138.0, "step": 7400 }, { "epoch": 1.9602754237288136, "grad_norm": 1.307753324508667, "learning_rate": 9.01999470338983e-06, "loss": 1.4777, "mean_token_accuracy": 0.6775204539299011, "num_tokens": 5955976.0, "step": 7402 }, { "epoch": 1.9608050847457628, "grad_norm": 1.846623420715332, "learning_rate": 9.019729872881356e-06, "loss": 1.4931, "mean_token_accuracy": 0.684248648583889, "num_tokens": 5957209.0, "step": 7404 }, { "epoch": 1.961334745762712, "grad_norm": 1.5097819566726685, "learning_rate": 9.019465042372882e-06, "loss": 1.1135, "mean_token_accuracy": 0.7453015819191933, "num_tokens": 5958842.0, "step": 7406 }, { "epoch": 1.961864406779661, "grad_norm": 1.4193631410598755, "learning_rate": 9.019200211864407e-06, "loss": 1.2634, "mean_token_accuracy": 0.7127874121069908, "num_tokens": 5960298.0, "step": 7408 }, { "epoch": 1.9623940677966103, "grad_norm": 1.5822829008102417, "learning_rate": 9.018935381355934e-06, "loss": 1.3028, "mean_token_accuracy": 0.7130756229162216, "num_tokens": 5961851.0, "step": 7410 }, { "epoch": 1.9629237288135593, "grad_norm": 1.7678680419921875, "learning_rate": 9.018670550847459e-06, "loss": 1.3966, "mean_token_accuracy": 0.6728309765458107, "num_tokens": 5963288.0, "step": 7412 }, { "epoch": 1.9634533898305084, "grad_norm": 1.790611743927002, "learning_rate": 9.018405720338984e-06, "loss": 1.7338, "mean_token_accuracy": 0.6497522033751011, "num_tokens": 5965029.0, "step": 7414 }, { "epoch": 1.9639830508474576, "grad_norm": 1.341299057006836, "learning_rate": 9.018140889830509e-06, "loss": 1.0343, "mean_token_accuracy": 0.7524746060371399, "num_tokens": 5966821.0, "step": 7416 }, { "epoch": 1.9645127118644068, "grad_norm": 1.215232014656067, "learning_rate": 9.017876059322035e-06, "loss": 1.234, "mean_token_accuracy": 0.6964300870895386, "num_tokens": 5969086.0, "step": 7418 }, { "epoch": 1.965042372881356, "grad_norm": 1.2142024040222168, "learning_rate": 9.01761122881356e-06, "loss": 1.1865, "mean_token_accuracy": 0.7242877036333084, "num_tokens": 5970542.0, "step": 7420 }, { "epoch": 1.965572033898305, "grad_norm": 1.6487770080566406, "learning_rate": 9.017346398305085e-06, "loss": 1.3225, "mean_token_accuracy": 0.686241626739502, "num_tokens": 5972101.0, "step": 7422 }, { "epoch": 1.9661016949152543, "grad_norm": 1.4775387048721313, "learning_rate": 9.01708156779661e-06, "loss": 1.4425, "mean_token_accuracy": 0.6950395181775093, "num_tokens": 5973897.0, "step": 7424 }, { "epoch": 1.9666313559322033, "grad_norm": 1.6868247985839844, "learning_rate": 9.016816737288137e-06, "loss": 1.679, "mean_token_accuracy": 0.6558496505022049, "num_tokens": 5975305.0, "step": 7426 }, { "epoch": 1.9671610169491527, "grad_norm": 1.6387357711791992, "learning_rate": 9.016551906779662e-06, "loss": 1.3112, "mean_token_accuracy": 0.6771652474999428, "num_tokens": 5977103.0, "step": 7428 }, { "epoch": 1.9676906779661016, "grad_norm": 1.3034027814865112, "learning_rate": 9.016287076271187e-06, "loss": 1.1597, "mean_token_accuracy": 0.7391278222203255, "num_tokens": 5978717.0, "step": 7430 }, { "epoch": 1.9682203389830508, "grad_norm": 1.3573392629623413, "learning_rate": 9.016022245762712e-06, "loss": 1.054, "mean_token_accuracy": 0.7405083775520325, "num_tokens": 5980345.0, "step": 7432 }, { "epoch": 1.96875, "grad_norm": 1.7783043384552002, "learning_rate": 9.015757415254238e-06, "loss": 2.0862, "mean_token_accuracy": 0.5631822608411312, "num_tokens": 5982455.0, "step": 7434 }, { "epoch": 1.9692796610169492, "grad_norm": 1.7265613079071045, "learning_rate": 9.015492584745763e-06, "loss": 1.6232, "mean_token_accuracy": 0.6652790978550911, "num_tokens": 5983983.0, "step": 7436 }, { "epoch": 1.9698093220338984, "grad_norm": 1.6743141412734985, "learning_rate": 9.01522775423729e-06, "loss": 1.8053, "mean_token_accuracy": 0.5914388485252857, "num_tokens": 5985757.0, "step": 7438 }, { "epoch": 1.9703389830508473, "grad_norm": 1.5642800331115723, "learning_rate": 9.014962923728815e-06, "loss": 1.3876, "mean_token_accuracy": 0.6868370994925499, "num_tokens": 5987328.0, "step": 7440 }, { "epoch": 1.9708686440677967, "grad_norm": 1.5907708406448364, "learning_rate": 9.01469809322034e-06, "loss": 1.3103, "mean_token_accuracy": 0.709210142493248, "num_tokens": 5988830.0, "step": 7442 }, { "epoch": 1.9713983050847457, "grad_norm": 1.6063721179962158, "learning_rate": 9.014433262711864e-06, "loss": 1.5577, "mean_token_accuracy": 0.6676786318421364, "num_tokens": 5990677.0, "step": 7444 }, { "epoch": 1.971927966101695, "grad_norm": 1.800533413887024, "learning_rate": 9.014168432203391e-06, "loss": 1.5533, "mean_token_accuracy": 0.635550245642662, "num_tokens": 5992300.0, "step": 7446 }, { "epoch": 1.972457627118644, "grad_norm": 1.4125515222549438, "learning_rate": 9.013903601694916e-06, "loss": 1.1867, "mean_token_accuracy": 0.7201745510101318, "num_tokens": 5994058.0, "step": 7448 }, { "epoch": 1.9729872881355932, "grad_norm": 1.474695086479187, "learning_rate": 9.013638771186441e-06, "loss": 1.0973, "mean_token_accuracy": 0.7315299436450005, "num_tokens": 5995524.0, "step": 7450 }, { "epoch": 1.9735169491525424, "grad_norm": 1.4500426054000854, "learning_rate": 9.013373940677966e-06, "loss": 1.0687, "mean_token_accuracy": 0.7499907687306404, "num_tokens": 5997142.0, "step": 7452 }, { "epoch": 1.9740466101694916, "grad_norm": 1.586976170539856, "learning_rate": 9.013109110169493e-06, "loss": 1.2741, "mean_token_accuracy": 0.7131967395544052, "num_tokens": 5998816.0, "step": 7454 }, { "epoch": 1.9745762711864407, "grad_norm": 1.3919347524642944, "learning_rate": 9.012844279661017e-06, "loss": 1.1476, "mean_token_accuracy": 0.7229447513818741, "num_tokens": 6000319.0, "step": 7456 }, { "epoch": 1.9751059322033897, "grad_norm": 1.8474189043045044, "learning_rate": 9.012579449152542e-06, "loss": 1.8388, "mean_token_accuracy": 0.629966489970684, "num_tokens": 6001803.0, "step": 7458 }, { "epoch": 1.975635593220339, "grad_norm": 1.4295892715454102, "learning_rate": 9.012314618644067e-06, "loss": 1.3082, "mean_token_accuracy": 0.7194943204522133, "num_tokens": 6003493.0, "step": 7460 }, { "epoch": 1.976165254237288, "grad_norm": 1.9504365921020508, "learning_rate": 9.012049788135594e-06, "loss": 1.7075, "mean_token_accuracy": 0.6266732811927795, "num_tokens": 6005146.0, "step": 7462 }, { "epoch": 1.9766949152542372, "grad_norm": 1.5319843292236328, "learning_rate": 9.01178495762712e-06, "loss": 1.3492, "mean_token_accuracy": 0.6985380239784718, "num_tokens": 6006738.0, "step": 7464 }, { "epoch": 1.9772245762711864, "grad_norm": 1.7898509502410889, "learning_rate": 9.011520127118645e-06, "loss": 1.1025, "mean_token_accuracy": 0.759161040186882, "num_tokens": 6008063.0, "step": 7466 }, { "epoch": 1.9777542372881356, "grad_norm": 1.6145269870758057, "learning_rate": 9.01125529661017e-06, "loss": 1.8194, "mean_token_accuracy": 0.6335078738629818, "num_tokens": 6009792.0, "step": 7468 }, { "epoch": 1.9782838983050848, "grad_norm": 1.9026577472686768, "learning_rate": 9.010990466101695e-06, "loss": 0.9753, "mean_token_accuracy": 0.7606576010584831, "num_tokens": 6011357.0, "step": 7470 }, { "epoch": 1.9788135593220337, "grad_norm": 1.5689915418624878, "learning_rate": 9.010725635593222e-06, "loss": 1.5498, "mean_token_accuracy": 0.6713080331683159, "num_tokens": 6012858.0, "step": 7472 }, { "epoch": 1.9793432203389831, "grad_norm": 1.7633085250854492, "learning_rate": 9.010460805084747e-06, "loss": 1.8588, "mean_token_accuracy": 0.6058387160301208, "num_tokens": 6014387.0, "step": 7474 }, { "epoch": 1.979872881355932, "grad_norm": 1.7231189012527466, "learning_rate": 9.010195974576272e-06, "loss": 1.4388, "mean_token_accuracy": 0.6991261914372444, "num_tokens": 6015705.0, "step": 7476 }, { "epoch": 1.9804025423728815, "grad_norm": 1.717703104019165, "learning_rate": 9.009931144067797e-06, "loss": 1.2254, "mean_token_accuracy": 0.7211096584796906, "num_tokens": 6017369.0, "step": 7478 }, { "epoch": 1.9809322033898304, "grad_norm": 1.2704968452453613, "learning_rate": 9.009666313559323e-06, "loss": 1.1561, "mean_token_accuracy": 0.7299630492925644, "num_tokens": 6019460.0, "step": 7480 }, { "epoch": 1.9814618644067796, "grad_norm": 1.7640036344528198, "learning_rate": 9.009401483050848e-06, "loss": 1.0055, "mean_token_accuracy": 0.7768299654126167, "num_tokens": 6020940.0, "step": 7482 }, { "epoch": 1.9819915254237288, "grad_norm": 1.4259040355682373, "learning_rate": 9.009136652542373e-06, "loss": 1.4658, "mean_token_accuracy": 0.6772666089236736, "num_tokens": 6022847.0, "step": 7484 }, { "epoch": 1.982521186440678, "grad_norm": 1.8866132497787476, "learning_rate": 9.008871822033898e-06, "loss": 1.544, "mean_token_accuracy": 0.6523677483201027, "num_tokens": 6024341.0, "step": 7486 }, { "epoch": 1.9830508474576272, "grad_norm": 1.3022270202636719, "learning_rate": 9.008606991525425e-06, "loss": 1.3213, "mean_token_accuracy": 0.712446965277195, "num_tokens": 6026061.0, "step": 7488 }, { "epoch": 1.9835805084745761, "grad_norm": 1.6109098196029663, "learning_rate": 9.00834216101695e-06, "loss": 1.5244, "mean_token_accuracy": 0.6785361468791962, "num_tokens": 6027511.0, "step": 7490 }, { "epoch": 1.9841101694915255, "grad_norm": 1.6683650016784668, "learning_rate": 9.008077330508476e-06, "loss": 1.352, "mean_token_accuracy": 0.700630709528923, "num_tokens": 6029222.0, "step": 7492 }, { "epoch": 1.9846398305084745, "grad_norm": 1.7720081806182861, "learning_rate": 9.007812500000001e-06, "loss": 1.4734, "mean_token_accuracy": 0.6859524585306644, "num_tokens": 6030728.0, "step": 7494 }, { "epoch": 1.9851694915254239, "grad_norm": 1.783504605293274, "learning_rate": 9.007547669491526e-06, "loss": 1.8054, "mean_token_accuracy": 0.5941120833158493, "num_tokens": 6032182.0, "step": 7496 }, { "epoch": 1.9856991525423728, "grad_norm": 1.5018750429153442, "learning_rate": 9.007282838983051e-06, "loss": 1.1956, "mean_token_accuracy": 0.7413340210914612, "num_tokens": 6033695.0, "step": 7498 }, { "epoch": 1.986228813559322, "grad_norm": 1.3408966064453125, "learning_rate": 9.007018008474578e-06, "loss": 0.926, "step": 7500 }, { "epoch": 1.986228813559322, "eval_loss": 1.31161367893219, "eval_mean_token_accuracy": 0.7008301175453446, "eval_num_tokens": 6035194.0, "eval_runtime": 49.0808, "eval_samples_per_second": 6.275, "eval_steps_per_second": 6.275, "step": 7500 }, { "epoch": 1.9867584745762712, "grad_norm": 1.7514761686325073, "learning_rate": 9.006753177966103e-06, "loss": 1.1962, "mean_token_accuracy": 0.763805016875267, "num_tokens": 6036842.0, "step": 7502 }, { "epoch": 1.9872881355932204, "grad_norm": 1.3319315910339355, "learning_rate": 9.006488347457628e-06, "loss": 1.2915, "mean_token_accuracy": 0.6908207684755325, "num_tokens": 6038728.0, "step": 7504 }, { "epoch": 1.9878177966101696, "grad_norm": 1.7541953325271606, "learning_rate": 9.006223516949153e-06, "loss": 1.5123, "mean_token_accuracy": 0.6616251021623611, "num_tokens": 6040435.0, "step": 7506 }, { "epoch": 1.9883474576271185, "grad_norm": 1.7023695707321167, "learning_rate": 9.00595868644068e-06, "loss": 1.0727, "mean_token_accuracy": 0.7321592271327972, "num_tokens": 6041867.0, "step": 7508 }, { "epoch": 1.988877118644068, "grad_norm": 1.691215991973877, "learning_rate": 9.005693855932204e-06, "loss": 1.2151, "mean_token_accuracy": 0.725956991314888, "num_tokens": 6043303.0, "step": 7510 }, { "epoch": 1.9894067796610169, "grad_norm": 1.3819162845611572, "learning_rate": 9.005429025423729e-06, "loss": 1.2678, "mean_token_accuracy": 0.7152619138360023, "num_tokens": 6044903.0, "step": 7512 }, { "epoch": 1.9899364406779663, "grad_norm": 1.5880687236785889, "learning_rate": 9.005164194915254e-06, "loss": 1.1316, "mean_token_accuracy": 0.7435967028141022, "num_tokens": 6046280.0, "step": 7514 }, { "epoch": 1.9904661016949152, "grad_norm": 1.6873116493225098, "learning_rate": 9.00489936440678e-06, "loss": 1.1641, "mean_token_accuracy": 0.721088845282793, "num_tokens": 6047797.0, "step": 7516 }, { "epoch": 1.9909957627118644, "grad_norm": 1.4390126466751099, "learning_rate": 9.004634533898305e-06, "loss": 1.1311, "mean_token_accuracy": 0.7452909350395203, "num_tokens": 6049369.0, "step": 7518 }, { "epoch": 1.9915254237288136, "grad_norm": 1.291011929512024, "learning_rate": 9.004369703389832e-06, "loss": 1.3874, "mean_token_accuracy": 0.6818477511405945, "num_tokens": 6051438.0, "step": 7520 }, { "epoch": 1.9920550847457628, "grad_norm": 1.115014672279358, "learning_rate": 9.004104872881357e-06, "loss": 1.2318, "mean_token_accuracy": 0.7209687158465385, "num_tokens": 6053279.0, "step": 7522 }, { "epoch": 1.992584745762712, "grad_norm": 1.4431532621383667, "learning_rate": 9.003840042372882e-06, "loss": 1.1011, "mean_token_accuracy": 0.7172753214836121, "num_tokens": 6054791.0, "step": 7524 }, { "epoch": 1.993114406779661, "grad_norm": 1.715989351272583, "learning_rate": 9.003575211864407e-06, "loss": 1.6007, "mean_token_accuracy": 0.662197895348072, "num_tokens": 6056191.0, "step": 7526 }, { "epoch": 1.9936440677966103, "grad_norm": 1.9087495803833008, "learning_rate": 9.003310381355934e-06, "loss": 1.7046, "mean_token_accuracy": 0.6350364536046982, "num_tokens": 6057959.0, "step": 7528 }, { "epoch": 1.9941737288135593, "grad_norm": 1.6926835775375366, "learning_rate": 9.003045550847458e-06, "loss": 1.3965, "mean_token_accuracy": 0.6969967260956764, "num_tokens": 6059508.0, "step": 7530 }, { "epoch": 1.9947033898305084, "grad_norm": 1.4331958293914795, "learning_rate": 9.002780720338983e-06, "loss": 0.9451, "mean_token_accuracy": 0.7652033306658268, "num_tokens": 6061040.0, "step": 7532 }, { "epoch": 1.9952330508474576, "grad_norm": 1.257758378982544, "learning_rate": 9.002515889830508e-06, "loss": 1.0779, "mean_token_accuracy": 0.7421952709555626, "num_tokens": 6062770.0, "step": 7534 }, { "epoch": 1.9957627118644068, "grad_norm": 1.5010886192321777, "learning_rate": 9.002251059322035e-06, "loss": 1.0272, "mean_token_accuracy": 0.7592736929655075, "num_tokens": 6064309.0, "step": 7536 }, { "epoch": 1.996292372881356, "grad_norm": 1.5055794715881348, "learning_rate": 9.00198622881356e-06, "loss": 0.9178, "mean_token_accuracy": 0.7652551084756851, "num_tokens": 6065769.0, "step": 7538 }, { "epoch": 1.996822033898305, "grad_norm": 1.813673734664917, "learning_rate": 9.001721398305085e-06, "loss": 1.5618, "mean_token_accuracy": 0.6473158895969391, "num_tokens": 6067290.0, "step": 7540 }, { "epoch": 1.9973516949152543, "grad_norm": 2.002079725265503, "learning_rate": 9.00145656779661e-06, "loss": 1.4519, "mean_token_accuracy": 0.6756400167942047, "num_tokens": 6068581.0, "step": 7542 }, { "epoch": 1.9978813559322033, "grad_norm": 1.4935098886489868, "learning_rate": 9.001191737288136e-06, "loss": 1.3086, "mean_token_accuracy": 0.7189033105969429, "num_tokens": 6070091.0, "step": 7544 }, { "epoch": 1.9984110169491527, "grad_norm": 1.6419693231582642, "learning_rate": 9.000926906779663e-06, "loss": 1.3149, "mean_token_accuracy": 0.7094447538256645, "num_tokens": 6071554.0, "step": 7546 }, { "epoch": 1.9989406779661016, "grad_norm": 1.5964072942733765, "learning_rate": 9.000662076271188e-06, "loss": 1.4035, "mean_token_accuracy": 0.6830266490578651, "num_tokens": 6073266.0, "step": 7548 }, { "epoch": 1.9994703389830508, "grad_norm": 1.562181830406189, "learning_rate": 9.000397245762713e-06, "loss": 1.301, "mean_token_accuracy": 0.6850918605923653, "num_tokens": 6074899.0, "step": 7550 }, { "epoch": 2.0, "grad_norm": 1.5117093324661255, "learning_rate": 9.000132415254238e-06, "loss": 1.3418, "mean_token_accuracy": 0.6784619614481926, "num_tokens": 6076536.0, "step": 7552 }, { "epoch": 2.000529661016949, "grad_norm": 1.3124065399169922, "learning_rate": 8.999867584745764e-06, "loss": 1.1097, "mean_token_accuracy": 0.7201583534479141, "num_tokens": 6078737.0, "step": 7554 }, { "epoch": 2.0010593220338984, "grad_norm": 1.5575098991394043, "learning_rate": 8.99960275423729e-06, "loss": 1.2751, "mean_token_accuracy": 0.7173792272806168, "num_tokens": 6080203.0, "step": 7556 }, { "epoch": 2.0015889830508473, "grad_norm": 1.737756371498108, "learning_rate": 8.999337923728814e-06, "loss": 1.4134, "mean_token_accuracy": 0.6761658936738968, "num_tokens": 6081642.0, "step": 7558 }, { "epoch": 2.0021186440677967, "grad_norm": 1.658235788345337, "learning_rate": 8.99907309322034e-06, "loss": 1.6401, "mean_token_accuracy": 0.645596019923687, "num_tokens": 6083180.0, "step": 7560 }, { "epoch": 2.0026483050847457, "grad_norm": 1.307611346244812, "learning_rate": 8.998808262711866e-06, "loss": 1.1098, "mean_token_accuracy": 0.7387803271412849, "num_tokens": 6084869.0, "step": 7562 }, { "epoch": 2.003177966101695, "grad_norm": 1.4079439640045166, "learning_rate": 8.99854343220339e-06, "loss": 1.0994, "mean_token_accuracy": 0.7332394272089005, "num_tokens": 6086325.0, "step": 7564 }, { "epoch": 2.003707627118644, "grad_norm": 1.959202766418457, "learning_rate": 8.998278601694916e-06, "loss": 1.7275, "mean_token_accuracy": 0.6371149197220802, "num_tokens": 6087787.0, "step": 7566 }, { "epoch": 2.0042372881355934, "grad_norm": 1.2957221269607544, "learning_rate": 8.99801377118644e-06, "loss": 1.2342, "mean_token_accuracy": 0.7249212488532066, "num_tokens": 6089479.0, "step": 7568 }, { "epoch": 2.0047669491525424, "grad_norm": 1.4653187990188599, "learning_rate": 8.997748940677967e-06, "loss": 1.3061, "mean_token_accuracy": 0.6822238266468048, "num_tokens": 6091145.0, "step": 7570 }, { "epoch": 2.0052966101694913, "grad_norm": 1.4083853960037231, "learning_rate": 8.997484110169492e-06, "loss": 1.0237, "mean_token_accuracy": 0.7449470236897469, "num_tokens": 6092645.0, "step": 7572 }, { "epoch": 2.0058262711864407, "grad_norm": 1.8688632249832153, "learning_rate": 8.997219279661019e-06, "loss": 1.2481, "mean_token_accuracy": 0.7053359337151051, "num_tokens": 6094116.0, "step": 7574 }, { "epoch": 2.0063559322033897, "grad_norm": 1.6266995668411255, "learning_rate": 8.996954449152544e-06, "loss": 1.6411, "mean_token_accuracy": 0.6474232375621796, "num_tokens": 6095635.0, "step": 7576 }, { "epoch": 2.006885593220339, "grad_norm": 1.8755578994750977, "learning_rate": 8.996689618644069e-06, "loss": 1.9105, "mean_token_accuracy": 0.5942000895738602, "num_tokens": 6097211.0, "step": 7578 }, { "epoch": 2.007415254237288, "grad_norm": 1.3956000804901123, "learning_rate": 8.996424788135594e-06, "loss": 1.2433, "mean_token_accuracy": 0.7291734740138054, "num_tokens": 6098874.0, "step": 7580 }, { "epoch": 2.0079449152542375, "grad_norm": 1.397831678390503, "learning_rate": 8.99615995762712e-06, "loss": 1.2762, "mean_token_accuracy": 0.7056702747941017, "num_tokens": 6101280.0, "step": 7582 }, { "epoch": 2.0084745762711864, "grad_norm": 1.6945953369140625, "learning_rate": 8.995895127118645e-06, "loss": 1.358, "mean_token_accuracy": 0.707465834915638, "num_tokens": 6102741.0, "step": 7584 }, { "epoch": 2.0090042372881354, "grad_norm": 1.5728462934494019, "learning_rate": 8.99563029661017e-06, "loss": 1.5115, "mean_token_accuracy": 0.6609991267323494, "num_tokens": 6104807.0, "step": 7586 }, { "epoch": 2.0095338983050848, "grad_norm": 1.7200840711593628, "learning_rate": 8.995365466101695e-06, "loss": 1.3472, "mean_token_accuracy": 0.6875007674098015, "num_tokens": 6106334.0, "step": 7588 }, { "epoch": 2.0100635593220337, "grad_norm": 1.941624402999878, "learning_rate": 8.995100635593222e-06, "loss": 1.4769, "mean_token_accuracy": 0.6587407067418098, "num_tokens": 6107933.0, "step": 7590 }, { "epoch": 2.010593220338983, "grad_norm": 1.8108315467834473, "learning_rate": 8.994835805084747e-06, "loss": 1.1277, "mean_token_accuracy": 0.7277543917298317, "num_tokens": 6109280.0, "step": 7592 }, { "epoch": 2.011122881355932, "grad_norm": 1.6968674659729004, "learning_rate": 8.994570974576271e-06, "loss": 1.6005, "mean_token_accuracy": 0.6755397841334343, "num_tokens": 6110842.0, "step": 7594 }, { "epoch": 2.0116525423728815, "grad_norm": 1.552003026008606, "learning_rate": 8.994306144067796e-06, "loss": 1.4177, "mean_token_accuracy": 0.6902074068784714, "num_tokens": 6112531.0, "step": 7596 }, { "epoch": 2.0121822033898304, "grad_norm": 1.3423833847045898, "learning_rate": 8.994041313559323e-06, "loss": 1.0459, "mean_token_accuracy": 0.7456000372767448, "num_tokens": 6114006.0, "step": 7598 }, { "epoch": 2.01271186440678, "grad_norm": 2.0837931632995605, "learning_rate": 8.993776483050848e-06, "loss": 1.144, "mean_token_accuracy": 0.7169513553380966, "num_tokens": 6115591.0, "step": 7600 }, { "epoch": 2.013241525423729, "grad_norm": 1.5366755723953247, "learning_rate": 8.993511652542375e-06, "loss": 1.2681, "mean_token_accuracy": 0.70838812738657, "num_tokens": 6116893.0, "step": 7602 }, { "epoch": 2.0137711864406778, "grad_norm": 1.7716683149337769, "learning_rate": 8.9932468220339e-06, "loss": 1.8034, "mean_token_accuracy": 0.6242638379335403, "num_tokens": 6118247.0, "step": 7604 }, { "epoch": 2.014300847457627, "grad_norm": 1.6276769638061523, "learning_rate": 8.992981991525424e-06, "loss": 1.2314, "mean_token_accuracy": 0.7206421867012978, "num_tokens": 6119761.0, "step": 7606 }, { "epoch": 2.014830508474576, "grad_norm": 1.8538167476654053, "learning_rate": 8.99271716101695e-06, "loss": 1.287, "mean_token_accuracy": 0.6900466307997704, "num_tokens": 6121262.0, "step": 7608 }, { "epoch": 2.0153601694915255, "grad_norm": 1.7781401872634888, "learning_rate": 8.992452330508476e-06, "loss": 1.194, "mean_token_accuracy": 0.7179964408278465, "num_tokens": 6122766.0, "step": 7610 }, { "epoch": 2.0158898305084745, "grad_norm": 1.434420108795166, "learning_rate": 8.992187500000001e-06, "loss": 1.5771, "mean_token_accuracy": 0.6418590471148491, "num_tokens": 6124680.0, "step": 7612 }, { "epoch": 2.016419491525424, "grad_norm": 1.9625650644302368, "learning_rate": 8.991922669491526e-06, "loss": 1.6551, "mean_token_accuracy": 0.6514497175812721, "num_tokens": 6126391.0, "step": 7614 }, { "epoch": 2.016949152542373, "grad_norm": 1.449546456336975, "learning_rate": 8.99165783898305e-06, "loss": 1.1278, "mean_token_accuracy": 0.7411185167729855, "num_tokens": 6127874.0, "step": 7616 }, { "epoch": 2.0174788135593222, "grad_norm": 1.6245877742767334, "learning_rate": 8.991393008474577e-06, "loss": 1.2165, "mean_token_accuracy": 0.7217575758695602, "num_tokens": 6129558.0, "step": 7618 }, { "epoch": 2.018008474576271, "grad_norm": 2.2847414016723633, "learning_rate": 8.991128177966102e-06, "loss": 1.7223, "mean_token_accuracy": 0.6301713064312935, "num_tokens": 6131109.0, "step": 7620 }, { "epoch": 2.01853813559322, "grad_norm": 1.1943148374557495, "learning_rate": 8.990863347457627e-06, "loss": 1.1244, "mean_token_accuracy": 0.7147857025265694, "num_tokens": 6132850.0, "step": 7622 }, { "epoch": 2.0190677966101696, "grad_norm": 1.606849193572998, "learning_rate": 8.990598516949152e-06, "loss": 1.6407, "mean_token_accuracy": 0.6423106715083122, "num_tokens": 6134447.0, "step": 7624 }, { "epoch": 2.0195974576271185, "grad_norm": 1.6520798206329346, "learning_rate": 8.990333686440679e-06, "loss": 0.9172, "mean_token_accuracy": 0.7777545675635338, "num_tokens": 6135871.0, "step": 7626 }, { "epoch": 2.020127118644068, "grad_norm": 1.3364065885543823, "learning_rate": 8.990068855932204e-06, "loss": 1.071, "mean_token_accuracy": 0.755392000079155, "num_tokens": 6137628.0, "step": 7628 }, { "epoch": 2.020656779661017, "grad_norm": 1.8226935863494873, "learning_rate": 8.98980402542373e-06, "loss": 1.682, "mean_token_accuracy": 0.6229934990406036, "num_tokens": 6139249.0, "step": 7630 }, { "epoch": 2.0211864406779663, "grad_norm": 1.4577269554138184, "learning_rate": 8.989539194915255e-06, "loss": 1.3518, "mean_token_accuracy": 0.7125702053308487, "num_tokens": 6140805.0, "step": 7632 }, { "epoch": 2.0217161016949152, "grad_norm": 1.436293125152588, "learning_rate": 8.98927436440678e-06, "loss": 1.4245, "mean_token_accuracy": 0.6459128707647324, "num_tokens": 6142740.0, "step": 7634 }, { "epoch": 2.0222457627118646, "grad_norm": 1.7955926656723022, "learning_rate": 8.989009533898307e-06, "loss": 1.5504, "mean_token_accuracy": 0.6655549854040146, "num_tokens": 6144290.0, "step": 7636 }, { "epoch": 2.0227754237288136, "grad_norm": 1.6787453889846802, "learning_rate": 8.988744703389832e-06, "loss": 1.7092, "mean_token_accuracy": 0.6319114416837692, "num_tokens": 6146090.0, "step": 7638 }, { "epoch": 2.0233050847457625, "grad_norm": 1.442428708076477, "learning_rate": 8.988479872881357e-06, "loss": 1.3359, "mean_token_accuracy": 0.699370089918375, "num_tokens": 6147863.0, "step": 7640 }, { "epoch": 2.023834745762712, "grad_norm": 1.5484497547149658, "learning_rate": 8.988215042372882e-06, "loss": 1.2279, "mean_token_accuracy": 0.7001619935035706, "num_tokens": 6149690.0, "step": 7642 }, { "epoch": 2.024364406779661, "grad_norm": 2.0783369541168213, "learning_rate": 8.987950211864408e-06, "loss": 1.6526, "mean_token_accuracy": 0.6329706534743309, "num_tokens": 6151461.0, "step": 7644 }, { "epoch": 2.0248940677966103, "grad_norm": 1.2722407579421997, "learning_rate": 8.987685381355933e-06, "loss": 1.0698, "mean_token_accuracy": 0.7758025527000427, "num_tokens": 6152843.0, "step": 7646 }, { "epoch": 2.0254237288135593, "grad_norm": 1.7179338932037354, "learning_rate": 8.987420550847458e-06, "loss": 1.0634, "mean_token_accuracy": 0.7321410030126572, "num_tokens": 6154381.0, "step": 7648 }, { "epoch": 2.0259533898305087, "grad_norm": 2.2139434814453125, "learning_rate": 8.987155720338983e-06, "loss": 0.9796, "mean_token_accuracy": 0.7428217753767967, "num_tokens": 6155828.0, "step": 7650 }, { "epoch": 2.0264830508474576, "grad_norm": 1.7024257183074951, "learning_rate": 8.98689088983051e-06, "loss": 1.3142, "mean_token_accuracy": 0.6920883059501648, "num_tokens": 6157196.0, "step": 7652 }, { "epoch": 2.0270127118644066, "grad_norm": 1.6262435913085938, "learning_rate": 8.986626059322035e-06, "loss": 1.4193, "mean_token_accuracy": 0.6822097450494766, "num_tokens": 6159071.0, "step": 7654 }, { "epoch": 2.027542372881356, "grad_norm": 1.8052101135253906, "learning_rate": 8.986361228813561e-06, "loss": 1.2245, "mean_token_accuracy": 0.7050699144601822, "num_tokens": 6161271.0, "step": 7656 }, { "epoch": 2.028072033898305, "grad_norm": 1.981044888496399, "learning_rate": 8.986096398305086e-06, "loss": 1.5786, "mean_token_accuracy": 0.6588246151804924, "num_tokens": 6162689.0, "step": 7658 }, { "epoch": 2.0286016949152543, "grad_norm": 2.08206844329834, "learning_rate": 8.985831567796611e-06, "loss": 1.5765, "mean_token_accuracy": 0.6622669249773026, "num_tokens": 6164202.0, "step": 7660 }, { "epoch": 2.0291313559322033, "grad_norm": 1.6873759031295776, "learning_rate": 8.985566737288136e-06, "loss": 1.526, "mean_token_accuracy": 0.6321828439831734, "num_tokens": 6165714.0, "step": 7662 }, { "epoch": 2.0296610169491527, "grad_norm": 1.6483484506607056, "learning_rate": 8.985301906779663e-06, "loss": 1.2663, "mean_token_accuracy": 0.698660247027874, "num_tokens": 6167532.0, "step": 7664 }, { "epoch": 2.0301906779661016, "grad_norm": 1.5552434921264648, "learning_rate": 8.985037076271188e-06, "loss": 1.4298, "mean_token_accuracy": 0.6801819056272507, "num_tokens": 6169304.0, "step": 7666 }, { "epoch": 2.030720338983051, "grad_norm": 1.7772127389907837, "learning_rate": 8.984772245762712e-06, "loss": 1.2247, "mean_token_accuracy": 0.7227632775902748, "num_tokens": 6170915.0, "step": 7668 }, { "epoch": 2.03125, "grad_norm": 1.4966018199920654, "learning_rate": 8.984507415254237e-06, "loss": 0.9664, "mean_token_accuracy": 0.7779639959335327, "num_tokens": 6172420.0, "step": 7670 }, { "epoch": 2.031779661016949, "grad_norm": 1.6116706132888794, "learning_rate": 8.984242584745764e-06, "loss": 1.2447, "mean_token_accuracy": 0.6867119893431664, "num_tokens": 6173996.0, "step": 7672 }, { "epoch": 2.0323093220338984, "grad_norm": 1.6816394329071045, "learning_rate": 8.983977754237289e-06, "loss": 1.3614, "mean_token_accuracy": 0.712483361363411, "num_tokens": 6175568.0, "step": 7674 }, { "epoch": 2.0328389830508473, "grad_norm": 1.7825894355773926, "learning_rate": 8.983712923728814e-06, "loss": 1.7006, "mean_token_accuracy": 0.619577769190073, "num_tokens": 6177248.0, "step": 7676 }, { "epoch": 2.0333686440677967, "grad_norm": 1.9745779037475586, "learning_rate": 8.983448093220339e-06, "loss": 1.4457, "mean_token_accuracy": 0.686602771282196, "num_tokens": 6178768.0, "step": 7678 }, { "epoch": 2.0338983050847457, "grad_norm": 1.3618512153625488, "learning_rate": 8.983183262711865e-06, "loss": 0.9686, "mean_token_accuracy": 0.7742505222558975, "num_tokens": 6180342.0, "step": 7680 }, { "epoch": 2.034427966101695, "grad_norm": 1.8203656673431396, "learning_rate": 8.98291843220339e-06, "loss": 1.3481, "mean_token_accuracy": 0.6991022005677223, "num_tokens": 6181779.0, "step": 7682 }, { "epoch": 2.034957627118644, "grad_norm": 1.4418162107467651, "learning_rate": 8.982653601694917e-06, "loss": 1.2472, "mean_token_accuracy": 0.7383312284946442, "num_tokens": 6183270.0, "step": 7684 }, { "epoch": 2.0354872881355934, "grad_norm": 1.5081815719604492, "learning_rate": 8.98238877118644e-06, "loss": 1.1, "mean_token_accuracy": 0.7345011532306671, "num_tokens": 6184845.0, "step": 7686 }, { "epoch": 2.0360169491525424, "grad_norm": 1.6634708642959595, "learning_rate": 8.982123940677967e-06, "loss": 1.2629, "mean_token_accuracy": 0.7294004745781422, "num_tokens": 6186317.0, "step": 7688 }, { "epoch": 2.0365466101694913, "grad_norm": 1.5883151292800903, "learning_rate": 8.981859110169492e-06, "loss": 1.0431, "mean_token_accuracy": 0.7367925718426704, "num_tokens": 6187782.0, "step": 7690 }, { "epoch": 2.0370762711864407, "grad_norm": 1.840118169784546, "learning_rate": 8.981594279661018e-06, "loss": 1.6356, "mean_token_accuracy": 0.6162141636013985, "num_tokens": 6189181.0, "step": 7692 }, { "epoch": 2.0376059322033897, "grad_norm": 1.9600884914398193, "learning_rate": 8.981329449152543e-06, "loss": 1.4269, "mean_token_accuracy": 0.7090040370821953, "num_tokens": 6190579.0, "step": 7694 }, { "epoch": 2.038135593220339, "grad_norm": 1.5145869255065918, "learning_rate": 8.981064618644068e-06, "loss": 0.9426, "mean_token_accuracy": 0.7683237344026566, "num_tokens": 6192256.0, "step": 7696 }, { "epoch": 2.038665254237288, "grad_norm": 1.8231418132781982, "learning_rate": 8.980799788135593e-06, "loss": 1.3315, "mean_token_accuracy": 0.6952713578939438, "num_tokens": 6193804.0, "step": 7698 }, { "epoch": 2.0391949152542375, "grad_norm": 1.2575569152832031, "learning_rate": 8.98053495762712e-06, "loss": 1.172, "mean_token_accuracy": 0.7389083951711655, "num_tokens": 6195370.0, "step": 7700 }, { "epoch": 2.0397245762711864, "grad_norm": 1.6372342109680176, "learning_rate": 8.980270127118645e-06, "loss": 0.8257, "mean_token_accuracy": 0.7863701432943344, "num_tokens": 6196840.0, "step": 7702 }, { "epoch": 2.0402542372881354, "grad_norm": 1.6030484437942505, "learning_rate": 8.98000529661017e-06, "loss": 1.6881, "mean_token_accuracy": 0.6243118569254875, "num_tokens": 6198670.0, "step": 7704 }, { "epoch": 2.0407838983050848, "grad_norm": 1.9698373079299927, "learning_rate": 8.979740466101695e-06, "loss": 1.3812, "mean_token_accuracy": 0.6894251219928265, "num_tokens": 6200474.0, "step": 7706 }, { "epoch": 2.0413135593220337, "grad_norm": 1.7422864437103271, "learning_rate": 8.979475635593221e-06, "loss": 1.5234, "mean_token_accuracy": 0.6411044597625732, "num_tokens": 6202168.0, "step": 7708 }, { "epoch": 2.041843220338983, "grad_norm": 1.6869293451309204, "learning_rate": 8.979210805084746e-06, "loss": 1.3334, "mean_token_accuracy": 0.716175876557827, "num_tokens": 6203644.0, "step": 7710 }, { "epoch": 2.042372881355932, "grad_norm": 1.0515356063842773, "learning_rate": 8.978945974576273e-06, "loss": 0.9794, "mean_token_accuracy": 0.7687959000468254, "num_tokens": 6205298.0, "step": 7712 }, { "epoch": 2.0429025423728815, "grad_norm": 1.6653457880020142, "learning_rate": 8.978681144067798e-06, "loss": 1.4454, "mean_token_accuracy": 0.6842171810567379, "num_tokens": 6206962.0, "step": 7714 }, { "epoch": 2.0434322033898304, "grad_norm": 2.095733404159546, "learning_rate": 8.978416313559323e-06, "loss": 1.3655, "mean_token_accuracy": 0.7202947065234184, "num_tokens": 6208395.0, "step": 7716 }, { "epoch": 2.04396186440678, "grad_norm": 1.4829565286636353, "learning_rate": 8.97815148305085e-06, "loss": 0.7999, "mean_token_accuracy": 0.7883100658655167, "num_tokens": 6209855.0, "step": 7718 }, { "epoch": 2.044491525423729, "grad_norm": 1.482158899307251, "learning_rate": 8.977886652542374e-06, "loss": 1.6647, "mean_token_accuracy": 0.654649805277586, "num_tokens": 6211498.0, "step": 7720 }, { "epoch": 2.0450211864406778, "grad_norm": 1.725423812866211, "learning_rate": 8.977621822033899e-06, "loss": 1.3601, "mean_token_accuracy": 0.6922180503606796, "num_tokens": 6213043.0, "step": 7722 }, { "epoch": 2.045550847457627, "grad_norm": 1.777004599571228, "learning_rate": 8.977356991525424e-06, "loss": 1.3236, "mean_token_accuracy": 0.7040783911943436, "num_tokens": 6214749.0, "step": 7724 }, { "epoch": 2.046080508474576, "grad_norm": 1.3045856952667236, "learning_rate": 8.97709216101695e-06, "loss": 1.1981, "mean_token_accuracy": 0.6984702199697495, "num_tokens": 6216530.0, "step": 7726 }, { "epoch": 2.0466101694915255, "grad_norm": 1.4708247184753418, "learning_rate": 8.976827330508476e-06, "loss": 1.1742, "mean_token_accuracy": 0.7055889144539833, "num_tokens": 6218102.0, "step": 7728 }, { "epoch": 2.0471398305084745, "grad_norm": 1.452476978302002, "learning_rate": 8.9765625e-06, "loss": 1.1104, "mean_token_accuracy": 0.7138402685523033, "num_tokens": 6219710.0, "step": 7730 }, { "epoch": 2.047669491525424, "grad_norm": 1.5641640424728394, "learning_rate": 8.976297669491525e-06, "loss": 1.0962, "mean_token_accuracy": 0.7325372993946075, "num_tokens": 6221294.0, "step": 7732 }, { "epoch": 2.048199152542373, "grad_norm": 1.918121337890625, "learning_rate": 8.976032838983052e-06, "loss": 1.3483, "mean_token_accuracy": 0.7044383734464645, "num_tokens": 6222876.0, "step": 7734 }, { "epoch": 2.0487288135593222, "grad_norm": 1.7608933448791504, "learning_rate": 8.975768008474577e-06, "loss": 1.483, "mean_token_accuracy": 0.6671528741717339, "num_tokens": 6224418.0, "step": 7736 }, { "epoch": 2.049258474576271, "grad_norm": 1.1152745485305786, "learning_rate": 8.975503177966104e-06, "loss": 1.0988, "mean_token_accuracy": 0.7442381829023361, "num_tokens": 6226852.0, "step": 7738 }, { "epoch": 2.04978813559322, "grad_norm": 2.0761067867279053, "learning_rate": 8.975238347457627e-06, "loss": 1.5775, "mean_token_accuracy": 0.6675691977143288, "num_tokens": 6228560.0, "step": 7740 }, { "epoch": 2.0503177966101696, "grad_norm": 1.928546667098999, "learning_rate": 8.974973516949153e-06, "loss": 1.3963, "mean_token_accuracy": 0.6748607605695724, "num_tokens": 6229973.0, "step": 7742 }, { "epoch": 2.0508474576271185, "grad_norm": 1.4856832027435303, "learning_rate": 8.974708686440678e-06, "loss": 1.7175, "mean_token_accuracy": 0.656060554087162, "num_tokens": 6231870.0, "step": 7744 }, { "epoch": 2.051377118644068, "grad_norm": 1.5933659076690674, "learning_rate": 8.974443855932205e-06, "loss": 1.4286, "mean_token_accuracy": 0.6932349130511284, "num_tokens": 6233390.0, "step": 7746 }, { "epoch": 2.051906779661017, "grad_norm": 1.730203628540039, "learning_rate": 8.97417902542373e-06, "loss": 1.2394, "mean_token_accuracy": 0.6888428777456284, "num_tokens": 6235152.0, "step": 7748 }, { "epoch": 2.0524364406779663, "grad_norm": 1.6986839771270752, "learning_rate": 8.973914194915255e-06, "loss": 1.2048, "step": 7750 }, { "epoch": 2.0524364406779663, "eval_loss": 1.313528060913086, "eval_mean_token_accuracy": 0.7005020560962814, "eval_num_tokens": 6236704.0, "eval_runtime": 49.0016, "eval_samples_per_second": 6.286, "eval_steps_per_second": 6.286, "step": 7750 }, { "epoch": 2.0529661016949152, "grad_norm": 1.3664629459381104, "learning_rate": 8.97364936440678e-06, "loss": 1.0842, "mean_token_accuracy": 0.7425757087767124, "num_tokens": 6238426.0, "step": 7752 }, { "epoch": 2.0534957627118646, "grad_norm": 1.8650615215301514, "learning_rate": 8.973384533898306e-06, "loss": 1.5737, "mean_token_accuracy": 0.6751150190830231, "num_tokens": 6239848.0, "step": 7754 }, { "epoch": 2.0540254237288136, "grad_norm": 1.2093619108200073, "learning_rate": 8.973119703389831e-06, "loss": 1.5243, "mean_token_accuracy": 0.6636245511472225, "num_tokens": 6241866.0, "step": 7756 }, { "epoch": 2.0545550847457625, "grad_norm": 1.8724685907363892, "learning_rate": 8.972854872881356e-06, "loss": 1.5337, "mean_token_accuracy": 0.6749710962176323, "num_tokens": 6243204.0, "step": 7758 }, { "epoch": 2.055084745762712, "grad_norm": 2.191537380218506, "learning_rate": 8.972590042372881e-06, "loss": 1.476, "mean_token_accuracy": 0.6989373937249184, "num_tokens": 6244423.0, "step": 7760 }, { "epoch": 2.055614406779661, "grad_norm": 1.674910306930542, "learning_rate": 8.972325211864408e-06, "loss": 1.2705, "mean_token_accuracy": 0.7146656066179276, "num_tokens": 6246260.0, "step": 7762 }, { "epoch": 2.0561440677966103, "grad_norm": 1.8543133735656738, "learning_rate": 8.972060381355933e-06, "loss": 1.414, "mean_token_accuracy": 0.684586338698864, "num_tokens": 6247965.0, "step": 7764 }, { "epoch": 2.0566737288135593, "grad_norm": 1.6153132915496826, "learning_rate": 8.97179555084746e-06, "loss": 1.3292, "mean_token_accuracy": 0.6988878548145294, "num_tokens": 6249543.0, "step": 7766 }, { "epoch": 2.0572033898305087, "grad_norm": 1.9418903589248657, "learning_rate": 8.971530720338983e-06, "loss": 1.6475, "mean_token_accuracy": 0.6892391443252563, "num_tokens": 6251046.0, "step": 7768 }, { "epoch": 2.0577330508474576, "grad_norm": 1.9352529048919678, "learning_rate": 8.97126588983051e-06, "loss": 1.1552, "mean_token_accuracy": 0.7238982915878296, "num_tokens": 6252273.0, "step": 7770 }, { "epoch": 2.0582627118644066, "grad_norm": 1.561509370803833, "learning_rate": 8.971001059322034e-06, "loss": 1.6506, "mean_token_accuracy": 0.6913379430770874, "num_tokens": 6253865.0, "step": 7772 }, { "epoch": 2.058792372881356, "grad_norm": 1.7059603929519653, "learning_rate": 8.97073622881356e-06, "loss": 1.5164, "mean_token_accuracy": 0.6443094611167908, "num_tokens": 6255621.0, "step": 7774 }, { "epoch": 2.059322033898305, "grad_norm": 1.6878371238708496, "learning_rate": 8.970471398305086e-06, "loss": 1.4587, "mean_token_accuracy": 0.6769910156726837, "num_tokens": 6257236.0, "step": 7776 }, { "epoch": 2.0598516949152543, "grad_norm": 2.1008667945861816, "learning_rate": 8.97020656779661e-06, "loss": 1.404, "mean_token_accuracy": 0.68265251070261, "num_tokens": 6258464.0, "step": 7778 }, { "epoch": 2.0603813559322033, "grad_norm": 1.239655613899231, "learning_rate": 8.969941737288136e-06, "loss": 1.0634, "mean_token_accuracy": 0.7229494601488113, "num_tokens": 6260483.0, "step": 7780 }, { "epoch": 2.0609110169491527, "grad_norm": 1.303213357925415, "learning_rate": 8.969676906779662e-06, "loss": 1.0854, "mean_token_accuracy": 0.7511736080050468, "num_tokens": 6262217.0, "step": 7782 }, { "epoch": 2.0614406779661016, "grad_norm": 1.8656492233276367, "learning_rate": 8.969412076271187e-06, "loss": 1.8755, "mean_token_accuracy": 0.6058767959475517, "num_tokens": 6263841.0, "step": 7784 }, { "epoch": 2.061970338983051, "grad_norm": 1.9100922346115112, "learning_rate": 8.969147245762712e-06, "loss": 1.7921, "mean_token_accuracy": 0.6238126792013645, "num_tokens": 6265362.0, "step": 7786 }, { "epoch": 2.0625, "grad_norm": 1.95054030418396, "learning_rate": 8.968882415254237e-06, "loss": 1.5685, "mean_token_accuracy": 0.6568994745612144, "num_tokens": 6266866.0, "step": 7788 }, { "epoch": 2.063029661016949, "grad_norm": 1.4181365966796875, "learning_rate": 8.968617584745764e-06, "loss": 1.231, "mean_token_accuracy": 0.7268587574362755, "num_tokens": 6268545.0, "step": 7790 }, { "epoch": 2.0635593220338984, "grad_norm": 1.3378722667694092, "learning_rate": 8.968352754237289e-06, "loss": 1.0676, "mean_token_accuracy": 0.7507472410798073, "num_tokens": 6270074.0, "step": 7792 }, { "epoch": 2.0640889830508473, "grad_norm": 1.4892555475234985, "learning_rate": 8.968087923728813e-06, "loss": 1.323, "mean_token_accuracy": 0.6951103657484055, "num_tokens": 6271577.0, "step": 7794 }, { "epoch": 2.0646186440677967, "grad_norm": 1.284510612487793, "learning_rate": 8.967823093220338e-06, "loss": 1.3665, "mean_token_accuracy": 0.7342265881597996, "num_tokens": 6273273.0, "step": 7796 }, { "epoch": 2.0651483050847457, "grad_norm": 1.6809980869293213, "learning_rate": 8.967558262711865e-06, "loss": 1.3851, "mean_token_accuracy": 0.6917401030659676, "num_tokens": 6275121.0, "step": 7798 }, { "epoch": 2.065677966101695, "grad_norm": 1.4322377443313599, "learning_rate": 8.967293432203392e-06, "loss": 1.4362, "mean_token_accuracy": 0.6577980220317841, "num_tokens": 6276643.0, "step": 7800 }, { "epoch": 2.066207627118644, "grad_norm": 1.976496934890747, "learning_rate": 8.967028601694917e-06, "loss": 1.6125, "mean_token_accuracy": 0.65644820779562, "num_tokens": 6278130.0, "step": 7802 }, { "epoch": 2.0667372881355934, "grad_norm": 1.7346572875976562, "learning_rate": 8.966763771186442e-06, "loss": 1.602, "mean_token_accuracy": 0.6386920921504498, "num_tokens": 6280100.0, "step": 7804 }, { "epoch": 2.0672669491525424, "grad_norm": 1.878752589225769, "learning_rate": 8.966498940677966e-06, "loss": 1.251, "mean_token_accuracy": 0.7103314101696014, "num_tokens": 6281503.0, "step": 7806 }, { "epoch": 2.0677966101694913, "grad_norm": 1.626923680305481, "learning_rate": 8.966234110169493e-06, "loss": 1.16, "mean_token_accuracy": 0.7272659838199615, "num_tokens": 6282834.0, "step": 7808 }, { "epoch": 2.0683262711864407, "grad_norm": 1.68381929397583, "learning_rate": 8.965969279661018e-06, "loss": 1.0269, "mean_token_accuracy": 0.7547668516635895, "num_tokens": 6284446.0, "step": 7810 }, { "epoch": 2.0688559322033897, "grad_norm": 1.6343839168548584, "learning_rate": 8.965704449152543e-06, "loss": 1.3049, "mean_token_accuracy": 0.706984356045723, "num_tokens": 6285864.0, "step": 7812 }, { "epoch": 2.069385593220339, "grad_norm": 1.5097520351409912, "learning_rate": 8.965439618644068e-06, "loss": 0.8062, "mean_token_accuracy": 0.7926261648535728, "num_tokens": 6287489.0, "step": 7814 }, { "epoch": 2.069915254237288, "grad_norm": 1.5520750284194946, "learning_rate": 8.965174788135594e-06, "loss": 0.937, "mean_token_accuracy": 0.7724969759583473, "num_tokens": 6288804.0, "step": 7816 }, { "epoch": 2.0704449152542375, "grad_norm": 1.4985325336456299, "learning_rate": 8.96490995762712e-06, "loss": 0.9147, "mean_token_accuracy": 0.7762386500835419, "num_tokens": 6290188.0, "step": 7818 }, { "epoch": 2.0709745762711864, "grad_norm": 1.6393465995788574, "learning_rate": 8.964645127118646e-06, "loss": 1.3773, "mean_token_accuracy": 0.6931664794683456, "num_tokens": 6291599.0, "step": 7820 }, { "epoch": 2.071504237288136, "grad_norm": 1.667872428894043, "learning_rate": 8.96438029661017e-06, "loss": 1.1176, "mean_token_accuracy": 0.7514436393976212, "num_tokens": 6293017.0, "step": 7822 }, { "epoch": 2.0720338983050848, "grad_norm": 1.7740594148635864, "learning_rate": 8.964115466101696e-06, "loss": 1.4968, "mean_token_accuracy": 0.6628971472382545, "num_tokens": 6294463.0, "step": 7824 }, { "epoch": 2.0725635593220337, "grad_norm": 1.672716736793518, "learning_rate": 8.96385063559322e-06, "loss": 1.253, "mean_token_accuracy": 0.7340158075094223, "num_tokens": 6296020.0, "step": 7826 }, { "epoch": 2.073093220338983, "grad_norm": 1.557681679725647, "learning_rate": 8.963585805084747e-06, "loss": 0.9668, "mean_token_accuracy": 0.7624166831374168, "num_tokens": 6297526.0, "step": 7828 }, { "epoch": 2.073622881355932, "grad_norm": 1.3072909116744995, "learning_rate": 8.963320974576272e-06, "loss": 1.2427, "mean_token_accuracy": 0.6827194504439831, "num_tokens": 6299423.0, "step": 7830 }, { "epoch": 2.0741525423728815, "grad_norm": 1.7528553009033203, "learning_rate": 8.963056144067797e-06, "loss": 1.3194, "mean_token_accuracy": 0.6879872828722, "num_tokens": 6300751.0, "step": 7832 }, { "epoch": 2.0746822033898304, "grad_norm": 1.6595147848129272, "learning_rate": 8.962791313559322e-06, "loss": 1.3684, "mean_token_accuracy": 0.6769559159874916, "num_tokens": 6302326.0, "step": 7834 }, { "epoch": 2.07521186440678, "grad_norm": 1.6667311191558838, "learning_rate": 8.962526483050849e-06, "loss": 1.404, "mean_token_accuracy": 0.6998259946703911, "num_tokens": 6303869.0, "step": 7836 }, { "epoch": 2.075741525423729, "grad_norm": 1.7429851293563843, "learning_rate": 8.962261652542374e-06, "loss": 1.4603, "mean_token_accuracy": 0.6896816268563271, "num_tokens": 6305577.0, "step": 7838 }, { "epoch": 2.0762711864406778, "grad_norm": 1.4534428119659424, "learning_rate": 8.961996822033899e-06, "loss": 1.2214, "mean_token_accuracy": 0.7387772798538208, "num_tokens": 6307002.0, "step": 7840 }, { "epoch": 2.076800847457627, "grad_norm": 1.385776162147522, "learning_rate": 8.961731991525424e-06, "loss": 1.1466, "mean_token_accuracy": 0.7411847710609436, "num_tokens": 6308592.0, "step": 7842 }, { "epoch": 2.077330508474576, "grad_norm": 1.6275686025619507, "learning_rate": 8.96146716101695e-06, "loss": 1.112, "mean_token_accuracy": 0.7531841173768044, "num_tokens": 6310182.0, "step": 7844 }, { "epoch": 2.0778601694915255, "grad_norm": 1.6650794744491577, "learning_rate": 8.961202330508475e-06, "loss": 1.5191, "mean_token_accuracy": 0.6498076915740967, "num_tokens": 6312376.0, "step": 7846 }, { "epoch": 2.0783898305084745, "grad_norm": 2.145587921142578, "learning_rate": 8.9609375e-06, "loss": 1.4892, "mean_token_accuracy": 0.6705345585942268, "num_tokens": 6313864.0, "step": 7848 }, { "epoch": 2.078919491525424, "grad_norm": 1.7999390363693237, "learning_rate": 8.960672669491525e-06, "loss": 1.2648, "mean_token_accuracy": 0.7136830985546112, "num_tokens": 6315627.0, "step": 7850 }, { "epoch": 2.079449152542373, "grad_norm": 1.8197393417358398, "learning_rate": 8.960407838983052e-06, "loss": 1.2846, "mean_token_accuracy": 0.7130758091807365, "num_tokens": 6317129.0, "step": 7852 }, { "epoch": 2.0799788135593222, "grad_norm": 2.0474555492401123, "learning_rate": 8.960143008474577e-06, "loss": 1.121, "mean_token_accuracy": 0.7275065705180168, "num_tokens": 6318633.0, "step": 7854 }, { "epoch": 2.080508474576271, "grad_norm": 1.973500370979309, "learning_rate": 8.959878177966103e-06, "loss": 1.2005, "mean_token_accuracy": 0.7357950732111931, "num_tokens": 6319985.0, "step": 7856 }, { "epoch": 2.08103813559322, "grad_norm": 1.5176833868026733, "learning_rate": 8.959613347457628e-06, "loss": 1.4866, "mean_token_accuracy": 0.6676516123116016, "num_tokens": 6321750.0, "step": 7858 }, { "epoch": 2.0815677966101696, "grad_norm": 1.9011878967285156, "learning_rate": 8.959348516949153e-06, "loss": 1.3865, "mean_token_accuracy": 0.6882304474711418, "num_tokens": 6323271.0, "step": 7860 }, { "epoch": 2.0820974576271185, "grad_norm": 1.4052377939224243, "learning_rate": 8.959083686440678e-06, "loss": 0.9518, "mean_token_accuracy": 0.7473112344741821, "num_tokens": 6325092.0, "step": 7862 }, { "epoch": 2.082627118644068, "grad_norm": 1.4176546335220337, "learning_rate": 8.958818855932205e-06, "loss": 1.2013, "mean_token_accuracy": 0.7182940095663071, "num_tokens": 6326835.0, "step": 7864 }, { "epoch": 2.083156779661017, "grad_norm": 1.7378681898117065, "learning_rate": 8.95855402542373e-06, "loss": 1.653, "mean_token_accuracy": 0.6477057412266731, "num_tokens": 6328296.0, "step": 7866 }, { "epoch": 2.0836864406779663, "grad_norm": 1.9099990129470825, "learning_rate": 8.958289194915254e-06, "loss": 1.4538, "mean_token_accuracy": 0.674824096262455, "num_tokens": 6329615.0, "step": 7868 }, { "epoch": 2.0842161016949152, "grad_norm": 1.9985361099243164, "learning_rate": 8.95802436440678e-06, "loss": 1.3271, "mean_token_accuracy": 0.6817716658115387, "num_tokens": 6331004.0, "step": 7870 }, { "epoch": 2.084745762711864, "grad_norm": 1.889259696006775, "learning_rate": 8.957759533898306e-06, "loss": 1.1151, "mean_token_accuracy": 0.7450169026851654, "num_tokens": 6332374.0, "step": 7872 }, { "epoch": 2.0852754237288136, "grad_norm": 2.1404449939727783, "learning_rate": 8.957494703389831e-06, "loss": 1.2813, "mean_token_accuracy": 0.701849177479744, "num_tokens": 6333735.0, "step": 7874 }, { "epoch": 2.0858050847457625, "grad_norm": 2.3307132720947266, "learning_rate": 8.957229872881356e-06, "loss": 1.6485, "mean_token_accuracy": 0.6530419215559959, "num_tokens": 6335115.0, "step": 7876 }, { "epoch": 2.086334745762712, "grad_norm": 1.8489569425582886, "learning_rate": 8.95696504237288e-06, "loss": 1.0876, "mean_token_accuracy": 0.7458324953913689, "num_tokens": 6336859.0, "step": 7878 }, { "epoch": 2.086864406779661, "grad_norm": 2.0197489261627197, "learning_rate": 8.956700211864407e-06, "loss": 1.5831, "mean_token_accuracy": 0.6755664311349392, "num_tokens": 6338346.0, "step": 7880 }, { "epoch": 2.0873940677966103, "grad_norm": 1.7681605815887451, "learning_rate": 8.956435381355934e-06, "loss": 1.3934, "mean_token_accuracy": 0.7029744535684586, "num_tokens": 6339681.0, "step": 7882 }, { "epoch": 2.0879237288135593, "grad_norm": 2.063525676727295, "learning_rate": 8.956170550847459e-06, "loss": 1.0412, "mean_token_accuracy": 0.7598030716180801, "num_tokens": 6341290.0, "step": 7884 }, { "epoch": 2.0884533898305087, "grad_norm": 1.2827374935150146, "learning_rate": 8.955905720338984e-06, "loss": 1.4062, "mean_token_accuracy": 0.697121150791645, "num_tokens": 6342758.0, "step": 7886 }, { "epoch": 2.0889830508474576, "grad_norm": 1.8703010082244873, "learning_rate": 8.955640889830509e-06, "loss": 1.2667, "mean_token_accuracy": 0.7072326242923737, "num_tokens": 6344236.0, "step": 7888 }, { "epoch": 2.0895127118644066, "grad_norm": 1.861783504486084, "learning_rate": 8.955376059322035e-06, "loss": 1.26, "mean_token_accuracy": 0.7256642729043961, "num_tokens": 6345549.0, "step": 7890 }, { "epoch": 2.090042372881356, "grad_norm": 1.6956614255905151, "learning_rate": 8.95511122881356e-06, "loss": 0.916, "mean_token_accuracy": 0.7751121744513512, "num_tokens": 6346854.0, "step": 7892 }, { "epoch": 2.090572033898305, "grad_norm": 1.4529789686203003, "learning_rate": 8.954846398305085e-06, "loss": 1.1738, "mean_token_accuracy": 0.7090246751904488, "num_tokens": 6348814.0, "step": 7894 }, { "epoch": 2.0911016949152543, "grad_norm": 1.780355453491211, "learning_rate": 8.95458156779661e-06, "loss": 1.2751, "mean_token_accuracy": 0.709972470998764, "num_tokens": 6350299.0, "step": 7896 }, { "epoch": 2.0916313559322033, "grad_norm": 1.2920985221862793, "learning_rate": 8.954316737288137e-06, "loss": 1.1018, "mean_token_accuracy": 0.7626253068447113, "num_tokens": 6352211.0, "step": 7898 }, { "epoch": 2.0921610169491527, "grad_norm": 1.8313173055648804, "learning_rate": 8.954051906779662e-06, "loss": 1.3711, "mean_token_accuracy": 0.716863751411438, "num_tokens": 6353717.0, "step": 7900 }, { "epoch": 2.0926906779661016, "grad_norm": 1.3601582050323486, "learning_rate": 8.953787076271187e-06, "loss": 0.909, "mean_token_accuracy": 0.7669104486703873, "num_tokens": 6355181.0, "step": 7902 }, { "epoch": 2.093220338983051, "grad_norm": 1.8025022745132446, "learning_rate": 8.953522245762712e-06, "loss": 1.311, "mean_token_accuracy": 0.7116818949580193, "num_tokens": 6356980.0, "step": 7904 }, { "epoch": 2.09375, "grad_norm": 1.5117250680923462, "learning_rate": 8.953257415254238e-06, "loss": 1.1405, "mean_token_accuracy": 0.7338011413812637, "num_tokens": 6358618.0, "step": 7906 }, { "epoch": 2.094279661016949, "grad_norm": 1.861574411392212, "learning_rate": 8.952992584745763e-06, "loss": 1.1843, "mean_token_accuracy": 0.7452475801110268, "num_tokens": 6359896.0, "step": 7908 }, { "epoch": 2.0948093220338984, "grad_norm": 1.8289471864700317, "learning_rate": 8.95272775423729e-06, "loss": 1.1664, "mean_token_accuracy": 0.7176011353731155, "num_tokens": 6361229.0, "step": 7910 }, { "epoch": 2.0953389830508473, "grad_norm": 1.7649978399276733, "learning_rate": 8.952462923728815e-06, "loss": 1.2029, "mean_token_accuracy": 0.7029770016670227, "num_tokens": 6363130.0, "step": 7912 }, { "epoch": 2.0958686440677967, "grad_norm": 1.663212776184082, "learning_rate": 8.95219809322034e-06, "loss": 1.2932, "mean_token_accuracy": 0.6935501024127007, "num_tokens": 6364860.0, "step": 7914 }, { "epoch": 2.0963983050847457, "grad_norm": 1.3514007329940796, "learning_rate": 8.951933262711865e-06, "loss": 1.2418, "mean_token_accuracy": 0.6998862400650978, "num_tokens": 6366470.0, "step": 7916 }, { "epoch": 2.096927966101695, "grad_norm": 1.4486562013626099, "learning_rate": 8.951668432203391e-06, "loss": 1.1819, "mean_token_accuracy": 0.7170857340097427, "num_tokens": 6368352.0, "step": 7918 }, { "epoch": 2.097457627118644, "grad_norm": 1.5185562372207642, "learning_rate": 8.951403601694916e-06, "loss": 0.9817, "mean_token_accuracy": 0.7526308000087738, "num_tokens": 6369722.0, "step": 7920 }, { "epoch": 2.0979872881355934, "grad_norm": 1.4630011320114136, "learning_rate": 8.951138771186441e-06, "loss": 1.1249, "mean_token_accuracy": 0.7254040986299515, "num_tokens": 6371425.0, "step": 7922 }, { "epoch": 2.0985169491525424, "grad_norm": 2.029054641723633, "learning_rate": 8.950873940677966e-06, "loss": 1.3058, "mean_token_accuracy": 0.6941740736365318, "num_tokens": 6372825.0, "step": 7924 }, { "epoch": 2.0990466101694913, "grad_norm": 1.67795729637146, "learning_rate": 8.950609110169493e-06, "loss": 1.1963, "mean_token_accuracy": 0.7147894650697708, "num_tokens": 6374547.0, "step": 7926 }, { "epoch": 2.0995762711864407, "grad_norm": 1.5684188604354858, "learning_rate": 8.950344279661018e-06, "loss": 1.0777, "mean_token_accuracy": 0.7342772409319878, "num_tokens": 6376228.0, "step": 7928 }, { "epoch": 2.1001059322033897, "grad_norm": 1.4558967351913452, "learning_rate": 8.950079449152543e-06, "loss": 1.0222, "mean_token_accuracy": 0.7552890777587891, "num_tokens": 6377734.0, "step": 7930 }, { "epoch": 2.100635593220339, "grad_norm": 1.9181236028671265, "learning_rate": 8.949814618644067e-06, "loss": 1.441, "mean_token_accuracy": 0.6876174509525299, "num_tokens": 6379079.0, "step": 7932 }, { "epoch": 2.101165254237288, "grad_norm": 1.7862249612808228, "learning_rate": 8.949549788135594e-06, "loss": 1.3937, "mean_token_accuracy": 0.6700479909777641, "num_tokens": 6380799.0, "step": 7934 }, { "epoch": 2.1016949152542375, "grad_norm": 1.8435126543045044, "learning_rate": 8.949284957627119e-06, "loss": 1.1934, "mean_token_accuracy": 0.7144891992211342, "num_tokens": 6382176.0, "step": 7936 }, { "epoch": 2.1022245762711864, "grad_norm": 1.5453846454620361, "learning_rate": 8.949020127118646e-06, "loss": 1.4573, "mean_token_accuracy": 0.6836131513118744, "num_tokens": 6383902.0, "step": 7938 }, { "epoch": 2.102754237288136, "grad_norm": 1.581628441810608, "learning_rate": 8.94875529661017e-06, "loss": 1.0324, "mean_token_accuracy": 0.7513349279761314, "num_tokens": 6385437.0, "step": 7940 }, { "epoch": 2.1032838983050848, "grad_norm": 1.3055099248886108, "learning_rate": 8.948490466101696e-06, "loss": 1.4036, "mean_token_accuracy": 0.7050686255097389, "num_tokens": 6387166.0, "step": 7942 }, { "epoch": 2.1038135593220337, "grad_norm": 1.63540780544281, "learning_rate": 8.94822563559322e-06, "loss": 1.2223, "mean_token_accuracy": 0.7215449810028076, "num_tokens": 6388964.0, "step": 7944 }, { "epoch": 2.104343220338983, "grad_norm": 1.648646593093872, "learning_rate": 8.947960805084747e-06, "loss": 1.5926, "mean_token_accuracy": 0.6890849396586418, "num_tokens": 6390682.0, "step": 7946 }, { "epoch": 2.104872881355932, "grad_norm": 1.6047194004058838, "learning_rate": 8.947695974576272e-06, "loss": 1.1748, "mean_token_accuracy": 0.7189359813928604, "num_tokens": 6392188.0, "step": 7948 }, { "epoch": 2.1054025423728815, "grad_norm": 1.536853313446045, "learning_rate": 8.947431144067797e-06, "loss": 1.2409, "mean_token_accuracy": 0.7343959510326385, "num_tokens": 6394097.0, "step": 7950 }, { "epoch": 2.1059322033898304, "grad_norm": 1.5560039281845093, "learning_rate": 8.947166313559322e-06, "loss": 1.3456, "mean_token_accuracy": 0.6960033625364304, "num_tokens": 6396019.0, "step": 7952 }, { "epoch": 2.10646186440678, "grad_norm": 1.7139254808425903, "learning_rate": 8.946901483050848e-06, "loss": 1.4849, "mean_token_accuracy": 0.6753886118531227, "num_tokens": 6397508.0, "step": 7954 }, { "epoch": 2.106991525423729, "grad_norm": 2.0733039379119873, "learning_rate": 8.946636652542373e-06, "loss": 1.2411, "mean_token_accuracy": 0.7336713224649429, "num_tokens": 6398940.0, "step": 7956 }, { "epoch": 2.1075211864406778, "grad_norm": 1.4464704990386963, "learning_rate": 8.946371822033898e-06, "loss": 1.2902, "mean_token_accuracy": 0.6980869695544243, "num_tokens": 6400888.0, "step": 7958 }, { "epoch": 2.108050847457627, "grad_norm": 1.389252781867981, "learning_rate": 8.946106991525423e-06, "loss": 1.2061, "mean_token_accuracy": 0.7240249663591385, "num_tokens": 6402435.0, "step": 7960 }, { "epoch": 2.108580508474576, "grad_norm": 1.5146032571792603, "learning_rate": 8.94584216101695e-06, "loss": 0.8834, "mean_token_accuracy": 0.7830767408013344, "num_tokens": 6404154.0, "step": 7962 }, { "epoch": 2.1091101694915255, "grad_norm": 1.4759843349456787, "learning_rate": 8.945577330508475e-06, "loss": 1.1778, "mean_token_accuracy": 0.7273412868380547, "num_tokens": 6405818.0, "step": 7964 }, { "epoch": 2.1096398305084745, "grad_norm": 1.5614088773727417, "learning_rate": 8.945312500000001e-06, "loss": 0.8563, "mean_token_accuracy": 0.7925713211297989, "num_tokens": 6407270.0, "step": 7966 }, { "epoch": 2.110169491525424, "grad_norm": 1.6521553993225098, "learning_rate": 8.945047669491526e-06, "loss": 1.5418, "mean_token_accuracy": 0.6378215774893761, "num_tokens": 6408885.0, "step": 7968 }, { "epoch": 2.110699152542373, "grad_norm": 2.0120737552642822, "learning_rate": 8.944782838983051e-06, "loss": 1.1021, "mean_token_accuracy": 0.7527049966156483, "num_tokens": 6410674.0, "step": 7970 }, { "epoch": 2.1112288135593222, "grad_norm": 1.5314377546310425, "learning_rate": 8.944518008474578e-06, "loss": 1.4302, "mean_token_accuracy": 0.6689990311861038, "num_tokens": 6412518.0, "step": 7972 }, { "epoch": 2.111758474576271, "grad_norm": 1.4964971542358398, "learning_rate": 8.944253177966103e-06, "loss": 1.0555, "mean_token_accuracy": 0.763054832816124, "num_tokens": 6414401.0, "step": 7974 }, { "epoch": 2.11228813559322, "grad_norm": 1.8593213558197021, "learning_rate": 8.943988347457628e-06, "loss": 1.1129, "mean_token_accuracy": 0.7276405915617943, "num_tokens": 6415978.0, "step": 7976 }, { "epoch": 2.1128177966101696, "grad_norm": 2.2340316772460938, "learning_rate": 8.943723516949153e-06, "loss": 1.3482, "mean_token_accuracy": 0.7043823376297951, "num_tokens": 6417393.0, "step": 7978 }, { "epoch": 2.1133474576271185, "grad_norm": 1.4553241729736328, "learning_rate": 8.94345868644068e-06, "loss": 0.9495, "mean_token_accuracy": 0.7698774263262749, "num_tokens": 6418740.0, "step": 7980 }, { "epoch": 2.113877118644068, "grad_norm": 1.586114525794983, "learning_rate": 8.943193855932204e-06, "loss": 1.2112, "mean_token_accuracy": 0.7103045955300331, "num_tokens": 6420239.0, "step": 7982 }, { "epoch": 2.114406779661017, "grad_norm": 1.7549892663955688, "learning_rate": 8.94292902542373e-06, "loss": 1.4796, "mean_token_accuracy": 0.6522721573710442, "num_tokens": 6421742.0, "step": 7984 }, { "epoch": 2.1149364406779663, "grad_norm": 1.7687909603118896, "learning_rate": 8.942664194915254e-06, "loss": 1.0542, "mean_token_accuracy": 0.7404856234788895, "num_tokens": 6423928.0, "step": 7986 }, { "epoch": 2.1154661016949152, "grad_norm": 1.777432918548584, "learning_rate": 8.94239936440678e-06, "loss": 1.2283, "mean_token_accuracy": 0.71397415548563, "num_tokens": 6425468.0, "step": 7988 }, { "epoch": 2.115995762711864, "grad_norm": 1.7060418128967285, "learning_rate": 8.942134533898306e-06, "loss": 1.2413, "mean_token_accuracy": 0.7056662701070309, "num_tokens": 6427129.0, "step": 7990 }, { "epoch": 2.1165254237288136, "grad_norm": 1.6957985162734985, "learning_rate": 8.941869703389832e-06, "loss": 1.3358, "mean_token_accuracy": 0.6813997179269791, "num_tokens": 6428729.0, "step": 7992 }, { "epoch": 2.1170550847457625, "grad_norm": 1.8135474920272827, "learning_rate": 8.941604872881357e-06, "loss": 1.2856, "mean_token_accuracy": 0.7022154480218887, "num_tokens": 6430444.0, "step": 7994 }, { "epoch": 2.117584745762712, "grad_norm": 1.6870031356811523, "learning_rate": 8.941340042372882e-06, "loss": 1.4909, "mean_token_accuracy": 0.671499066054821, "num_tokens": 6432242.0, "step": 7996 }, { "epoch": 2.118114406779661, "grad_norm": 1.7748464345932007, "learning_rate": 8.941075211864407e-06, "loss": 1.404, "mean_token_accuracy": 0.7155507281422615, "num_tokens": 6433790.0, "step": 7998 }, { "epoch": 2.1186440677966103, "grad_norm": 1.5692347288131714, "learning_rate": 8.940810381355934e-06, "loss": 1.2401, "step": 8000 }, { "epoch": 2.1186440677966103, "eval_loss": 1.3136341571807861, "eval_mean_token_accuracy": 0.7003051686015996, "eval_num_tokens": 6435438.0, "eval_runtime": 49.0503, "eval_samples_per_second": 6.279, "eval_steps_per_second": 6.279, "step": 8000 }, { "epoch": 2.1191737288135593, "grad_norm": 1.7270513772964478, "learning_rate": 8.940545550847459e-06, "loss": 1.5965, "mean_token_accuracy": 0.6645294912159443, "num_tokens": 6437039.0, "step": 8002 }, { "epoch": 2.1197033898305087, "grad_norm": 1.818536639213562, "learning_rate": 8.940280720338984e-06, "loss": 1.4204, "mean_token_accuracy": 0.6879279464483261, "num_tokens": 6438526.0, "step": 8004 }, { "epoch": 2.1202330508474576, "grad_norm": 1.5011043548583984, "learning_rate": 8.940015889830508e-06, "loss": 1.0009, "mean_token_accuracy": 0.7712629213929176, "num_tokens": 6439841.0, "step": 8006 }, { "epoch": 2.1207627118644066, "grad_norm": 2.3759233951568604, "learning_rate": 8.939751059322035e-06, "loss": 1.602, "mean_token_accuracy": 0.6610948070883751, "num_tokens": 6441235.0, "step": 8008 }, { "epoch": 2.121292372881356, "grad_norm": 1.9691288471221924, "learning_rate": 8.93948622881356e-06, "loss": 1.1702, "mean_token_accuracy": 0.7286648117005825, "num_tokens": 6442869.0, "step": 8010 }, { "epoch": 2.121822033898305, "grad_norm": 1.882780909538269, "learning_rate": 8.939221398305085e-06, "loss": 1.0382, "mean_token_accuracy": 0.7321488037705421, "num_tokens": 6444544.0, "step": 8012 }, { "epoch": 2.1223516949152543, "grad_norm": 1.651121735572815, "learning_rate": 8.93895656779661e-06, "loss": 0.8075, "mean_token_accuracy": 0.791106566786766, "num_tokens": 6446180.0, "step": 8014 }, { "epoch": 2.1228813559322033, "grad_norm": 1.590242862701416, "learning_rate": 8.938691737288137e-06, "loss": 1.1975, "mean_token_accuracy": 0.7075076401233673, "num_tokens": 6448009.0, "step": 8016 }, { "epoch": 2.1234110169491527, "grad_norm": 1.6076666116714478, "learning_rate": 8.938426906779661e-06, "loss": 1.2148, "mean_token_accuracy": 0.690311849117279, "num_tokens": 6449626.0, "step": 8018 }, { "epoch": 2.1239406779661016, "grad_norm": 1.6712369918823242, "learning_rate": 8.938162076271188e-06, "loss": 1.1708, "mean_token_accuracy": 0.7251237221062183, "num_tokens": 6451402.0, "step": 8020 }, { "epoch": 2.124470338983051, "grad_norm": 1.243834137916565, "learning_rate": 8.937897245762713e-06, "loss": 1.1032, "mean_token_accuracy": 0.7484624609351158, "num_tokens": 6453202.0, "step": 8022 }, { "epoch": 2.125, "grad_norm": 1.828399419784546, "learning_rate": 8.937632415254238e-06, "loss": 1.2023, "mean_token_accuracy": 0.7113272920250893, "num_tokens": 6454893.0, "step": 8024 }, { "epoch": 2.125529661016949, "grad_norm": 1.4905292987823486, "learning_rate": 8.937367584745763e-06, "loss": 1.2442, "mean_token_accuracy": 0.7046612799167633, "num_tokens": 6456514.0, "step": 8026 }, { "epoch": 2.1260593220338984, "grad_norm": 1.8192920684814453, "learning_rate": 8.93710275423729e-06, "loss": 1.3681, "mean_token_accuracy": 0.6869800239801407, "num_tokens": 6457795.0, "step": 8028 }, { "epoch": 2.1265889830508473, "grad_norm": 1.8752490282058716, "learning_rate": 8.936837923728814e-06, "loss": 1.4477, "mean_token_accuracy": 0.6717269942164421, "num_tokens": 6459607.0, "step": 8030 }, { "epoch": 2.1271186440677967, "grad_norm": 1.2812848091125488, "learning_rate": 8.93657309322034e-06, "loss": 1.0745, "mean_token_accuracy": 0.7443144097924232, "num_tokens": 6461035.0, "step": 8032 }, { "epoch": 2.1276483050847457, "grad_norm": 1.5378566980361938, "learning_rate": 8.936308262711864e-06, "loss": 1.5581, "mean_token_accuracy": 0.6594459861516953, "num_tokens": 6462828.0, "step": 8034 }, { "epoch": 2.128177966101695, "grad_norm": 1.7045314311981201, "learning_rate": 8.936043432203391e-06, "loss": 1.012, "mean_token_accuracy": 0.7672887891530991, "num_tokens": 6464274.0, "step": 8036 }, { "epoch": 2.128707627118644, "grad_norm": 1.659933090209961, "learning_rate": 8.935778601694916e-06, "loss": 1.2251, "mean_token_accuracy": 0.7099736481904984, "num_tokens": 6465809.0, "step": 8038 }, { "epoch": 2.1292372881355934, "grad_norm": 1.8808369636535645, "learning_rate": 8.93551377118644e-06, "loss": 1.9501, "mean_token_accuracy": 0.6077488511800766, "num_tokens": 6467649.0, "step": 8040 }, { "epoch": 2.1297669491525424, "grad_norm": 1.7997652292251587, "learning_rate": 8.935248940677966e-06, "loss": 1.0519, "mean_token_accuracy": 0.75782061368227, "num_tokens": 6469111.0, "step": 8042 }, { "epoch": 2.1302966101694913, "grad_norm": 2.2203805446624756, "learning_rate": 8.934984110169492e-06, "loss": 1.7063, "mean_token_accuracy": 0.6375319324433804, "num_tokens": 6470609.0, "step": 8044 }, { "epoch": 2.1308262711864407, "grad_norm": 1.9515273571014404, "learning_rate": 8.934719279661017e-06, "loss": 1.6363, "mean_token_accuracy": 0.6451957188546658, "num_tokens": 6472128.0, "step": 8046 }, { "epoch": 2.1313559322033897, "grad_norm": 1.7258107662200928, "learning_rate": 8.934454449152544e-06, "loss": 1.4696, "mean_token_accuracy": 0.6681338250637054, "num_tokens": 6473660.0, "step": 8048 }, { "epoch": 2.131885593220339, "grad_norm": 1.923870325088501, "learning_rate": 8.934189618644069e-06, "loss": 1.1542, "mean_token_accuracy": 0.728170707821846, "num_tokens": 6475380.0, "step": 8050 }, { "epoch": 2.132415254237288, "grad_norm": 1.8894516229629517, "learning_rate": 8.933924788135594e-06, "loss": 1.1579, "mean_token_accuracy": 0.734958179295063, "num_tokens": 6476844.0, "step": 8052 }, { "epoch": 2.1329449152542375, "grad_norm": 1.6128689050674438, "learning_rate": 8.93365995762712e-06, "loss": 0.8492, "mean_token_accuracy": 0.7883257567882538, "num_tokens": 6478342.0, "step": 8054 }, { "epoch": 2.1334745762711864, "grad_norm": 1.8037365674972534, "learning_rate": 8.933395127118645e-06, "loss": 1.1937, "mean_token_accuracy": 0.725154310464859, "num_tokens": 6479733.0, "step": 8056 }, { "epoch": 2.134004237288136, "grad_norm": 1.9110127687454224, "learning_rate": 8.93313029661017e-06, "loss": 1.8771, "mean_token_accuracy": 0.6129784360527992, "num_tokens": 6481285.0, "step": 8058 }, { "epoch": 2.1345338983050848, "grad_norm": 2.1661980152130127, "learning_rate": 8.932865466101695e-06, "loss": 1.4398, "mean_token_accuracy": 0.7015359699726105, "num_tokens": 6482839.0, "step": 8060 }, { "epoch": 2.1350635593220337, "grad_norm": 1.7769978046417236, "learning_rate": 8.932600635593222e-06, "loss": 1.0743, "mean_token_accuracy": 0.7238348871469498, "num_tokens": 6484444.0, "step": 8062 }, { "epoch": 2.135593220338983, "grad_norm": 1.6092948913574219, "learning_rate": 8.932335805084747e-06, "loss": 1.2134, "mean_token_accuracy": 0.7194490879774094, "num_tokens": 6486164.0, "step": 8064 }, { "epoch": 2.136122881355932, "grad_norm": 1.8381855487823486, "learning_rate": 8.932070974576272e-06, "loss": 1.2013, "mean_token_accuracy": 0.7211140282452106, "num_tokens": 6487697.0, "step": 8066 }, { "epoch": 2.1366525423728815, "grad_norm": 2.0144333839416504, "learning_rate": 8.931806144067797e-06, "loss": 1.7075, "mean_token_accuracy": 0.6402847096323967, "num_tokens": 6489183.0, "step": 8068 }, { "epoch": 2.1371822033898304, "grad_norm": 1.6339086294174194, "learning_rate": 8.931541313559323e-06, "loss": 1.4834, "mean_token_accuracy": 0.6756738573312759, "num_tokens": 6490877.0, "step": 8070 }, { "epoch": 2.13771186440678, "grad_norm": 2.2193140983581543, "learning_rate": 8.931276483050848e-06, "loss": 1.3804, "mean_token_accuracy": 0.6806372776627541, "num_tokens": 6492634.0, "step": 8072 }, { "epoch": 2.138241525423729, "grad_norm": 1.4902918338775635, "learning_rate": 8.931011652542375e-06, "loss": 1.2466, "mean_token_accuracy": 0.7251958027482033, "num_tokens": 6493987.0, "step": 8074 }, { "epoch": 2.1387711864406778, "grad_norm": 1.7079662084579468, "learning_rate": 8.9307468220339e-06, "loss": 1.1681, "mean_token_accuracy": 0.7271867021918297, "num_tokens": 6496028.0, "step": 8076 }, { "epoch": 2.139300847457627, "grad_norm": 1.619264006614685, "learning_rate": 8.930481991525425e-06, "loss": 1.458, "mean_token_accuracy": 0.6737759858369827, "num_tokens": 6497775.0, "step": 8078 }, { "epoch": 2.139830508474576, "grad_norm": 1.4315060377120972, "learning_rate": 8.93021716101695e-06, "loss": 1.0013, "mean_token_accuracy": 0.7581291124224663, "num_tokens": 6499663.0, "step": 8080 }, { "epoch": 2.1403601694915255, "grad_norm": 1.2348626852035522, "learning_rate": 8.929952330508476e-06, "loss": 0.9836, "mean_token_accuracy": 0.7584791257977486, "num_tokens": 6501117.0, "step": 8082 }, { "epoch": 2.1408898305084745, "grad_norm": 1.565700650215149, "learning_rate": 8.929687500000001e-06, "loss": 0.918, "mean_token_accuracy": 0.7626380324363708, "num_tokens": 6502682.0, "step": 8084 }, { "epoch": 2.141419491525424, "grad_norm": 2.053351640701294, "learning_rate": 8.929422669491526e-06, "loss": 1.6493, "mean_token_accuracy": 0.6364932954311371, "num_tokens": 6504297.0, "step": 8086 }, { "epoch": 2.141949152542373, "grad_norm": 1.5758670568466187, "learning_rate": 8.929157838983051e-06, "loss": 1.1804, "mean_token_accuracy": 0.7315733917057514, "num_tokens": 6506045.0, "step": 8088 }, { "epoch": 2.1424788135593222, "grad_norm": 1.4406861066818237, "learning_rate": 8.928893008474578e-06, "loss": 1.0813, "mean_token_accuracy": 0.7298672646284103, "num_tokens": 6507668.0, "step": 8090 }, { "epoch": 2.143008474576271, "grad_norm": 1.9307880401611328, "learning_rate": 8.928628177966102e-06, "loss": 1.3861, "mean_token_accuracy": 0.7128681987524033, "num_tokens": 6509046.0, "step": 8092 }, { "epoch": 2.14353813559322, "grad_norm": 1.6374523639678955, "learning_rate": 8.928363347457627e-06, "loss": 1.4458, "mean_token_accuracy": 0.6830486059188843, "num_tokens": 6510597.0, "step": 8094 }, { "epoch": 2.1440677966101696, "grad_norm": 2.007624864578247, "learning_rate": 8.928098516949152e-06, "loss": 1.156, "mean_token_accuracy": 0.7375775799155235, "num_tokens": 6512173.0, "step": 8096 }, { "epoch": 2.1445974576271185, "grad_norm": 1.2669761180877686, "learning_rate": 8.927833686440679e-06, "loss": 0.9034, "mean_token_accuracy": 0.7624062448740005, "num_tokens": 6513793.0, "step": 8098 }, { "epoch": 2.145127118644068, "grad_norm": 1.6619287729263306, "learning_rate": 8.927568855932204e-06, "loss": 1.0828, "mean_token_accuracy": 0.7631727829575539, "num_tokens": 6515049.0, "step": 8100 }, { "epoch": 2.145656779661017, "grad_norm": 1.35452139377594, "learning_rate": 8.92730402542373e-06, "loss": 1.2498, "mean_token_accuracy": 0.6923214942216873, "num_tokens": 6516450.0, "step": 8102 }, { "epoch": 2.1461864406779663, "grad_norm": 1.9923356771469116, "learning_rate": 8.927039194915255e-06, "loss": 1.288, "mean_token_accuracy": 0.6927614957094193, "num_tokens": 6517911.0, "step": 8104 }, { "epoch": 2.1467161016949152, "grad_norm": 1.4859727621078491, "learning_rate": 8.92677436440678e-06, "loss": 1.1831, "mean_token_accuracy": 0.7370287328958511, "num_tokens": 6519574.0, "step": 8106 }, { "epoch": 2.147245762711864, "grad_norm": 1.6128628253936768, "learning_rate": 8.926509533898305e-06, "loss": 1.3988, "mean_token_accuracy": 0.6880354732275009, "num_tokens": 6521253.0, "step": 8108 }, { "epoch": 2.1477754237288136, "grad_norm": 1.5646542310714722, "learning_rate": 8.926244703389832e-06, "loss": 1.4141, "mean_token_accuracy": 0.6788011342287064, "num_tokens": 6522854.0, "step": 8110 }, { "epoch": 2.1483050847457625, "grad_norm": 1.7752583026885986, "learning_rate": 8.925979872881357e-06, "loss": 1.5262, "mean_token_accuracy": 0.6274600327014923, "num_tokens": 6524605.0, "step": 8112 }, { "epoch": 2.148834745762712, "grad_norm": 2.2072043418884277, "learning_rate": 8.925715042372882e-06, "loss": 1.7538, "mean_token_accuracy": 0.6167886629700661, "num_tokens": 6526248.0, "step": 8114 }, { "epoch": 2.149364406779661, "grad_norm": 1.8871821165084839, "learning_rate": 8.925450211864407e-06, "loss": 1.2273, "mean_token_accuracy": 0.7243938744068146, "num_tokens": 6527819.0, "step": 8116 }, { "epoch": 2.1498940677966103, "grad_norm": 1.7029353380203247, "learning_rate": 8.925185381355933e-06, "loss": 1.4267, "mean_token_accuracy": 0.6778000220656395, "num_tokens": 6529330.0, "step": 8118 }, { "epoch": 2.1504237288135593, "grad_norm": 1.6841075420379639, "learning_rate": 8.924920550847458e-06, "loss": 1.4969, "mean_token_accuracy": 0.6596304774284363, "num_tokens": 6531093.0, "step": 8120 }, { "epoch": 2.1509533898305087, "grad_norm": 1.6246345043182373, "learning_rate": 8.924655720338983e-06, "loss": 0.8751, "mean_token_accuracy": 0.7754790931940079, "num_tokens": 6532876.0, "step": 8122 }, { "epoch": 2.1514830508474576, "grad_norm": 1.5818170309066772, "learning_rate": 8.924390889830508e-06, "loss": 1.2631, "mean_token_accuracy": 0.7022677138447762, "num_tokens": 6534381.0, "step": 8124 }, { "epoch": 2.1520127118644066, "grad_norm": 1.5381418466567993, "learning_rate": 8.924126059322035e-06, "loss": 1.4551, "mean_token_accuracy": 0.6742332726716995, "num_tokens": 6535824.0, "step": 8126 }, { "epoch": 2.152542372881356, "grad_norm": 1.7155064344406128, "learning_rate": 8.92386122881356e-06, "loss": 1.1997, "mean_token_accuracy": 0.7199737578630447, "num_tokens": 6537489.0, "step": 8128 }, { "epoch": 2.153072033898305, "grad_norm": 1.6591572761535645, "learning_rate": 8.923596398305086e-06, "loss": 1.4703, "mean_token_accuracy": 0.6758968606591225, "num_tokens": 6538970.0, "step": 8130 }, { "epoch": 2.1536016949152543, "grad_norm": 1.6319009065628052, "learning_rate": 8.92333156779661e-06, "loss": 1.3573, "mean_token_accuracy": 0.6957408785820007, "num_tokens": 6540537.0, "step": 8132 }, { "epoch": 2.1541313559322033, "grad_norm": 2.3352904319763184, "learning_rate": 8.923066737288136e-06, "loss": 1.6044, "mean_token_accuracy": 0.6366505846381187, "num_tokens": 6541913.0, "step": 8134 }, { "epoch": 2.1546610169491527, "grad_norm": 1.297627568244934, "learning_rate": 8.922801906779663e-06, "loss": 1.1285, "mean_token_accuracy": 0.7413879185914993, "num_tokens": 6543650.0, "step": 8136 }, { "epoch": 2.1551906779661016, "grad_norm": 1.7275913953781128, "learning_rate": 8.922537076271188e-06, "loss": 1.6249, "mean_token_accuracy": 0.634085938334465, "num_tokens": 6545245.0, "step": 8138 }, { "epoch": 2.155720338983051, "grad_norm": 1.8317933082580566, "learning_rate": 8.922272245762713e-06, "loss": 1.4703, "mean_token_accuracy": 0.6844330877065659, "num_tokens": 6546549.0, "step": 8140 }, { "epoch": 2.15625, "grad_norm": 1.383600115776062, "learning_rate": 8.922007415254238e-06, "loss": 1.1337, "mean_token_accuracy": 0.7462056502699852, "num_tokens": 6548383.0, "step": 8142 }, { "epoch": 2.156779661016949, "grad_norm": 1.2206939458847046, "learning_rate": 8.921742584745764e-06, "loss": 0.793, "mean_token_accuracy": 0.7953946217894554, "num_tokens": 6549979.0, "step": 8144 }, { "epoch": 2.1573093220338984, "grad_norm": 1.5215368270874023, "learning_rate": 8.921477754237289e-06, "loss": 1.4132, "mean_token_accuracy": 0.6784833259880543, "num_tokens": 6551783.0, "step": 8146 }, { "epoch": 2.1578389830508473, "grad_norm": 1.4286017417907715, "learning_rate": 8.921212923728814e-06, "loss": 1.0651, "mean_token_accuracy": 0.736231803894043, "num_tokens": 6553318.0, "step": 8148 }, { "epoch": 2.1583686440677967, "grad_norm": 1.561342477798462, "learning_rate": 8.920948093220339e-06, "loss": 1.1944, "mean_token_accuracy": 0.7096772938966751, "num_tokens": 6555172.0, "step": 8150 }, { "epoch": 2.1588983050847457, "grad_norm": 1.4046038389205933, "learning_rate": 8.920683262711866e-06, "loss": 1.1137, "mean_token_accuracy": 0.7162375301122665, "num_tokens": 6557268.0, "step": 8152 }, { "epoch": 2.159427966101695, "grad_norm": 1.7163219451904297, "learning_rate": 8.92041843220339e-06, "loss": 1.4539, "mean_token_accuracy": 0.6790797971189022, "num_tokens": 6558791.0, "step": 8154 }, { "epoch": 2.159957627118644, "grad_norm": 1.283140778541565, "learning_rate": 8.920153601694917e-06, "loss": 1.2046, "mean_token_accuracy": 0.6889596730470657, "num_tokens": 6561156.0, "step": 8156 }, { "epoch": 2.1604872881355934, "grad_norm": 1.418365716934204, "learning_rate": 8.919888771186442e-06, "loss": 1.3764, "mean_token_accuracy": 0.6825217679142952, "num_tokens": 6562922.0, "step": 8158 }, { "epoch": 2.1610169491525424, "grad_norm": 1.6380776166915894, "learning_rate": 8.919623940677967e-06, "loss": 1.4617, "mean_token_accuracy": 0.6789894625544548, "num_tokens": 6564453.0, "step": 8160 }, { "epoch": 2.1615466101694913, "grad_norm": 1.7271628379821777, "learning_rate": 8.919359110169492e-06, "loss": 1.0515, "mean_token_accuracy": 0.7387828007340431, "num_tokens": 6565900.0, "step": 8162 }, { "epoch": 2.1620762711864407, "grad_norm": 1.8244218826293945, "learning_rate": 8.919094279661019e-06, "loss": 1.6321, "mean_token_accuracy": 0.6683252528309822, "num_tokens": 6567433.0, "step": 8164 }, { "epoch": 2.1626059322033897, "grad_norm": 1.9957247972488403, "learning_rate": 8.918829449152543e-06, "loss": 1.0713, "mean_token_accuracy": 0.7433370649814606, "num_tokens": 6568909.0, "step": 8166 }, { "epoch": 2.163135593220339, "grad_norm": 1.5474584102630615, "learning_rate": 8.918564618644068e-06, "loss": 1.1621, "mean_token_accuracy": 0.7157599329948425, "num_tokens": 6570766.0, "step": 8168 }, { "epoch": 2.163665254237288, "grad_norm": 1.5972654819488525, "learning_rate": 8.918299788135593e-06, "loss": 1.2145, "mean_token_accuracy": 0.7231658846139908, "num_tokens": 6572343.0, "step": 8170 }, { "epoch": 2.1641949152542375, "grad_norm": 1.3717901706695557, "learning_rate": 8.91803495762712e-06, "loss": 0.8446, "mean_token_accuracy": 0.7882556989789009, "num_tokens": 6573879.0, "step": 8172 }, { "epoch": 2.1647245762711864, "grad_norm": 1.7945611476898193, "learning_rate": 8.917770127118645e-06, "loss": 1.7194, "mean_token_accuracy": 0.6242920011281967, "num_tokens": 6575557.0, "step": 8174 }, { "epoch": 2.165254237288136, "grad_norm": 1.8545095920562744, "learning_rate": 8.91750529661017e-06, "loss": 1.4456, "mean_token_accuracy": 0.6841975226998329, "num_tokens": 6577237.0, "step": 8176 }, { "epoch": 2.1657838983050848, "grad_norm": 1.6709153652191162, "learning_rate": 8.917240466101695e-06, "loss": 1.065, "mean_token_accuracy": 0.7543656080961227, "num_tokens": 6578668.0, "step": 8178 }, { "epoch": 2.1663135593220337, "grad_norm": 1.5345160961151123, "learning_rate": 8.916975635593221e-06, "loss": 1.2387, "mean_token_accuracy": 0.7029416114091873, "num_tokens": 6580470.0, "step": 8180 }, { "epoch": 2.166843220338983, "grad_norm": 1.1589051485061646, "learning_rate": 8.916710805084746e-06, "loss": 1.0391, "mean_token_accuracy": 0.7383010759949684, "num_tokens": 6583227.0, "step": 8182 }, { "epoch": 2.167372881355932, "grad_norm": 1.8745040893554688, "learning_rate": 8.916445974576273e-06, "loss": 1.5727, "mean_token_accuracy": 0.6606931239366531, "num_tokens": 6584845.0, "step": 8184 }, { "epoch": 2.1679025423728815, "grad_norm": 1.7513837814331055, "learning_rate": 8.916181144067796e-06, "loss": 1.5189, "mean_token_accuracy": 0.6656245738267899, "num_tokens": 6586444.0, "step": 8186 }, { "epoch": 2.1684322033898304, "grad_norm": 1.739683747291565, "learning_rate": 8.915916313559323e-06, "loss": 1.6242, "mean_token_accuracy": 0.6297882795333862, "num_tokens": 6588135.0, "step": 8188 }, { "epoch": 2.16896186440678, "grad_norm": 1.5888192653656006, "learning_rate": 8.915651483050848e-06, "loss": 1.2367, "mean_token_accuracy": 0.6933599039912224, "num_tokens": 6590075.0, "step": 8190 }, { "epoch": 2.169491525423729, "grad_norm": 1.6163758039474487, "learning_rate": 8.915386652542374e-06, "loss": 1.3509, "mean_token_accuracy": 0.7215175181627274, "num_tokens": 6591848.0, "step": 8192 }, { "epoch": 2.1700211864406778, "grad_norm": 1.5447443723678589, "learning_rate": 8.9151218220339e-06, "loss": 1.3528, "mean_token_accuracy": 0.6796664372086525, "num_tokens": 6593444.0, "step": 8194 }, { "epoch": 2.170550847457627, "grad_norm": 1.8276029825210571, "learning_rate": 8.914856991525424e-06, "loss": 0.9797, "mean_token_accuracy": 0.7598633617162704, "num_tokens": 6594760.0, "step": 8196 }, { "epoch": 2.171080508474576, "grad_norm": 1.4971414804458618, "learning_rate": 8.914592161016949e-06, "loss": 1.2381, "mean_token_accuracy": 0.7202826663851738, "num_tokens": 6596369.0, "step": 8198 }, { "epoch": 2.1716101694915255, "grad_norm": 1.722970962524414, "learning_rate": 8.914327330508476e-06, "loss": 1.0679, "mean_token_accuracy": 0.7495765686035156, "num_tokens": 6598165.0, "step": 8200 }, { "epoch": 2.1721398305084745, "grad_norm": 1.7295840978622437, "learning_rate": 8.9140625e-06, "loss": 1.3352, "mean_token_accuracy": 0.6933890208601952, "num_tokens": 6599898.0, "step": 8202 }, { "epoch": 2.172669491525424, "grad_norm": 1.581964373588562, "learning_rate": 8.913797669491526e-06, "loss": 1.0745, "mean_token_accuracy": 0.716147743165493, "num_tokens": 6601486.0, "step": 8204 }, { "epoch": 2.173199152542373, "grad_norm": 1.9223679304122925, "learning_rate": 8.91353283898305e-06, "loss": 1.8459, "mean_token_accuracy": 0.5809829011559486, "num_tokens": 6603173.0, "step": 8206 }, { "epoch": 2.1737288135593222, "grad_norm": 1.890175461769104, "learning_rate": 8.913268008474577e-06, "loss": 1.54, "mean_token_accuracy": 0.655507929623127, "num_tokens": 6604738.0, "step": 8208 }, { "epoch": 2.174258474576271, "grad_norm": 1.6341830492019653, "learning_rate": 8.913003177966102e-06, "loss": 1.5716, "mean_token_accuracy": 0.6801955923438072, "num_tokens": 6606393.0, "step": 8210 }, { "epoch": 2.17478813559322, "grad_norm": 1.9126416444778442, "learning_rate": 8.912738347457629e-06, "loss": 1.7081, "mean_token_accuracy": 0.6435742974281311, "num_tokens": 6607855.0, "step": 8212 }, { "epoch": 2.1753177966101696, "grad_norm": 1.632424235343933, "learning_rate": 8.912473516949152e-06, "loss": 1.0996, "mean_token_accuracy": 0.7108504325151443, "num_tokens": 6609747.0, "step": 8214 }, { "epoch": 2.1758474576271185, "grad_norm": 1.910785436630249, "learning_rate": 8.912208686440679e-06, "loss": 1.5982, "mean_token_accuracy": 0.6612241603434086, "num_tokens": 6611235.0, "step": 8216 }, { "epoch": 2.176377118644068, "grad_norm": 1.409499168395996, "learning_rate": 8.911943855932204e-06, "loss": 1.274, "mean_token_accuracy": 0.7165522202849388, "num_tokens": 6613036.0, "step": 8218 }, { "epoch": 2.176906779661017, "grad_norm": 1.9602243900299072, "learning_rate": 8.91167902542373e-06, "loss": 1.3119, "mean_token_accuracy": 0.714757889509201, "num_tokens": 6614457.0, "step": 8220 }, { "epoch": 2.1774364406779663, "grad_norm": 1.8354243040084839, "learning_rate": 8.911414194915255e-06, "loss": 1.1119, "mean_token_accuracy": 0.7206361442804337, "num_tokens": 6616005.0, "step": 8222 }, { "epoch": 2.1779661016949152, "grad_norm": 1.388027548789978, "learning_rate": 8.91114936440678e-06, "loss": 0.6503, "mean_token_accuracy": 0.8285311982035637, "num_tokens": 6617172.0, "step": 8224 }, { "epoch": 2.178495762711864, "grad_norm": 2.2356908321380615, "learning_rate": 8.910884533898307e-06, "loss": 1.209, "mean_token_accuracy": 0.7461407408118248, "num_tokens": 6618469.0, "step": 8226 }, { "epoch": 2.1790254237288136, "grad_norm": 1.4968838691711426, "learning_rate": 8.910619703389832e-06, "loss": 1.3328, "mean_token_accuracy": 0.6947222501039505, "num_tokens": 6620238.0, "step": 8228 }, { "epoch": 2.1795550847457625, "grad_norm": 1.7233186960220337, "learning_rate": 8.910354872881356e-06, "loss": 1.2826, "mean_token_accuracy": 0.6959069669246674, "num_tokens": 6621899.0, "step": 8230 }, { "epoch": 2.180084745762712, "grad_norm": 1.9791513681411743, "learning_rate": 8.910090042372881e-06, "loss": 1.469, "mean_token_accuracy": 0.6517442315816879, "num_tokens": 6623446.0, "step": 8232 }, { "epoch": 2.180614406779661, "grad_norm": 2.0773894786834717, "learning_rate": 8.909825211864408e-06, "loss": 1.7764, "mean_token_accuracy": 0.6442415341734886, "num_tokens": 6625081.0, "step": 8234 }, { "epoch": 2.1811440677966103, "grad_norm": 1.4863649606704712, "learning_rate": 8.909560381355933e-06, "loss": 1.2854, "mean_token_accuracy": 0.7086229100823402, "num_tokens": 6626630.0, "step": 8236 }, { "epoch": 2.1816737288135593, "grad_norm": 1.896251916885376, "learning_rate": 8.90929555084746e-06, "loss": 1.6037, "mean_token_accuracy": 0.6449344828724861, "num_tokens": 6628118.0, "step": 8238 }, { "epoch": 2.1822033898305087, "grad_norm": 1.4511382579803467, "learning_rate": 8.909030720338983e-06, "loss": 1.1987, "mean_token_accuracy": 0.7065047919750214, "num_tokens": 6629749.0, "step": 8240 }, { "epoch": 2.1827330508474576, "grad_norm": 1.7479521036148071, "learning_rate": 8.90876588983051e-06, "loss": 1.1946, "mean_token_accuracy": 0.718342587351799, "num_tokens": 6631431.0, "step": 8242 }, { "epoch": 2.1832627118644066, "grad_norm": 1.9469118118286133, "learning_rate": 8.908501059322034e-06, "loss": 1.3477, "mean_token_accuracy": 0.6933163553476334, "num_tokens": 6633075.0, "step": 8244 }, { "epoch": 2.183792372881356, "grad_norm": 2.0946953296661377, "learning_rate": 8.908236228813561e-06, "loss": 1.3967, "mean_token_accuracy": 0.6919237971305847, "num_tokens": 6634619.0, "step": 8246 }, { "epoch": 2.184322033898305, "grad_norm": 1.2845534086227417, "learning_rate": 8.907971398305086e-06, "loss": 0.744, "mean_token_accuracy": 0.8106963559985161, "num_tokens": 6636486.0, "step": 8248 }, { "epoch": 2.1848516949152543, "grad_norm": 1.9819523096084595, "learning_rate": 8.90770656779661e-06, "loss": 1.4224, "step": 8250 }, { "epoch": 2.1848516949152543, "eval_loss": 1.314505696296692, "eval_mean_token_accuracy": 0.6999907768391943, "eval_num_tokens": 6637875.0, "eval_runtime": 49.1955, "eval_samples_per_second": 6.261, "eval_steps_per_second": 6.261, "step": 8250 }, { "epoch": 2.1853813559322033, "grad_norm": 1.9355974197387695, "learning_rate": 8.907441737288136e-06, "loss": 1.8625, "mean_token_accuracy": 0.6521957293152809, "num_tokens": 6639384.0, "step": 8252 }, { "epoch": 2.1859110169491527, "grad_norm": 1.813943862915039, "learning_rate": 8.907176906779662e-06, "loss": 1.0983, "mean_token_accuracy": 0.7562438696622849, "num_tokens": 6640913.0, "step": 8254 }, { "epoch": 2.1864406779661016, "grad_norm": 1.6642991304397583, "learning_rate": 8.906912076271187e-06, "loss": 1.2905, "mean_token_accuracy": 0.7151813991367817, "num_tokens": 6642435.0, "step": 8256 }, { "epoch": 2.186970338983051, "grad_norm": 1.6170880794525146, "learning_rate": 8.906647245762712e-06, "loss": 1.4443, "mean_token_accuracy": 0.6833068057894707, "num_tokens": 6644296.0, "step": 8258 }, { "epoch": 2.1875, "grad_norm": 1.6848255395889282, "learning_rate": 8.906382415254237e-06, "loss": 1.4139, "mean_token_accuracy": 0.6806720793247223, "num_tokens": 6645908.0, "step": 8260 }, { "epoch": 2.188029661016949, "grad_norm": 1.5614306926727295, "learning_rate": 8.906117584745764e-06, "loss": 1.4315, "mean_token_accuracy": 0.6831137612462044, "num_tokens": 6648098.0, "step": 8262 }, { "epoch": 2.1885593220338984, "grad_norm": 1.791408658027649, "learning_rate": 8.905852754237289e-06, "loss": 1.0954, "mean_token_accuracy": 0.7075636982917786, "num_tokens": 6649683.0, "step": 8264 }, { "epoch": 2.1890889830508473, "grad_norm": 1.789131999015808, "learning_rate": 8.905587923728815e-06, "loss": 1.251, "mean_token_accuracy": 0.7179747372865677, "num_tokens": 6651076.0, "step": 8266 }, { "epoch": 2.1896186440677967, "grad_norm": 1.7181224822998047, "learning_rate": 8.905323093220339e-06, "loss": 1.1619, "mean_token_accuracy": 0.722035177052021, "num_tokens": 6652647.0, "step": 8268 }, { "epoch": 2.1901483050847457, "grad_norm": 1.6484780311584473, "learning_rate": 8.905058262711865e-06, "loss": 1.1913, "mean_token_accuracy": 0.7084590494632721, "num_tokens": 6654480.0, "step": 8270 }, { "epoch": 2.190677966101695, "grad_norm": 1.310658574104309, "learning_rate": 8.90479343220339e-06, "loss": 1.2055, "mean_token_accuracy": 0.7034342736005783, "num_tokens": 6656349.0, "step": 8272 }, { "epoch": 2.191207627118644, "grad_norm": 1.9610607624053955, "learning_rate": 8.904528601694917e-06, "loss": 1.1244, "mean_token_accuracy": 0.7564974799752235, "num_tokens": 6657684.0, "step": 8274 }, { "epoch": 2.1917372881355934, "grad_norm": 1.648669958114624, "learning_rate": 8.904263771186442e-06, "loss": 1.3064, "mean_token_accuracy": 0.7010694295167923, "num_tokens": 6659190.0, "step": 8276 }, { "epoch": 2.1922669491525424, "grad_norm": 1.567578673362732, "learning_rate": 8.903998940677967e-06, "loss": 1.3042, "mean_token_accuracy": 0.6837503612041473, "num_tokens": 6660818.0, "step": 8278 }, { "epoch": 2.1927966101694913, "grad_norm": 1.494471549987793, "learning_rate": 8.903734110169492e-06, "loss": 1.3791, "mean_token_accuracy": 0.7148759514093399, "num_tokens": 6662192.0, "step": 8280 }, { "epoch": 2.1933262711864407, "grad_norm": 1.4975433349609375, "learning_rate": 8.903469279661018e-06, "loss": 1.3781, "mean_token_accuracy": 0.7080559656023979, "num_tokens": 6663944.0, "step": 8282 }, { "epoch": 2.1938559322033897, "grad_norm": 1.5787034034729004, "learning_rate": 8.903204449152543e-06, "loss": 1.1848, "mean_token_accuracy": 0.7240853384137154, "num_tokens": 6665638.0, "step": 8284 }, { "epoch": 2.194385593220339, "grad_norm": 1.7746484279632568, "learning_rate": 8.902939618644068e-06, "loss": 1.4202, "mean_token_accuracy": 0.6854310780763626, "num_tokens": 6667314.0, "step": 8286 }, { "epoch": 2.194915254237288, "grad_norm": 1.4904959201812744, "learning_rate": 8.902674788135593e-06, "loss": 1.2215, "mean_token_accuracy": 0.7162632346153259, "num_tokens": 6668919.0, "step": 8288 }, { "epoch": 2.1954449152542375, "grad_norm": 1.5824588537216187, "learning_rate": 8.90240995762712e-06, "loss": 1.0831, "mean_token_accuracy": 0.7562466040253639, "num_tokens": 6670604.0, "step": 8290 }, { "epoch": 2.1959745762711864, "grad_norm": 1.5210894346237183, "learning_rate": 8.902145127118645e-06, "loss": 1.0437, "mean_token_accuracy": 0.763839527964592, "num_tokens": 6672151.0, "step": 8292 }, { "epoch": 2.196504237288136, "grad_norm": 1.8044064044952393, "learning_rate": 8.90188029661017e-06, "loss": 1.5225, "mean_token_accuracy": 0.6660966128110886, "num_tokens": 6673703.0, "step": 8294 }, { "epoch": 2.1970338983050848, "grad_norm": 2.1913273334503174, "learning_rate": 8.901615466101694e-06, "loss": 1.3794, "mean_token_accuracy": 0.6844263300299644, "num_tokens": 6675413.0, "step": 8296 }, { "epoch": 2.1975635593220337, "grad_norm": 2.2227442264556885, "learning_rate": 8.901350635593221e-06, "loss": 1.2549, "mean_token_accuracy": 0.7122565135359764, "num_tokens": 6676864.0, "step": 8298 }, { "epoch": 2.198093220338983, "grad_norm": 1.7670550346374512, "learning_rate": 8.901085805084746e-06, "loss": 1.5462, "mean_token_accuracy": 0.6350067220628262, "num_tokens": 6678602.0, "step": 8300 }, { "epoch": 2.198622881355932, "grad_norm": 1.700573205947876, "learning_rate": 8.900820974576273e-06, "loss": 1.3107, "mean_token_accuracy": 0.7251150906085968, "num_tokens": 6680197.0, "step": 8302 }, { "epoch": 2.1991525423728815, "grad_norm": 1.5799667835235596, "learning_rate": 8.900556144067797e-06, "loss": 1.5625, "mean_token_accuracy": 0.6441710889339447, "num_tokens": 6681885.0, "step": 8304 }, { "epoch": 2.1996822033898304, "grad_norm": 1.4482983350753784, "learning_rate": 8.900291313559322e-06, "loss": 1.4289, "mean_token_accuracy": 0.6936950460076332, "num_tokens": 6683553.0, "step": 8306 }, { "epoch": 2.20021186440678, "grad_norm": 2.0012764930725098, "learning_rate": 8.900026483050849e-06, "loss": 1.7601, "mean_token_accuracy": 0.6329336538910866, "num_tokens": 6685064.0, "step": 8308 }, { "epoch": 2.200741525423729, "grad_norm": 1.744946837425232, "learning_rate": 8.899761652542374e-06, "loss": 1.1964, "mean_token_accuracy": 0.7300570607185364, "num_tokens": 6686636.0, "step": 8310 }, { "epoch": 2.2012711864406778, "grad_norm": 1.533791422843933, "learning_rate": 8.899496822033899e-06, "loss": 1.1545, "mean_token_accuracy": 0.7244354635477066, "num_tokens": 6688467.0, "step": 8312 }, { "epoch": 2.201800847457627, "grad_norm": 1.689210057258606, "learning_rate": 8.899231991525424e-06, "loss": 1.4569, "mean_token_accuracy": 0.6536698564887047, "num_tokens": 6690227.0, "step": 8314 }, { "epoch": 2.202330508474576, "grad_norm": 1.8383867740631104, "learning_rate": 8.89896716101695e-06, "loss": 1.8696, "mean_token_accuracy": 0.6119837164878845, "num_tokens": 6691898.0, "step": 8316 }, { "epoch": 2.2028601694915255, "grad_norm": 2.0521280765533447, "learning_rate": 8.898702330508475e-06, "loss": 1.2813, "mean_token_accuracy": 0.7134240344166756, "num_tokens": 6693446.0, "step": 8318 }, { "epoch": 2.2033898305084745, "grad_norm": 1.8229312896728516, "learning_rate": 8.898437500000002e-06, "loss": 1.6279, "mean_token_accuracy": 0.6457670852541924, "num_tokens": 6694945.0, "step": 8320 }, { "epoch": 2.203919491525424, "grad_norm": 2.002986431121826, "learning_rate": 8.898172669491525e-06, "loss": 1.5876, "mean_token_accuracy": 0.6526765748858452, "num_tokens": 6696502.0, "step": 8322 }, { "epoch": 2.204449152542373, "grad_norm": 1.578999400138855, "learning_rate": 8.897907838983052e-06, "loss": 1.0222, "mean_token_accuracy": 0.7382197678089142, "num_tokens": 6697960.0, "step": 8324 }, { "epoch": 2.2049788135593222, "grad_norm": 1.9033634662628174, "learning_rate": 8.897643008474577e-06, "loss": 1.606, "mean_token_accuracy": 0.6461651623249054, "num_tokens": 6699884.0, "step": 8326 }, { "epoch": 2.205508474576271, "grad_norm": 1.4322102069854736, "learning_rate": 8.897378177966103e-06, "loss": 0.8278, "mean_token_accuracy": 0.7800286412239075, "num_tokens": 6701607.0, "step": 8328 }, { "epoch": 2.20603813559322, "grad_norm": 1.6437320709228516, "learning_rate": 8.897113347457628e-06, "loss": 1.2104, "mean_token_accuracy": 0.7292239740490913, "num_tokens": 6703264.0, "step": 8330 }, { "epoch": 2.2065677966101696, "grad_norm": 1.6841421127319336, "learning_rate": 8.896848516949153e-06, "loss": 1.0646, "mean_token_accuracy": 0.7453481107950211, "num_tokens": 6704980.0, "step": 8332 }, { "epoch": 2.2070974576271185, "grad_norm": 1.7947804927825928, "learning_rate": 8.896583686440678e-06, "loss": 1.3484, "mean_token_accuracy": 0.6718812212347984, "num_tokens": 6706562.0, "step": 8334 }, { "epoch": 2.207627118644068, "grad_norm": 1.872093915939331, "learning_rate": 8.896318855932205e-06, "loss": 1.0404, "mean_token_accuracy": 0.744452990591526, "num_tokens": 6708019.0, "step": 8336 }, { "epoch": 2.208156779661017, "grad_norm": 1.8014508485794067, "learning_rate": 8.89605402542373e-06, "loss": 1.723, "mean_token_accuracy": 0.6074455082416534, "num_tokens": 6709877.0, "step": 8338 }, { "epoch": 2.2086864406779663, "grad_norm": 3.2500574588775635, "learning_rate": 8.895789194915255e-06, "loss": 1.3173, "mean_token_accuracy": 0.698833130300045, "num_tokens": 6711526.0, "step": 8340 }, { "epoch": 2.2092161016949152, "grad_norm": 2.150430917739868, "learning_rate": 8.89552436440678e-06, "loss": 1.3943, "mean_token_accuracy": 0.6845940351486206, "num_tokens": 6712863.0, "step": 8342 }, { "epoch": 2.209745762711864, "grad_norm": 1.4777127504348755, "learning_rate": 8.895259533898306e-06, "loss": 0.9761, "mean_token_accuracy": 0.7536101117730141, "num_tokens": 6714365.0, "step": 8344 }, { "epoch": 2.2102754237288136, "grad_norm": 1.9757730960845947, "learning_rate": 8.894994703389831e-06, "loss": 1.2752, "mean_token_accuracy": 0.7005492486059666, "num_tokens": 6715625.0, "step": 8346 }, { "epoch": 2.2108050847457625, "grad_norm": 1.772946834564209, "learning_rate": 8.894729872881356e-06, "loss": 1.1432, "mean_token_accuracy": 0.7076257839798927, "num_tokens": 6717372.0, "step": 8348 }, { "epoch": 2.211334745762712, "grad_norm": 1.5997428894042969, "learning_rate": 8.894465042372881e-06, "loss": 1.4101, "mean_token_accuracy": 0.6801401898264885, "num_tokens": 6718992.0, "step": 8350 }, { "epoch": 2.211864406779661, "grad_norm": 2.556504487991333, "learning_rate": 8.894200211864408e-06, "loss": 1.308, "mean_token_accuracy": 0.7161480411887169, "num_tokens": 6720448.0, "step": 8352 }, { "epoch": 2.2123940677966103, "grad_norm": 2.05358624458313, "learning_rate": 8.893935381355933e-06, "loss": 1.535, "mean_token_accuracy": 0.645442470908165, "num_tokens": 6722181.0, "step": 8354 }, { "epoch": 2.2129237288135593, "grad_norm": 1.9483871459960938, "learning_rate": 8.89367055084746e-06, "loss": 1.1373, "mean_token_accuracy": 0.7205469384789467, "num_tokens": 6723571.0, "step": 8356 }, { "epoch": 2.2134533898305087, "grad_norm": 1.958161473274231, "learning_rate": 8.893405720338984e-06, "loss": 1.3063, "mean_token_accuracy": 0.7195102870464325, "num_tokens": 6725207.0, "step": 8358 }, { "epoch": 2.2139830508474576, "grad_norm": 2.2764317989349365, "learning_rate": 8.893140889830509e-06, "loss": 1.2157, "mean_token_accuracy": 0.7130291238427162, "num_tokens": 6726754.0, "step": 8360 }, { "epoch": 2.2145127118644066, "grad_norm": 1.5310227870941162, "learning_rate": 8.892876059322034e-06, "loss": 1.0425, "mean_token_accuracy": 0.7410160377621651, "num_tokens": 6728442.0, "step": 8362 }, { "epoch": 2.215042372881356, "grad_norm": 1.4801080226898193, "learning_rate": 8.89261122881356e-06, "loss": 0.934, "mean_token_accuracy": 0.7621708959341049, "num_tokens": 6730119.0, "step": 8364 }, { "epoch": 2.215572033898305, "grad_norm": 1.4555363655090332, "learning_rate": 8.892346398305086e-06, "loss": 1.2605, "mean_token_accuracy": 0.7143339440226555, "num_tokens": 6731721.0, "step": 8366 }, { "epoch": 2.2161016949152543, "grad_norm": 1.4271122217178345, "learning_rate": 8.89208156779661e-06, "loss": 1.4168, "mean_token_accuracy": 0.670974500477314, "num_tokens": 6733453.0, "step": 8368 }, { "epoch": 2.2166313559322033, "grad_norm": 1.8896660804748535, "learning_rate": 8.891816737288135e-06, "loss": 1.2751, "mean_token_accuracy": 0.6966666802763939, "num_tokens": 6735051.0, "step": 8370 }, { "epoch": 2.2171610169491527, "grad_norm": 1.1299325227737427, "learning_rate": 8.891551906779662e-06, "loss": 1.1129, "mean_token_accuracy": 0.7255759090185165, "num_tokens": 6737516.0, "step": 8372 }, { "epoch": 2.2176906779661016, "grad_norm": 1.8690742254257202, "learning_rate": 8.891287076271187e-06, "loss": 1.1315, "mean_token_accuracy": 0.7365809865295887, "num_tokens": 6738944.0, "step": 8374 }, { "epoch": 2.218220338983051, "grad_norm": 1.7584654092788696, "learning_rate": 8.891022245762712e-06, "loss": 1.2744, "mean_token_accuracy": 0.7127504646778107, "num_tokens": 6740453.0, "step": 8376 }, { "epoch": 2.21875, "grad_norm": 2.0693490505218506, "learning_rate": 8.890757415254237e-06, "loss": 1.6423, "mean_token_accuracy": 0.6586962565779686, "num_tokens": 6741835.0, "step": 8378 }, { "epoch": 2.219279661016949, "grad_norm": 1.7841691970825195, "learning_rate": 8.890492584745763e-06, "loss": 1.5012, "mean_token_accuracy": 0.6702368706464767, "num_tokens": 6743244.0, "step": 8380 }, { "epoch": 2.2198093220338984, "grad_norm": 1.5704635381698608, "learning_rate": 8.890227754237288e-06, "loss": 1.363, "mean_token_accuracy": 0.7062028273940086, "num_tokens": 6744914.0, "step": 8382 }, { "epoch": 2.2203389830508473, "grad_norm": 1.6279823780059814, "learning_rate": 8.889962923728815e-06, "loss": 1.3155, "mean_token_accuracy": 0.7247020155191422, "num_tokens": 6746461.0, "step": 8384 }, { "epoch": 2.2208686440677967, "grad_norm": 1.4457929134368896, "learning_rate": 8.88969809322034e-06, "loss": 1.0587, "mean_token_accuracy": 0.7403191030025482, "num_tokens": 6748174.0, "step": 8386 }, { "epoch": 2.2213983050847457, "grad_norm": 1.6120429039001465, "learning_rate": 8.889433262711865e-06, "loss": 1.3256, "mean_token_accuracy": 0.7066061273217201, "num_tokens": 6749728.0, "step": 8388 }, { "epoch": 2.221927966101695, "grad_norm": 1.602920651435852, "learning_rate": 8.889168432203391e-06, "loss": 0.9366, "mean_token_accuracy": 0.7678571715950966, "num_tokens": 6751469.0, "step": 8390 }, { "epoch": 2.222457627118644, "grad_norm": 2.1170332431793213, "learning_rate": 8.888903601694916e-06, "loss": 1.7453, "mean_token_accuracy": 0.6279960572719574, "num_tokens": 6753049.0, "step": 8392 }, { "epoch": 2.2229872881355934, "grad_norm": 1.817014455795288, "learning_rate": 8.888638771186441e-06, "loss": 1.4512, "mean_token_accuracy": 0.6679273098707199, "num_tokens": 6754762.0, "step": 8394 }, { "epoch": 2.2235169491525424, "grad_norm": 2.0290122032165527, "learning_rate": 8.888373940677966e-06, "loss": 1.1524, "mean_token_accuracy": 0.7240764126181602, "num_tokens": 6755904.0, "step": 8396 }, { "epoch": 2.2240466101694913, "grad_norm": 1.3037222623825073, "learning_rate": 8.888109110169493e-06, "loss": 1.4192, "mean_token_accuracy": 0.7070068791508675, "num_tokens": 6757525.0, "step": 8398 }, { "epoch": 2.2245762711864407, "grad_norm": 2.060208797454834, "learning_rate": 8.887844279661018e-06, "loss": 1.5516, "mean_token_accuracy": 0.6588655114173889, "num_tokens": 6759343.0, "step": 8400 }, { "epoch": 2.2251059322033897, "grad_norm": 1.6991162300109863, "learning_rate": 8.887579449152543e-06, "loss": 1.198, "mean_token_accuracy": 0.7242869511246681, "num_tokens": 6760641.0, "step": 8402 }, { "epoch": 2.225635593220339, "grad_norm": 1.2962706089019775, "learning_rate": 8.887314618644068e-06, "loss": 1.2053, "mean_token_accuracy": 0.7404323220252991, "num_tokens": 6762353.0, "step": 8404 }, { "epoch": 2.226165254237288, "grad_norm": 1.9645495414733887, "learning_rate": 8.887049788135594e-06, "loss": 1.4959, "mean_token_accuracy": 0.6585772708058357, "num_tokens": 6764098.0, "step": 8406 }, { "epoch": 2.2266949152542375, "grad_norm": 2.0398504734039307, "learning_rate": 8.88678495762712e-06, "loss": 1.2517, "mean_token_accuracy": 0.7140311226248741, "num_tokens": 6765538.0, "step": 8408 }, { "epoch": 2.2272245762711864, "grad_norm": 1.7005614042282104, "learning_rate": 8.886520127118646e-06, "loss": 1.3477, "mean_token_accuracy": 0.6815667748451233, "num_tokens": 6767306.0, "step": 8410 }, { "epoch": 2.227754237288136, "grad_norm": 1.5916385650634766, "learning_rate": 8.88625529661017e-06, "loss": 1.3424, "mean_token_accuracy": 0.6957708075642586, "num_tokens": 6768767.0, "step": 8412 }, { "epoch": 2.2282838983050848, "grad_norm": 1.707073450088501, "learning_rate": 8.885990466101696e-06, "loss": 1.5615, "mean_token_accuracy": 0.6809554025530815, "num_tokens": 6770201.0, "step": 8414 }, { "epoch": 2.2288135593220337, "grad_norm": 1.5993727445602417, "learning_rate": 8.88572563559322e-06, "loss": 1.386, "mean_token_accuracy": 0.6760594472289085, "num_tokens": 6771679.0, "step": 8416 }, { "epoch": 2.229343220338983, "grad_norm": 1.791278600692749, "learning_rate": 8.885460805084747e-06, "loss": 1.3594, "mean_token_accuracy": 0.7081866413354874, "num_tokens": 6773244.0, "step": 8418 }, { "epoch": 2.229872881355932, "grad_norm": 1.3853071928024292, "learning_rate": 8.885195974576272e-06, "loss": 1.214, "mean_token_accuracy": 0.6972266733646393, "num_tokens": 6775041.0, "step": 8420 }, { "epoch": 2.2304025423728815, "grad_norm": 1.4159493446350098, "learning_rate": 8.884931144067797e-06, "loss": 1.0514, "mean_token_accuracy": 0.7512766495347023, "num_tokens": 6776899.0, "step": 8422 }, { "epoch": 2.2309322033898304, "grad_norm": 1.7458523511886597, "learning_rate": 8.884666313559322e-06, "loss": 1.5232, "mean_token_accuracy": 0.6821293532848358, "num_tokens": 6778574.0, "step": 8424 }, { "epoch": 2.23146186440678, "grad_norm": 2.009188413619995, "learning_rate": 8.884401483050849e-06, "loss": 1.8487, "mean_token_accuracy": 0.6026230305433273, "num_tokens": 6780185.0, "step": 8426 }, { "epoch": 2.231991525423729, "grad_norm": 1.6289368867874146, "learning_rate": 8.884136652542374e-06, "loss": 0.7762, "mean_token_accuracy": 0.7972964718937874, "num_tokens": 6781600.0, "step": 8428 }, { "epoch": 2.2325211864406778, "grad_norm": 1.7636444568634033, "learning_rate": 8.883871822033899e-06, "loss": 1.4093, "mean_token_accuracy": 0.6965540051460266, "num_tokens": 6782898.0, "step": 8430 }, { "epoch": 2.233050847457627, "grad_norm": 1.2971947193145752, "learning_rate": 8.883606991525423e-06, "loss": 1.0605, "mean_token_accuracy": 0.7520215287804604, "num_tokens": 6784611.0, "step": 8432 }, { "epoch": 2.233580508474576, "grad_norm": 1.6250431537628174, "learning_rate": 8.88334216101695e-06, "loss": 1.3464, "mean_token_accuracy": 0.7203768342733383, "num_tokens": 6786172.0, "step": 8434 }, { "epoch": 2.2341101694915255, "grad_norm": 1.6839572191238403, "learning_rate": 8.883077330508475e-06, "loss": 1.0001, "mean_token_accuracy": 0.7630324736237526, "num_tokens": 6787617.0, "step": 8436 }, { "epoch": 2.2346398305084745, "grad_norm": 1.8141636848449707, "learning_rate": 8.882812500000002e-06, "loss": 1.4672, "mean_token_accuracy": 0.6753565594553947, "num_tokens": 6789241.0, "step": 8438 }, { "epoch": 2.235169491525424, "grad_norm": 1.7285298109054565, "learning_rate": 8.882547669491527e-06, "loss": 1.6489, "mean_token_accuracy": 0.6559925898909569, "num_tokens": 6790930.0, "step": 8440 }, { "epoch": 2.235699152542373, "grad_norm": 1.6580668687820435, "learning_rate": 8.882282838983051e-06, "loss": 0.9611, "mean_token_accuracy": 0.7600180506706238, "num_tokens": 6792337.0, "step": 8442 }, { "epoch": 2.2362288135593222, "grad_norm": 1.4362938404083252, "learning_rate": 8.882018008474576e-06, "loss": 0.8849, "mean_token_accuracy": 0.7740409672260284, "num_tokens": 6794079.0, "step": 8444 }, { "epoch": 2.236758474576271, "grad_norm": 1.5634413957595825, "learning_rate": 8.881753177966103e-06, "loss": 1.101, "mean_token_accuracy": 0.7226848155260086, "num_tokens": 6795794.0, "step": 8446 }, { "epoch": 2.23728813559322, "grad_norm": 2.147498369216919, "learning_rate": 8.881488347457628e-06, "loss": 1.399, "mean_token_accuracy": 0.6839876770973206, "num_tokens": 6797269.0, "step": 8448 }, { "epoch": 2.2378177966101696, "grad_norm": 3.288336753845215, "learning_rate": 8.881223516949153e-06, "loss": 1.2954, "mean_token_accuracy": 0.7012826465070248, "num_tokens": 6798965.0, "step": 8450 }, { "epoch": 2.2383474576271185, "grad_norm": 1.8596608638763428, "learning_rate": 8.880958686440678e-06, "loss": 0.9745, "mean_token_accuracy": 0.753777764737606, "num_tokens": 6800443.0, "step": 8452 }, { "epoch": 2.238877118644068, "grad_norm": 1.5801737308502197, "learning_rate": 8.880693855932204e-06, "loss": 1.059, "mean_token_accuracy": 0.7192802652716637, "num_tokens": 6802232.0, "step": 8454 }, { "epoch": 2.239406779661017, "grad_norm": 1.5418058633804321, "learning_rate": 8.88042902542373e-06, "loss": 1.1468, "mean_token_accuracy": 0.7308389320969582, "num_tokens": 6803621.0, "step": 8456 }, { "epoch": 2.2399364406779663, "grad_norm": 1.4939229488372803, "learning_rate": 8.880164194915254e-06, "loss": 1.5936, "mean_token_accuracy": 0.6499641388654709, "num_tokens": 6805551.0, "step": 8458 }, { "epoch": 2.2404661016949152, "grad_norm": 2.040480613708496, "learning_rate": 8.87989936440678e-06, "loss": 1.2619, "mean_token_accuracy": 0.7265590354800224, "num_tokens": 6806973.0, "step": 8460 }, { "epoch": 2.240995762711864, "grad_norm": 1.6701536178588867, "learning_rate": 8.879634533898306e-06, "loss": 1.1637, "mean_token_accuracy": 0.7154671028256416, "num_tokens": 6808572.0, "step": 8462 }, { "epoch": 2.2415254237288136, "grad_norm": 1.9130501747131348, "learning_rate": 8.87936970338983e-06, "loss": 1.3638, "mean_token_accuracy": 0.6807204633951187, "num_tokens": 6810333.0, "step": 8464 }, { "epoch": 2.2420550847457625, "grad_norm": 1.665534257888794, "learning_rate": 8.879104872881357e-06, "loss": 1.3357, "mean_token_accuracy": 0.6950761675834656, "num_tokens": 6812097.0, "step": 8466 }, { "epoch": 2.242584745762712, "grad_norm": 1.7343531847000122, "learning_rate": 8.878840042372882e-06, "loss": 1.0352, "mean_token_accuracy": 0.7430136129260063, "num_tokens": 6813558.0, "step": 8468 }, { "epoch": 2.243114406779661, "grad_norm": 1.6885364055633545, "learning_rate": 8.878575211864407e-06, "loss": 1.7173, "mean_token_accuracy": 0.634737066924572, "num_tokens": 6815529.0, "step": 8470 }, { "epoch": 2.2436440677966103, "grad_norm": 1.4436218738555908, "learning_rate": 8.878310381355934e-06, "loss": 1.1607, "mean_token_accuracy": 0.7410688176751137, "num_tokens": 6817253.0, "step": 8472 }, { "epoch": 2.2441737288135593, "grad_norm": 1.8200500011444092, "learning_rate": 8.878045550847459e-06, "loss": 1.4063, "mean_token_accuracy": 0.6956204921007156, "num_tokens": 6818776.0, "step": 8474 }, { "epoch": 2.2447033898305087, "grad_norm": 2.1053574085235596, "learning_rate": 8.877780720338984e-06, "loss": 1.933, "mean_token_accuracy": 0.6134576983749866, "num_tokens": 6820301.0, "step": 8476 }, { "epoch": 2.2452330508474576, "grad_norm": 1.6897351741790771, "learning_rate": 8.877515889830509e-06, "loss": 1.1011, "mean_token_accuracy": 0.7635720297694206, "num_tokens": 6821769.0, "step": 8478 }, { "epoch": 2.2457627118644066, "grad_norm": 1.6533995866775513, "learning_rate": 8.877251059322035e-06, "loss": 1.4318, "mean_token_accuracy": 0.6918177157640457, "num_tokens": 6823419.0, "step": 8480 }, { "epoch": 2.246292372881356, "grad_norm": 1.489845871925354, "learning_rate": 8.87698622881356e-06, "loss": 1.6074, "mean_token_accuracy": 0.6413723081350327, "num_tokens": 6825405.0, "step": 8482 }, { "epoch": 2.246822033898305, "grad_norm": 1.6408113241195679, "learning_rate": 8.876721398305085e-06, "loss": 0.9569, "mean_token_accuracy": 0.7593488246202469, "num_tokens": 6827018.0, "step": 8484 }, { "epoch": 2.2473516949152543, "grad_norm": 1.8027925491333008, "learning_rate": 8.87645656779661e-06, "loss": 1.331, "mean_token_accuracy": 0.6801471635699272, "num_tokens": 6828731.0, "step": 8486 }, { "epoch": 2.2478813559322033, "grad_norm": 1.6754850149154663, "learning_rate": 8.876191737288137e-06, "loss": 1.5165, "mean_token_accuracy": 0.6603657156229019, "num_tokens": 6830169.0, "step": 8488 }, { "epoch": 2.2484110169491527, "grad_norm": 1.581291913986206, "learning_rate": 8.875926906779662e-06, "loss": 1.5527, "mean_token_accuracy": 0.6793844252824783, "num_tokens": 6831393.0, "step": 8490 }, { "epoch": 2.2489406779661016, "grad_norm": 1.99314284324646, "learning_rate": 8.875662076271188e-06, "loss": 1.3043, "mean_token_accuracy": 0.7315346077084541, "num_tokens": 6832737.0, "step": 8492 }, { "epoch": 2.249470338983051, "grad_norm": 1.8014028072357178, "learning_rate": 8.875397245762713e-06, "loss": 1.2492, "mean_token_accuracy": 0.6908949688076973, "num_tokens": 6834171.0, "step": 8494 }, { "epoch": 2.25, "grad_norm": 1.5653588771820068, "learning_rate": 8.875132415254238e-06, "loss": 0.897, "mean_token_accuracy": 0.7785683050751686, "num_tokens": 6835489.0, "step": 8496 }, { "epoch": 2.250529661016949, "grad_norm": 1.8570351600646973, "learning_rate": 8.874867584745763e-06, "loss": 1.4551, "mean_token_accuracy": 0.6888885423541069, "num_tokens": 6837066.0, "step": 8498 }, { "epoch": 2.2510593220338984, "grad_norm": 1.6486470699310303, "learning_rate": 8.87460275423729e-06, "loss": 1.2338, "step": 8500 }, { "epoch": 2.2510593220338984, "eval_loss": 1.3139780759811401, "eval_mean_token_accuracy": 0.7006534036103781, "eval_num_tokens": 6838417.0, "eval_runtime": 48.2418, "eval_samples_per_second": 6.385, "eval_steps_per_second": 6.385, "step": 8500 }, { "epoch": 2.2515889830508473, "grad_norm": 2.072356700897217, "learning_rate": 8.874337923728815e-06, "loss": 1.8499, "mean_token_accuracy": 0.6447476334869862, "num_tokens": 6840381.0, "step": 8502 }, { "epoch": 2.2521186440677967, "grad_norm": 1.820656418800354, "learning_rate": 8.87407309322034e-06, "loss": 1.7902, "mean_token_accuracy": 0.6185216903686523, "num_tokens": 6842073.0, "step": 8504 }, { "epoch": 2.2526483050847457, "grad_norm": 1.9495844841003418, "learning_rate": 8.873808262711864e-06, "loss": 0.9683, "mean_token_accuracy": 0.7695955485105515, "num_tokens": 6843614.0, "step": 8506 }, { "epoch": 2.253177966101695, "grad_norm": 1.2763789892196655, "learning_rate": 8.873543432203391e-06, "loss": 1.2697, "mean_token_accuracy": 0.6884602755308151, "num_tokens": 6846031.0, "step": 8508 }, { "epoch": 2.253707627118644, "grad_norm": 1.7718828916549683, "learning_rate": 8.873278601694916e-06, "loss": 1.2786, "mean_token_accuracy": 0.7179745510220528, "num_tokens": 6847715.0, "step": 8510 }, { "epoch": 2.2542372881355934, "grad_norm": 2.5797512531280518, "learning_rate": 8.873013771186441e-06, "loss": 1.4458, "mean_token_accuracy": 0.6762012392282486, "num_tokens": 6849275.0, "step": 8512 }, { "epoch": 2.2547669491525424, "grad_norm": 1.4379401206970215, "learning_rate": 8.872748940677966e-06, "loss": 1.3876, "mean_token_accuracy": 0.6830110847949982, "num_tokens": 6851153.0, "step": 8514 }, { "epoch": 2.2552966101694913, "grad_norm": 1.6671664714813232, "learning_rate": 8.872484110169492e-06, "loss": 1.45, "mean_token_accuracy": 0.6780624985694885, "num_tokens": 6852771.0, "step": 8516 }, { "epoch": 2.2558262711864407, "grad_norm": 1.7908276319503784, "learning_rate": 8.872219279661017e-06, "loss": 1.7087, "mean_token_accuracy": 0.6302923187613487, "num_tokens": 6854389.0, "step": 8518 }, { "epoch": 2.2563559322033897, "grad_norm": 2.2101006507873535, "learning_rate": 8.871954449152544e-06, "loss": 1.5907, "mean_token_accuracy": 0.6493700072169304, "num_tokens": 6855838.0, "step": 8520 }, { "epoch": 2.256885593220339, "grad_norm": 1.3958632946014404, "learning_rate": 8.871689618644069e-06, "loss": 1.051, "mean_token_accuracy": 0.7788814827799797, "num_tokens": 6857390.0, "step": 8522 }, { "epoch": 2.257415254237288, "grad_norm": 1.6144057512283325, "learning_rate": 8.871424788135594e-06, "loss": 1.3569, "mean_token_accuracy": 0.6979567408561707, "num_tokens": 6858960.0, "step": 8524 }, { "epoch": 2.2579449152542375, "grad_norm": 1.6066478490829468, "learning_rate": 8.871159957627119e-06, "loss": 1.3318, "mean_token_accuracy": 0.6966055855154991, "num_tokens": 6860312.0, "step": 8526 }, { "epoch": 2.2584745762711864, "grad_norm": 1.7021125555038452, "learning_rate": 8.870895127118645e-06, "loss": 1.3771, "mean_token_accuracy": 0.6981335133314133, "num_tokens": 6862100.0, "step": 8528 }, { "epoch": 2.259004237288136, "grad_norm": 1.6871631145477295, "learning_rate": 8.87063029661017e-06, "loss": 1.5034, "mean_token_accuracy": 0.6705206632614136, "num_tokens": 6863788.0, "step": 8530 }, { "epoch": 2.2595338983050848, "grad_norm": 1.8826030492782593, "learning_rate": 8.870365466101695e-06, "loss": 1.2667, "mean_token_accuracy": 0.7136383429169655, "num_tokens": 6865167.0, "step": 8532 }, { "epoch": 2.2600635593220337, "grad_norm": 1.7483879327774048, "learning_rate": 8.87010063559322e-06, "loss": 1.461, "mean_token_accuracy": 0.6915972605347633, "num_tokens": 6866688.0, "step": 8534 }, { "epoch": 2.260593220338983, "grad_norm": 1.4420915842056274, "learning_rate": 8.869835805084747e-06, "loss": 0.8948, "mean_token_accuracy": 0.7660421803593636, "num_tokens": 6868440.0, "step": 8536 }, { "epoch": 2.261122881355932, "grad_norm": 1.5304197072982788, "learning_rate": 8.869570974576272e-06, "loss": 0.9878, "mean_token_accuracy": 0.7572097554802895, "num_tokens": 6869946.0, "step": 8538 }, { "epoch": 2.2616525423728815, "grad_norm": 1.4894825220108032, "learning_rate": 8.869306144067797e-06, "loss": 1.011, "mean_token_accuracy": 0.76646538823843, "num_tokens": 6871575.0, "step": 8540 }, { "epoch": 2.2621822033898304, "grad_norm": 1.7122392654418945, "learning_rate": 8.869041313559322e-06, "loss": 1.335, "mean_token_accuracy": 0.6955787762999535, "num_tokens": 6873133.0, "step": 8542 }, { "epoch": 2.26271186440678, "grad_norm": 1.8765674829483032, "learning_rate": 8.868776483050848e-06, "loss": 1.4089, "mean_token_accuracy": 0.6630347669124603, "num_tokens": 6874797.0, "step": 8544 }, { "epoch": 2.263241525423729, "grad_norm": 1.888830542564392, "learning_rate": 8.868511652542373e-06, "loss": 1.3046, "mean_token_accuracy": 0.7269761189818382, "num_tokens": 6876181.0, "step": 8546 }, { "epoch": 2.263771186440678, "grad_norm": 1.4499149322509766, "learning_rate": 8.8682468220339e-06, "loss": 1.2272, "mean_token_accuracy": 0.7347927615046501, "num_tokens": 6877822.0, "step": 8548 }, { "epoch": 2.264300847457627, "grad_norm": 1.6120222806930542, "learning_rate": 8.867981991525425e-06, "loss": 1.0973, "mean_token_accuracy": 0.7479224428534508, "num_tokens": 6879511.0, "step": 8550 }, { "epoch": 2.264830508474576, "grad_norm": 1.5936567783355713, "learning_rate": 8.86771716101695e-06, "loss": 1.2505, "mean_token_accuracy": 0.7264874801039696, "num_tokens": 6881168.0, "step": 8552 }, { "epoch": 2.2653601694915255, "grad_norm": 1.9319788217544556, "learning_rate": 8.867452330508475e-06, "loss": 1.2839, "mean_token_accuracy": 0.7101799324154854, "num_tokens": 6882725.0, "step": 8554 }, { "epoch": 2.2658898305084745, "grad_norm": 1.768946647644043, "learning_rate": 8.867187500000001e-06, "loss": 1.1689, "mean_token_accuracy": 0.7359309494495392, "num_tokens": 6884396.0, "step": 8556 }, { "epoch": 2.266419491525424, "grad_norm": 1.8384960889816284, "learning_rate": 8.866922669491526e-06, "loss": 1.4827, "mean_token_accuracy": 0.6724418103694916, "num_tokens": 6885856.0, "step": 8558 }, { "epoch": 2.266949152542373, "grad_norm": 1.5529093742370605, "learning_rate": 8.866657838983051e-06, "loss": 1.0322, "mean_token_accuracy": 0.7487656846642494, "num_tokens": 6887368.0, "step": 8560 }, { "epoch": 2.267478813559322, "grad_norm": 1.5001649856567383, "learning_rate": 8.866393008474578e-06, "loss": 1.3592, "mean_token_accuracy": 0.6939818412065506, "num_tokens": 6888886.0, "step": 8562 }, { "epoch": 2.268008474576271, "grad_norm": 1.8179378509521484, "learning_rate": 8.866128177966103e-06, "loss": 1.2538, "mean_token_accuracy": 0.7348556332290173, "num_tokens": 6890430.0, "step": 8564 }, { "epoch": 2.26853813559322, "grad_norm": 1.275793433189392, "learning_rate": 8.865863347457628e-06, "loss": 1.3305, "mean_token_accuracy": 0.6804038733243942, "num_tokens": 6892804.0, "step": 8566 }, { "epoch": 2.2690677966101696, "grad_norm": 1.9016183614730835, "learning_rate": 8.865598516949153e-06, "loss": 1.4781, "mean_token_accuracy": 0.6805231049656868, "num_tokens": 6894325.0, "step": 8568 }, { "epoch": 2.2695974576271185, "grad_norm": 1.8403366804122925, "learning_rate": 8.865333686440679e-06, "loss": 1.1497, "mean_token_accuracy": 0.7243361696600914, "num_tokens": 6896046.0, "step": 8570 }, { "epoch": 2.270127118644068, "grad_norm": 2.060295820236206, "learning_rate": 8.865068855932204e-06, "loss": 1.3424, "mean_token_accuracy": 0.6835847795009613, "num_tokens": 6897472.0, "step": 8572 }, { "epoch": 2.270656779661017, "grad_norm": 1.7182481288909912, "learning_rate": 8.86480402542373e-06, "loss": 1.5505, "mean_token_accuracy": 0.6608857586979866, "num_tokens": 6899219.0, "step": 8574 }, { "epoch": 2.2711864406779663, "grad_norm": 1.6966116428375244, "learning_rate": 8.864539194915256e-06, "loss": 1.4023, "mean_token_accuracy": 0.6980183906853199, "num_tokens": 6900648.0, "step": 8576 }, { "epoch": 2.2717161016949152, "grad_norm": 1.5253512859344482, "learning_rate": 8.86427436440678e-06, "loss": 1.1054, "mean_token_accuracy": 0.7456996142864227, "num_tokens": 6902250.0, "step": 8578 }, { "epoch": 2.272245762711864, "grad_norm": 1.6158756017684937, "learning_rate": 8.864009533898305e-06, "loss": 1.5178, "mean_token_accuracy": 0.6402509473264217, "num_tokens": 6903900.0, "step": 8580 }, { "epoch": 2.2727754237288136, "grad_norm": 1.7281596660614014, "learning_rate": 8.863744703389832e-06, "loss": 1.5463, "mean_token_accuracy": 0.6472237706184387, "num_tokens": 6905371.0, "step": 8582 }, { "epoch": 2.2733050847457625, "grad_norm": 1.7730469703674316, "learning_rate": 8.863479872881357e-06, "loss": 1.3897, "mean_token_accuracy": 0.6698442026972771, "num_tokens": 6907091.0, "step": 8584 }, { "epoch": 2.273834745762712, "grad_norm": 1.7900289297103882, "learning_rate": 8.863215042372882e-06, "loss": 1.3529, "mean_token_accuracy": 0.685039222240448, "num_tokens": 6908737.0, "step": 8586 }, { "epoch": 2.274364406779661, "grad_norm": 1.824726939201355, "learning_rate": 8.862950211864407e-06, "loss": 1.3226, "mean_token_accuracy": 0.7094137221574783, "num_tokens": 6910088.0, "step": 8588 }, { "epoch": 2.2748940677966103, "grad_norm": 1.751973271369934, "learning_rate": 8.862685381355934e-06, "loss": 1.4284, "mean_token_accuracy": 0.6863544061779976, "num_tokens": 6911658.0, "step": 8590 }, { "epoch": 2.2754237288135593, "grad_norm": 1.7299312353134155, "learning_rate": 8.862420550847458e-06, "loss": 1.2637, "mean_token_accuracy": 0.70976672321558, "num_tokens": 6913079.0, "step": 8592 }, { "epoch": 2.2759533898305087, "grad_norm": 1.4960265159606934, "learning_rate": 8.862155720338983e-06, "loss": 1.2868, "mean_token_accuracy": 0.718329519033432, "num_tokens": 6914731.0, "step": 8594 }, { "epoch": 2.2764830508474576, "grad_norm": 1.516669511795044, "learning_rate": 8.861890889830508e-06, "loss": 0.9013, "mean_token_accuracy": 0.7864948362112045, "num_tokens": 6916398.0, "step": 8596 }, { "epoch": 2.2770127118644066, "grad_norm": 1.8814152479171753, "learning_rate": 8.861626059322035e-06, "loss": 1.3085, "mean_token_accuracy": 0.7078345790505409, "num_tokens": 6917963.0, "step": 8598 }, { "epoch": 2.277542372881356, "grad_norm": 2.063432216644287, "learning_rate": 8.86136122881356e-06, "loss": 1.1128, "mean_token_accuracy": 0.7415951490402222, "num_tokens": 6919331.0, "step": 8600 }, { "epoch": 2.278072033898305, "grad_norm": 1.9633086919784546, "learning_rate": 8.861096398305086e-06, "loss": 1.1806, "mean_token_accuracy": 0.7083720788359642, "num_tokens": 6920769.0, "step": 8602 }, { "epoch": 2.2786016949152543, "grad_norm": 1.2892121076583862, "learning_rate": 8.860831567796611e-06, "loss": 0.8106, "mean_token_accuracy": 0.7901059314608574, "num_tokens": 6922392.0, "step": 8604 }, { "epoch": 2.2791313559322033, "grad_norm": 1.7217239141464233, "learning_rate": 8.860566737288136e-06, "loss": 1.261, "mean_token_accuracy": 0.727351151406765, "num_tokens": 6923901.0, "step": 8606 }, { "epoch": 2.2796610169491527, "grad_norm": 1.5432785749435425, "learning_rate": 8.860301906779661e-06, "loss": 1.4461, "mean_token_accuracy": 0.6899721249938011, "num_tokens": 6925644.0, "step": 8608 }, { "epoch": 2.2801906779661016, "grad_norm": 1.978810429573059, "learning_rate": 8.860037076271188e-06, "loss": 1.0565, "mean_token_accuracy": 0.769732803106308, "num_tokens": 6927073.0, "step": 8610 }, { "epoch": 2.280720338983051, "grad_norm": 1.3163422346115112, "learning_rate": 8.859772245762713e-06, "loss": 1.3025, "mean_token_accuracy": 0.6888119019567966, "num_tokens": 6929039.0, "step": 8612 }, { "epoch": 2.28125, "grad_norm": 1.5480679273605347, "learning_rate": 8.859507415254238e-06, "loss": 1.2409, "mean_token_accuracy": 0.7264309078454971, "num_tokens": 6930667.0, "step": 8614 }, { "epoch": 2.281779661016949, "grad_norm": 1.9180306196212769, "learning_rate": 8.859242584745763e-06, "loss": 1.2208, "mean_token_accuracy": 0.6888343691825867, "num_tokens": 6932174.0, "step": 8616 }, { "epoch": 2.2823093220338984, "grad_norm": 1.7428665161132812, "learning_rate": 8.85897775423729e-06, "loss": 1.3124, "mean_token_accuracy": 0.6986287385225296, "num_tokens": 6933797.0, "step": 8618 }, { "epoch": 2.2828389830508473, "grad_norm": 1.9885873794555664, "learning_rate": 8.858712923728814e-06, "loss": 1.2574, "mean_token_accuracy": 0.707438051700592, "num_tokens": 6935416.0, "step": 8620 }, { "epoch": 2.2833686440677967, "grad_norm": 1.921641230583191, "learning_rate": 8.858448093220339e-06, "loss": 1.6126, "mean_token_accuracy": 0.6729678884148598, "num_tokens": 6936865.0, "step": 8622 }, { "epoch": 2.2838983050847457, "grad_norm": 1.5569396018981934, "learning_rate": 8.858183262711864e-06, "loss": 1.3135, "mean_token_accuracy": 0.7463125362992287, "num_tokens": 6938434.0, "step": 8624 }, { "epoch": 2.284427966101695, "grad_norm": 1.6099351644515991, "learning_rate": 8.85791843220339e-06, "loss": 1.1577, "mean_token_accuracy": 0.7202639952301979, "num_tokens": 6940324.0, "step": 8626 }, { "epoch": 2.284957627118644, "grad_norm": 1.7570321559906006, "learning_rate": 8.857653601694916e-06, "loss": 1.1362, "mean_token_accuracy": 0.7380736619234085, "num_tokens": 6942069.0, "step": 8628 }, { "epoch": 2.2854872881355934, "grad_norm": 1.7720894813537598, "learning_rate": 8.857388771186442e-06, "loss": 1.3078, "mean_token_accuracy": 0.6781072989106178, "num_tokens": 6943859.0, "step": 8630 }, { "epoch": 2.2860169491525424, "grad_norm": 1.5681713819503784, "learning_rate": 8.857123940677965e-06, "loss": 1.1661, "mean_token_accuracy": 0.7181467115879059, "num_tokens": 6945726.0, "step": 8632 }, { "epoch": 2.2865466101694913, "grad_norm": 1.5041003227233887, "learning_rate": 8.856859110169492e-06, "loss": 1.1884, "mean_token_accuracy": 0.7380094453692436, "num_tokens": 6947472.0, "step": 8634 }, { "epoch": 2.2870762711864407, "grad_norm": 1.4998974800109863, "learning_rate": 8.856594279661017e-06, "loss": 1.1472, "mean_token_accuracy": 0.7502269223332405, "num_tokens": 6949137.0, "step": 8636 }, { "epoch": 2.2876059322033897, "grad_norm": 1.880180835723877, "learning_rate": 8.856329449152544e-06, "loss": 1.178, "mean_token_accuracy": 0.7083131149411201, "num_tokens": 6950955.0, "step": 8638 }, { "epoch": 2.288135593220339, "grad_norm": 1.7625949382781982, "learning_rate": 8.856064618644069e-06, "loss": 1.2168, "mean_token_accuracy": 0.7114936336874962, "num_tokens": 6952496.0, "step": 8640 }, { "epoch": 2.288665254237288, "grad_norm": 1.6513819694519043, "learning_rate": 8.855799788135594e-06, "loss": 1.4683, "mean_token_accuracy": 0.6542350798845291, "num_tokens": 6954189.0, "step": 8642 }, { "epoch": 2.2891949152542375, "grad_norm": 1.5866155624389648, "learning_rate": 8.85553495762712e-06, "loss": 1.2673, "mean_token_accuracy": 0.7246931195259094, "num_tokens": 6955946.0, "step": 8644 }, { "epoch": 2.2897245762711864, "grad_norm": 1.7364219427108765, "learning_rate": 8.855270127118645e-06, "loss": 1.3485, "mean_token_accuracy": 0.7020581141114235, "num_tokens": 6957422.0, "step": 8646 }, { "epoch": 2.290254237288136, "grad_norm": 1.5266908407211304, "learning_rate": 8.85500529661017e-06, "loss": 1.1868, "mean_token_accuracy": 0.7189556024968624, "num_tokens": 6958961.0, "step": 8648 }, { "epoch": 2.2907838983050848, "grad_norm": 1.925618052482605, "learning_rate": 8.854740466101695e-06, "loss": 1.3606, "mean_token_accuracy": 0.6709893867373466, "num_tokens": 6960378.0, "step": 8650 }, { "epoch": 2.2913135593220337, "grad_norm": 2.4875075817108154, "learning_rate": 8.854475635593222e-06, "loss": 1.1055, "mean_token_accuracy": 0.7241089716553688, "num_tokens": 6961705.0, "step": 8652 }, { "epoch": 2.291843220338983, "grad_norm": 1.8473429679870605, "learning_rate": 8.854210805084746e-06, "loss": 1.3115, "mean_token_accuracy": 0.6955955922603607, "num_tokens": 6963370.0, "step": 8654 }, { "epoch": 2.292372881355932, "grad_norm": 1.9752321243286133, "learning_rate": 8.853945974576273e-06, "loss": 1.4857, "mean_token_accuracy": 0.6686709299683571, "num_tokens": 6965013.0, "step": 8656 }, { "epoch": 2.2929025423728815, "grad_norm": 2.367250919342041, "learning_rate": 8.853681144067798e-06, "loss": 1.1501, "mean_token_accuracy": 0.7411537319421768, "num_tokens": 6966460.0, "step": 8658 }, { "epoch": 2.2934322033898304, "grad_norm": 1.3568437099456787, "learning_rate": 8.853416313559323e-06, "loss": 1.4455, "mean_token_accuracy": 0.6917849704623222, "num_tokens": 6969293.0, "step": 8660 }, { "epoch": 2.29396186440678, "grad_norm": 1.572662115097046, "learning_rate": 8.853151483050848e-06, "loss": 1.1073, "mean_token_accuracy": 0.7533335983753204, "num_tokens": 6970925.0, "step": 8662 }, { "epoch": 2.294491525423729, "grad_norm": 1.7178301811218262, "learning_rate": 8.852886652542375e-06, "loss": 1.3369, "mean_token_accuracy": 0.6927498057484627, "num_tokens": 6972495.0, "step": 8664 }, { "epoch": 2.295021186440678, "grad_norm": 1.4898369312286377, "learning_rate": 8.8526218220339e-06, "loss": 1.0521, "mean_token_accuracy": 0.759884312748909, "num_tokens": 6973968.0, "step": 8666 }, { "epoch": 2.295550847457627, "grad_norm": 1.8231537342071533, "learning_rate": 8.852356991525424e-06, "loss": 1.3582, "mean_token_accuracy": 0.6639577113091946, "num_tokens": 6975872.0, "step": 8668 }, { "epoch": 2.296080508474576, "grad_norm": 1.6346423625946045, "learning_rate": 8.85209216101695e-06, "loss": 1.2753, "mean_token_accuracy": 0.6996329724788666, "num_tokens": 6977454.0, "step": 8670 }, { "epoch": 2.2966101694915255, "grad_norm": 1.6681138277053833, "learning_rate": 8.851827330508476e-06, "loss": 1.2642, "mean_token_accuracy": 0.7239865511655807, "num_tokens": 6978919.0, "step": 8672 }, { "epoch": 2.2971398305084745, "grad_norm": 1.619310975074768, "learning_rate": 8.851562500000001e-06, "loss": 1.1039, "mean_token_accuracy": 0.7574589177966118, "num_tokens": 6980328.0, "step": 8674 }, { "epoch": 2.297669491525424, "grad_norm": 1.6875741481781006, "learning_rate": 8.851297669491526e-06, "loss": 1.4643, "mean_token_accuracy": 0.6522873863577843, "num_tokens": 6982162.0, "step": 8676 }, { "epoch": 2.298199152542373, "grad_norm": 1.5795687437057495, "learning_rate": 8.85103283898305e-06, "loss": 1.465, "mean_token_accuracy": 0.6592653580009937, "num_tokens": 6983900.0, "step": 8678 }, { "epoch": 2.298728813559322, "grad_norm": 1.9766825437545776, "learning_rate": 8.850768008474577e-06, "loss": 1.6794, "mean_token_accuracy": 0.6290063112974167, "num_tokens": 6985273.0, "step": 8680 }, { "epoch": 2.299258474576271, "grad_norm": 2.134037494659424, "learning_rate": 8.850503177966102e-06, "loss": 1.7418, "mean_token_accuracy": 0.6443088054656982, "num_tokens": 6986941.0, "step": 8682 }, { "epoch": 2.29978813559322, "grad_norm": 1.3915128707885742, "learning_rate": 8.850238347457629e-06, "loss": 1.1207, "mean_token_accuracy": 0.7309376373887062, "num_tokens": 6988445.0, "step": 8684 }, { "epoch": 2.3003177966101696, "grad_norm": 1.4254961013793945, "learning_rate": 8.849973516949152e-06, "loss": 1.0153, "mean_token_accuracy": 0.7471297606825829, "num_tokens": 6990085.0, "step": 8686 }, { "epoch": 2.3008474576271185, "grad_norm": 1.6671223640441895, "learning_rate": 8.849708686440679e-06, "loss": 1.0077, "mean_token_accuracy": 0.7437949925661087, "num_tokens": 6991502.0, "step": 8688 }, { "epoch": 2.301377118644068, "grad_norm": 1.3407765626907349, "learning_rate": 8.849443855932204e-06, "loss": 1.547, "mean_token_accuracy": 0.6523257791996002, "num_tokens": 6993086.0, "step": 8690 }, { "epoch": 2.301906779661017, "grad_norm": 1.6596331596374512, "learning_rate": 8.84917902542373e-06, "loss": 1.3012, "mean_token_accuracy": 0.6850642412900925, "num_tokens": 6994652.0, "step": 8692 }, { "epoch": 2.3024364406779663, "grad_norm": 1.4772621393203735, "learning_rate": 8.848914194915255e-06, "loss": 0.9266, "mean_token_accuracy": 0.7602907046675682, "num_tokens": 6996301.0, "step": 8694 }, { "epoch": 2.3029661016949152, "grad_norm": 1.8263731002807617, "learning_rate": 8.84864936440678e-06, "loss": 1.3315, "mean_token_accuracy": 0.6830111965537071, "num_tokens": 6998069.0, "step": 8696 }, { "epoch": 2.303495762711864, "grad_norm": 1.920238971710205, "learning_rate": 8.848384533898305e-06, "loss": 1.0922, "mean_token_accuracy": 0.7441611886024475, "num_tokens": 6999781.0, "step": 8698 }, { "epoch": 2.3040254237288136, "grad_norm": 1.5974003076553345, "learning_rate": 8.848119703389832e-06, "loss": 1.2709, "mean_token_accuracy": 0.7031638398766518, "num_tokens": 7001395.0, "step": 8700 }, { "epoch": 2.3045550847457625, "grad_norm": 1.8376015424728394, "learning_rate": 8.847854872881357e-06, "loss": 1.3408, "mean_token_accuracy": 0.6851415634155273, "num_tokens": 7003007.0, "step": 8702 }, { "epoch": 2.305084745762712, "grad_norm": 1.9108250141143799, "learning_rate": 8.847590042372882e-06, "loss": 0.8586, "mean_token_accuracy": 0.7928019315004349, "num_tokens": 7004881.0, "step": 8704 }, { "epoch": 2.305614406779661, "grad_norm": 1.5426313877105713, "learning_rate": 8.847325211864406e-06, "loss": 1.4781, "mean_token_accuracy": 0.6844632774591446, "num_tokens": 7006628.0, "step": 8706 }, { "epoch": 2.3061440677966103, "grad_norm": 1.2274092435836792, "learning_rate": 8.847060381355933e-06, "loss": 1.1192, "mean_token_accuracy": 0.7382764294743538, "num_tokens": 7008086.0, "step": 8708 }, { "epoch": 2.3066737288135593, "grad_norm": 1.3517374992370605, "learning_rate": 8.846795550847458e-06, "loss": 1.1346, "mean_token_accuracy": 0.7261626794934273, "num_tokens": 7009840.0, "step": 8710 }, { "epoch": 2.3072033898305087, "grad_norm": 1.430992603302002, "learning_rate": 8.846530720338985e-06, "loss": 0.9616, "mean_token_accuracy": 0.7522506043314934, "num_tokens": 7011710.0, "step": 8712 }, { "epoch": 2.3077330508474576, "grad_norm": 1.7278889417648315, "learning_rate": 8.846265889830508e-06, "loss": 0.9523, "mean_token_accuracy": 0.7707125321030617, "num_tokens": 7013278.0, "step": 8714 }, { "epoch": 2.3082627118644066, "grad_norm": 1.7362926006317139, "learning_rate": 8.846001059322035e-06, "loss": 1.3023, "mean_token_accuracy": 0.6998953297734261, "num_tokens": 7015156.0, "step": 8716 }, { "epoch": 2.308792372881356, "grad_norm": 1.983733892440796, "learning_rate": 8.84573622881356e-06, "loss": 1.7271, "mean_token_accuracy": 0.6154942885041237, "num_tokens": 7016773.0, "step": 8718 }, { "epoch": 2.309322033898305, "grad_norm": 1.6533979177474976, "learning_rate": 8.845471398305086e-06, "loss": 1.0097, "mean_token_accuracy": 0.7505990564823151, "num_tokens": 7018366.0, "step": 8720 }, { "epoch": 2.3098516949152543, "grad_norm": 1.70695161819458, "learning_rate": 8.845206567796611e-06, "loss": 1.3777, "mean_token_accuracy": 0.6816442534327507, "num_tokens": 7020120.0, "step": 8722 }, { "epoch": 2.3103813559322033, "grad_norm": 1.5670969486236572, "learning_rate": 8.844941737288136e-06, "loss": 1.2221, "mean_token_accuracy": 0.7075217366218567, "num_tokens": 7021969.0, "step": 8724 }, { "epoch": 2.3109110169491527, "grad_norm": 1.5046892166137695, "learning_rate": 8.844676906779663e-06, "loss": 1.3222, "mean_token_accuracy": 0.7046268992125988, "num_tokens": 7023498.0, "step": 8726 }, { "epoch": 2.3114406779661016, "grad_norm": 1.86384117603302, "learning_rate": 8.844412076271187e-06, "loss": 1.3515, "mean_token_accuracy": 0.6983268857002258, "num_tokens": 7025041.0, "step": 8728 }, { "epoch": 2.311970338983051, "grad_norm": 1.678272008895874, "learning_rate": 8.844147245762712e-06, "loss": 1.2837, "mean_token_accuracy": 0.7141032442450523, "num_tokens": 7026433.0, "step": 8730 }, { "epoch": 2.3125, "grad_norm": 1.3812757730484009, "learning_rate": 8.843882415254237e-06, "loss": 1.173, "mean_token_accuracy": 0.7345620766282082, "num_tokens": 7028394.0, "step": 8732 }, { "epoch": 2.313029661016949, "grad_norm": 1.692885398864746, "learning_rate": 8.843617584745764e-06, "loss": 1.3665, "mean_token_accuracy": 0.6742443144321442, "num_tokens": 7029992.0, "step": 8734 }, { "epoch": 2.3135593220338984, "grad_norm": 1.7600297927856445, "learning_rate": 8.843352754237289e-06, "loss": 1.1973, "mean_token_accuracy": 0.7273185700178146, "num_tokens": 7031545.0, "step": 8736 }, { "epoch": 2.3140889830508473, "grad_norm": 1.693783164024353, "learning_rate": 8.843087923728816e-06, "loss": 1.497, "mean_token_accuracy": 0.6690458133816719, "num_tokens": 7033215.0, "step": 8738 }, { "epoch": 2.3146186440677967, "grad_norm": 1.7021708488464355, "learning_rate": 8.842823093220339e-06, "loss": 1.2767, "mean_token_accuracy": 0.6923200115561485, "num_tokens": 7034757.0, "step": 8740 }, { "epoch": 2.3151483050847457, "grad_norm": 1.746199131011963, "learning_rate": 8.842558262711865e-06, "loss": 1.3469, "mean_token_accuracy": 0.6841458007693291, "num_tokens": 7036280.0, "step": 8742 }, { "epoch": 2.315677966101695, "grad_norm": 1.4461604356765747, "learning_rate": 8.84229343220339e-06, "loss": 1.0665, "mean_token_accuracy": 0.7507932856678963, "num_tokens": 7037954.0, "step": 8744 }, { "epoch": 2.316207627118644, "grad_norm": 1.744314193725586, "learning_rate": 8.842028601694917e-06, "loss": 1.1079, "mean_token_accuracy": 0.7176023125648499, "num_tokens": 7039609.0, "step": 8746 }, { "epoch": 2.3167372881355934, "grad_norm": 2.0603489875793457, "learning_rate": 8.841763771186442e-06, "loss": 1.2611, "mean_token_accuracy": 0.6894972324371338, "num_tokens": 7040993.0, "step": 8748 }, { "epoch": 2.3172669491525424, "grad_norm": 1.5261074304580688, "learning_rate": 8.841498940677967e-06, "loss": 1.0243, "step": 8750 }, { "epoch": 2.3172669491525424, "eval_loss": 1.313179850578308, "eval_mean_token_accuracy": 0.7000193958739181, "eval_num_tokens": 7042791.0, "eval_runtime": 48.1348, "eval_samples_per_second": 6.399, "eval_steps_per_second": 6.399, "step": 8750 }, { "epoch": 2.3177966101694913, "grad_norm": 1.7664331197738647, "learning_rate": 8.841234110169492e-06, "loss": 1.5125, "mean_token_accuracy": 0.7285009883344173, "num_tokens": 7044398.0, "step": 8752 }, { "epoch": 2.3183262711864407, "grad_norm": 1.659995436668396, "learning_rate": 8.840969279661018e-06, "loss": 1.0273, "mean_token_accuracy": 0.7564221955835819, "num_tokens": 7046107.0, "step": 8754 }, { "epoch": 2.3188559322033897, "grad_norm": 2.013293743133545, "learning_rate": 8.840704449152543e-06, "loss": 1.4657, "mean_token_accuracy": 0.6609645485877991, "num_tokens": 7047618.0, "step": 8756 }, { "epoch": 2.319385593220339, "grad_norm": 1.6273069381713867, "learning_rate": 8.840439618644068e-06, "loss": 1.3561, "mean_token_accuracy": 0.7125554084777832, "num_tokens": 7049289.0, "step": 8758 }, { "epoch": 2.319915254237288, "grad_norm": 1.885678768157959, "learning_rate": 8.840174788135593e-06, "loss": 1.4336, "mean_token_accuracy": 0.6766145005822182, "num_tokens": 7051078.0, "step": 8760 }, { "epoch": 2.3204449152542375, "grad_norm": 1.6837608814239502, "learning_rate": 8.83990995762712e-06, "loss": 1.4647, "mean_token_accuracy": 0.6847458034753799, "num_tokens": 7052631.0, "step": 8762 }, { "epoch": 2.3209745762711864, "grad_norm": 1.720850944519043, "learning_rate": 8.839645127118645e-06, "loss": 1.2068, "mean_token_accuracy": 0.7145116180181503, "num_tokens": 7054240.0, "step": 8764 }, { "epoch": 2.321504237288136, "grad_norm": 1.889297604560852, "learning_rate": 8.839380296610171e-06, "loss": 1.464, "mean_token_accuracy": 0.6766949892044067, "num_tokens": 7055913.0, "step": 8766 }, { "epoch": 2.3220338983050848, "grad_norm": 1.7599760293960571, "learning_rate": 8.839115466101695e-06, "loss": 1.2456, "mean_token_accuracy": 0.6982362642884254, "num_tokens": 7057740.0, "step": 8768 }, { "epoch": 2.3225635593220337, "grad_norm": 2.015165328979492, "learning_rate": 8.838850635593221e-06, "loss": 1.8912, "mean_token_accuracy": 0.6102140173316002, "num_tokens": 7059145.0, "step": 8770 }, { "epoch": 2.323093220338983, "grad_norm": 1.6890192031860352, "learning_rate": 8.838585805084746e-06, "loss": 1.0181, "mean_token_accuracy": 0.7405825108289719, "num_tokens": 7060783.0, "step": 8772 }, { "epoch": 2.323622881355932, "grad_norm": 1.8685015439987183, "learning_rate": 8.838320974576273e-06, "loss": 1.6769, "mean_token_accuracy": 0.6232111491262913, "num_tokens": 7062275.0, "step": 8774 }, { "epoch": 2.3241525423728815, "grad_norm": 2.2394604682922363, "learning_rate": 8.838056144067798e-06, "loss": 1.2225, "mean_token_accuracy": 0.7240141034126282, "num_tokens": 7063668.0, "step": 8776 }, { "epoch": 2.3246822033898304, "grad_norm": 1.7289830446243286, "learning_rate": 8.837791313559323e-06, "loss": 1.2623, "mean_token_accuracy": 0.7278509140014648, "num_tokens": 7065163.0, "step": 8778 }, { "epoch": 2.32521186440678, "grad_norm": 1.985721468925476, "learning_rate": 8.837526483050848e-06, "loss": 1.6417, "mean_token_accuracy": 0.6377415433526039, "num_tokens": 7066727.0, "step": 8780 }, { "epoch": 2.325741525423729, "grad_norm": 1.4688767194747925, "learning_rate": 8.837261652542374e-06, "loss": 1.3051, "mean_token_accuracy": 0.698852501809597, "num_tokens": 7068272.0, "step": 8782 }, { "epoch": 2.326271186440678, "grad_norm": 2.0112814903259277, "learning_rate": 8.836996822033899e-06, "loss": 1.453, "mean_token_accuracy": 0.6735450923442841, "num_tokens": 7070275.0, "step": 8784 }, { "epoch": 2.326800847457627, "grad_norm": 1.40998113155365, "learning_rate": 8.836731991525424e-06, "loss": 1.1395, "mean_token_accuracy": 0.7228618040680885, "num_tokens": 7072315.0, "step": 8786 }, { "epoch": 2.327330508474576, "grad_norm": 2.2286787033081055, "learning_rate": 8.836467161016949e-06, "loss": 1.6309, "mean_token_accuracy": 0.654036670923233, "num_tokens": 7073795.0, "step": 8788 }, { "epoch": 2.3278601694915255, "grad_norm": 2.0471982955932617, "learning_rate": 8.836202330508476e-06, "loss": 1.2355, "mean_token_accuracy": 0.7198924459517002, "num_tokens": 7075151.0, "step": 8790 }, { "epoch": 2.3283898305084745, "grad_norm": 1.484842300415039, "learning_rate": 8.8359375e-06, "loss": 1.2183, "mean_token_accuracy": 0.700164407491684, "num_tokens": 7076640.0, "step": 8792 }, { "epoch": 2.328919491525424, "grad_norm": 1.5681310892105103, "learning_rate": 8.835672669491525e-06, "loss": 1.1029, "mean_token_accuracy": 0.714080922305584, "num_tokens": 7078171.0, "step": 8794 }, { "epoch": 2.329449152542373, "grad_norm": 1.7554854154586792, "learning_rate": 8.83540783898305e-06, "loss": 1.554, "mean_token_accuracy": 0.6390794888138771, "num_tokens": 7079965.0, "step": 8796 }, { "epoch": 2.329978813559322, "grad_norm": 1.349692702293396, "learning_rate": 8.835143008474577e-06, "loss": 1.0351, "mean_token_accuracy": 0.7545521110296249, "num_tokens": 7081722.0, "step": 8798 }, { "epoch": 2.330508474576271, "grad_norm": 1.8750656843185425, "learning_rate": 8.834878177966102e-06, "loss": 1.0144, "mean_token_accuracy": 0.7683342024683952, "num_tokens": 7083274.0, "step": 8800 }, { "epoch": 2.33103813559322, "grad_norm": 2.0760245323181152, "learning_rate": 8.834613347457629e-06, "loss": 1.4307, "mean_token_accuracy": 0.6928002014756203, "num_tokens": 7084810.0, "step": 8802 }, { "epoch": 2.3315677966101696, "grad_norm": 1.8697906732559204, "learning_rate": 8.834348516949153e-06, "loss": 1.506, "mean_token_accuracy": 0.6794477552175522, "num_tokens": 7086377.0, "step": 8804 }, { "epoch": 2.3320974576271185, "grad_norm": 1.8708410263061523, "learning_rate": 8.834083686440678e-06, "loss": 1.0421, "mean_token_accuracy": 0.7428606674075127, "num_tokens": 7088040.0, "step": 8806 }, { "epoch": 2.332627118644068, "grad_norm": 2.2093496322631836, "learning_rate": 8.833818855932203e-06, "loss": 1.7555, "mean_token_accuracy": 0.6177740842103958, "num_tokens": 7089502.0, "step": 8808 }, { "epoch": 2.333156779661017, "grad_norm": 2.1217234134674072, "learning_rate": 8.83355402542373e-06, "loss": 1.1749, "mean_token_accuracy": 0.7218338772654533, "num_tokens": 7091097.0, "step": 8810 }, { "epoch": 2.3336864406779663, "grad_norm": 1.4953440427780151, "learning_rate": 8.833289194915255e-06, "loss": 1.4656, "mean_token_accuracy": 0.7122238874435425, "num_tokens": 7092815.0, "step": 8812 }, { "epoch": 2.3342161016949152, "grad_norm": 1.4681731462478638, "learning_rate": 8.83302436440678e-06, "loss": 1.2071, "mean_token_accuracy": 0.7261857837438583, "num_tokens": 7094690.0, "step": 8814 }, { "epoch": 2.334745762711864, "grad_norm": 1.9167637825012207, "learning_rate": 8.832759533898306e-06, "loss": 1.2894, "mean_token_accuracy": 0.7051942422986031, "num_tokens": 7096503.0, "step": 8816 }, { "epoch": 2.3352754237288136, "grad_norm": 1.4425246715545654, "learning_rate": 8.832494703389831e-06, "loss": 1.055, "mean_token_accuracy": 0.7199627012014389, "num_tokens": 7098277.0, "step": 8818 }, { "epoch": 2.3358050847457625, "grad_norm": 1.278862476348877, "learning_rate": 8.832229872881358e-06, "loss": 1.004, "mean_token_accuracy": 0.7419771701097488, "num_tokens": 7100131.0, "step": 8820 }, { "epoch": 2.336334745762712, "grad_norm": 1.4388047456741333, "learning_rate": 8.831965042372881e-06, "loss": 1.0775, "mean_token_accuracy": 0.7500271610915661, "num_tokens": 7101768.0, "step": 8822 }, { "epoch": 2.336864406779661, "grad_norm": 1.7833298444747925, "learning_rate": 8.831700211864408e-06, "loss": 1.3455, "mean_token_accuracy": 0.7153714746236801, "num_tokens": 7103187.0, "step": 8824 }, { "epoch": 2.3373940677966103, "grad_norm": 1.6751896142959595, "learning_rate": 8.831435381355933e-06, "loss": 1.0711, "mean_token_accuracy": 0.7453478202223778, "num_tokens": 7104855.0, "step": 8826 }, { "epoch": 2.3379237288135593, "grad_norm": 1.428283452987671, "learning_rate": 8.83117055084746e-06, "loss": 1.5606, "mean_token_accuracy": 0.6604330688714981, "num_tokens": 7106691.0, "step": 8828 }, { "epoch": 2.3384533898305087, "grad_norm": 1.4659903049468994, "learning_rate": 8.830905720338984e-06, "loss": 1.0547, "mean_token_accuracy": 0.735846072435379, "num_tokens": 7108328.0, "step": 8830 }, { "epoch": 2.3389830508474576, "grad_norm": 1.6955167055130005, "learning_rate": 8.83064088983051e-06, "loss": 1.0755, "mean_token_accuracy": 0.7434941679239273, "num_tokens": 7109789.0, "step": 8832 }, { "epoch": 2.3395127118644066, "grad_norm": 1.7291488647460938, "learning_rate": 8.830376059322034e-06, "loss": 1.3219, "mean_token_accuracy": 0.703701876103878, "num_tokens": 7111243.0, "step": 8834 }, { "epoch": 2.340042372881356, "grad_norm": 1.6603325605392456, "learning_rate": 8.83011122881356e-06, "loss": 1.2125, "mean_token_accuracy": 0.7128204442560673, "num_tokens": 7112871.0, "step": 8836 }, { "epoch": 2.340572033898305, "grad_norm": 1.6532325744628906, "learning_rate": 8.829846398305086e-06, "loss": 1.3782, "mean_token_accuracy": 0.6668294258415699, "num_tokens": 7114723.0, "step": 8838 }, { "epoch": 2.3411016949152543, "grad_norm": 1.994386911392212, "learning_rate": 8.82958156779661e-06, "loss": 1.3226, "mean_token_accuracy": 0.6898374110460281, "num_tokens": 7116025.0, "step": 8840 }, { "epoch": 2.3416313559322033, "grad_norm": 1.8562897443771362, "learning_rate": 8.829316737288136e-06, "loss": 1.4513, "mean_token_accuracy": 0.6732812263071537, "num_tokens": 7117389.0, "step": 8842 }, { "epoch": 2.3421610169491527, "grad_norm": 1.3674957752227783, "learning_rate": 8.829051906779662e-06, "loss": 1.0154, "mean_token_accuracy": 0.7524682134389877, "num_tokens": 7119084.0, "step": 8844 }, { "epoch": 2.3426906779661016, "grad_norm": 1.7340370416641235, "learning_rate": 8.828787076271187e-06, "loss": 0.9802, "mean_token_accuracy": 0.7520961090922356, "num_tokens": 7120528.0, "step": 8846 }, { "epoch": 2.343220338983051, "grad_norm": 1.9883661270141602, "learning_rate": 8.828522245762712e-06, "loss": 0.9347, "mean_token_accuracy": 0.7614659890532494, "num_tokens": 7121877.0, "step": 8848 }, { "epoch": 2.34375, "grad_norm": 1.6517665386199951, "learning_rate": 8.828257415254237e-06, "loss": 1.3659, "mean_token_accuracy": 0.6841384395956993, "num_tokens": 7123407.0, "step": 8850 }, { "epoch": 2.344279661016949, "grad_norm": 2.262529134750366, "learning_rate": 8.827992584745764e-06, "loss": 1.4528, "mean_token_accuracy": 0.684654638171196, "num_tokens": 7124812.0, "step": 8852 }, { "epoch": 2.3448093220338984, "grad_norm": 2.252639055252075, "learning_rate": 8.827727754237289e-06, "loss": 1.8428, "mean_token_accuracy": 0.5973830595612526, "num_tokens": 7126286.0, "step": 8854 }, { "epoch": 2.3453389830508473, "grad_norm": 1.4562923908233643, "learning_rate": 8.827462923728815e-06, "loss": 1.425, "mean_token_accuracy": 0.6677072197198868, "num_tokens": 7128072.0, "step": 8856 }, { "epoch": 2.3458686440677967, "grad_norm": 1.8753248453140259, "learning_rate": 8.82719809322034e-06, "loss": 1.5049, "mean_token_accuracy": 0.6597680374979973, "num_tokens": 7129643.0, "step": 8858 }, { "epoch": 2.3463983050847457, "grad_norm": 1.6835181713104248, "learning_rate": 8.826933262711865e-06, "loss": 1.2001, "mean_token_accuracy": 0.7091685906052589, "num_tokens": 7131421.0, "step": 8860 }, { "epoch": 2.346927966101695, "grad_norm": 2.064678430557251, "learning_rate": 8.82666843220339e-06, "loss": 1.3009, "mean_token_accuracy": 0.6979794725775719, "num_tokens": 7132909.0, "step": 8862 }, { "epoch": 2.347457627118644, "grad_norm": 1.8374407291412354, "learning_rate": 8.826403601694917e-06, "loss": 1.3453, "mean_token_accuracy": 0.6909090504050255, "num_tokens": 7134385.0, "step": 8864 }, { "epoch": 2.3479872881355934, "grad_norm": 1.9524691104888916, "learning_rate": 8.826138771186441e-06, "loss": 1.4936, "mean_token_accuracy": 0.6536353640258312, "num_tokens": 7136078.0, "step": 8866 }, { "epoch": 2.3485169491525424, "grad_norm": 2.0354161262512207, "learning_rate": 8.825873940677966e-06, "loss": 1.4948, "mean_token_accuracy": 0.6922943294048309, "num_tokens": 7137470.0, "step": 8868 }, { "epoch": 2.3490466101694913, "grad_norm": 1.8157634735107422, "learning_rate": 8.825609110169491e-06, "loss": 1.5025, "mean_token_accuracy": 0.6790298372507095, "num_tokens": 7139181.0, "step": 8870 }, { "epoch": 2.3495762711864407, "grad_norm": 1.9143664836883545, "learning_rate": 8.825344279661018e-06, "loss": 1.5529, "mean_token_accuracy": 0.6510041207075119, "num_tokens": 7140907.0, "step": 8872 }, { "epoch": 2.3501059322033897, "grad_norm": 1.523863673210144, "learning_rate": 8.825079449152543e-06, "loss": 1.5663, "mean_token_accuracy": 0.6775388084352016, "num_tokens": 7142403.0, "step": 8874 }, { "epoch": 2.350635593220339, "grad_norm": 1.4940129518508911, "learning_rate": 8.824814618644068e-06, "loss": 0.8291, "mean_token_accuracy": 0.7917790487408638, "num_tokens": 7143945.0, "step": 8876 }, { "epoch": 2.351165254237288, "grad_norm": 1.7659729719161987, "learning_rate": 8.824549788135593e-06, "loss": 1.4804, "mean_token_accuracy": 0.6559179574251175, "num_tokens": 7145932.0, "step": 8878 }, { "epoch": 2.3516949152542375, "grad_norm": 1.8290667533874512, "learning_rate": 8.82428495762712e-06, "loss": 1.318, "mean_token_accuracy": 0.6877919398248196, "num_tokens": 7147588.0, "step": 8880 }, { "epoch": 2.3522245762711864, "grad_norm": 1.6560404300689697, "learning_rate": 8.824020127118644e-06, "loss": 1.1593, "mean_token_accuracy": 0.7271155491471291, "num_tokens": 7149359.0, "step": 8882 }, { "epoch": 2.352754237288136, "grad_norm": 1.5334707498550415, "learning_rate": 8.823755296610171e-06, "loss": 0.9239, "mean_token_accuracy": 0.7786190807819366, "num_tokens": 7151147.0, "step": 8884 }, { "epoch": 2.3532838983050848, "grad_norm": 1.454585313796997, "learning_rate": 8.823490466101696e-06, "loss": 1.2707, "mean_token_accuracy": 0.7382001876831055, "num_tokens": 7152707.0, "step": 8886 }, { "epoch": 2.3538135593220337, "grad_norm": 1.9358222484588623, "learning_rate": 8.82322563559322e-06, "loss": 0.9609, "mean_token_accuracy": 0.7508215680718422, "num_tokens": 7154226.0, "step": 8888 }, { "epoch": 2.354343220338983, "grad_norm": 1.936286211013794, "learning_rate": 8.822960805084746e-06, "loss": 1.3145, "mean_token_accuracy": 0.6918871998786926, "num_tokens": 7156000.0, "step": 8890 }, { "epoch": 2.354872881355932, "grad_norm": 1.8674224615097046, "learning_rate": 8.822695974576272e-06, "loss": 1.1745, "mean_token_accuracy": 0.7077648788690567, "num_tokens": 7157699.0, "step": 8892 }, { "epoch": 2.3554025423728815, "grad_norm": 1.831847071647644, "learning_rate": 8.822431144067797e-06, "loss": 1.4031, "mean_token_accuracy": 0.701352171599865, "num_tokens": 7159237.0, "step": 8894 }, { "epoch": 2.3559322033898304, "grad_norm": 1.3600962162017822, "learning_rate": 8.822166313559322e-06, "loss": 1.0444, "mean_token_accuracy": 0.7475833743810654, "num_tokens": 7160773.0, "step": 8896 }, { "epoch": 2.35646186440678, "grad_norm": 1.9230091571807861, "learning_rate": 8.821901483050849e-06, "loss": 1.6696, "mean_token_accuracy": 0.6091461256146431, "num_tokens": 7162677.0, "step": 8898 }, { "epoch": 2.356991525423729, "grad_norm": 1.644033670425415, "learning_rate": 8.821636652542374e-06, "loss": 1.1887, "mean_token_accuracy": 0.7138952165842056, "num_tokens": 7164417.0, "step": 8900 }, { "epoch": 2.357521186440678, "grad_norm": 1.738818883895874, "learning_rate": 8.821371822033899e-06, "loss": 1.6455, "mean_token_accuracy": 0.6393958404660225, "num_tokens": 7166021.0, "step": 8902 }, { "epoch": 2.358050847457627, "grad_norm": 1.6257178783416748, "learning_rate": 8.821106991525424e-06, "loss": 1.7726, "mean_token_accuracy": 0.6158711910247803, "num_tokens": 7167906.0, "step": 8904 }, { "epoch": 2.358580508474576, "grad_norm": 1.6926213502883911, "learning_rate": 8.82084216101695e-06, "loss": 1.4414, "mean_token_accuracy": 0.6709453463554382, "num_tokens": 7169619.0, "step": 8906 }, { "epoch": 2.3591101694915255, "grad_norm": 1.5135021209716797, "learning_rate": 8.820577330508475e-06, "loss": 1.3145, "mean_token_accuracy": 0.7090931162238121, "num_tokens": 7171323.0, "step": 8908 }, { "epoch": 2.3596398305084745, "grad_norm": 2.058518648147583, "learning_rate": 8.820312500000002e-06, "loss": 1.5087, "mean_token_accuracy": 0.6654773280024529, "num_tokens": 7172769.0, "step": 8910 }, { "epoch": 2.360169491525424, "grad_norm": 1.6750625371932983, "learning_rate": 8.820047669491527e-06, "loss": 1.3152, "mean_token_accuracy": 0.7088673040270805, "num_tokens": 7174479.0, "step": 8912 }, { "epoch": 2.360699152542373, "grad_norm": 1.7252212762832642, "learning_rate": 8.819782838983052e-06, "loss": 1.5906, "mean_token_accuracy": 0.6339102387428284, "num_tokens": 7176297.0, "step": 8914 }, { "epoch": 2.361228813559322, "grad_norm": 1.6719636917114258, "learning_rate": 8.819518008474577e-06, "loss": 1.2069, "mean_token_accuracy": 0.7235284298658371, "num_tokens": 7177592.0, "step": 8916 }, { "epoch": 2.361758474576271, "grad_norm": 1.6189305782318115, "learning_rate": 8.819253177966103e-06, "loss": 1.5074, "mean_token_accuracy": 0.6609725505113602, "num_tokens": 7179198.0, "step": 8918 }, { "epoch": 2.36228813559322, "grad_norm": 1.616356611251831, "learning_rate": 8.818988347457628e-06, "loss": 1.2569, "mean_token_accuracy": 0.7271182388067245, "num_tokens": 7181014.0, "step": 8920 }, { "epoch": 2.3628177966101696, "grad_norm": 1.7656480073928833, "learning_rate": 8.818723516949153e-06, "loss": 1.5049, "mean_token_accuracy": 0.6576345190405846, "num_tokens": 7182679.0, "step": 8922 }, { "epoch": 2.3633474576271185, "grad_norm": 1.906673550605774, "learning_rate": 8.818458686440678e-06, "loss": 1.1753, "mean_token_accuracy": 0.7158587425947189, "num_tokens": 7184133.0, "step": 8924 }, { "epoch": 2.363877118644068, "grad_norm": 1.9307804107666016, "learning_rate": 8.818193855932205e-06, "loss": 1.2315, "mean_token_accuracy": 0.7153347730636597, "num_tokens": 7185506.0, "step": 8926 }, { "epoch": 2.364406779661017, "grad_norm": 1.6766865253448486, "learning_rate": 8.81792902542373e-06, "loss": 1.1096, "mean_token_accuracy": 0.7318310365080833, "num_tokens": 7186962.0, "step": 8928 }, { "epoch": 2.3649364406779663, "grad_norm": 1.7574399709701538, "learning_rate": 8.817664194915254e-06, "loss": 1.473, "mean_token_accuracy": 0.6937877610325813, "num_tokens": 7188362.0, "step": 8930 }, { "epoch": 2.3654661016949152, "grad_norm": 1.8720773458480835, "learning_rate": 8.81739936440678e-06, "loss": 1.3453, "mean_token_accuracy": 0.6819197461009026, "num_tokens": 7190049.0, "step": 8932 }, { "epoch": 2.365995762711864, "grad_norm": 1.7401779890060425, "learning_rate": 8.817134533898306e-06, "loss": 1.1877, "mean_token_accuracy": 0.7244329154491425, "num_tokens": 7191713.0, "step": 8934 }, { "epoch": 2.3665254237288136, "grad_norm": 2.2144904136657715, "learning_rate": 8.816869703389831e-06, "loss": 1.1937, "mean_token_accuracy": 0.7017109394073486, "num_tokens": 7193161.0, "step": 8936 }, { "epoch": 2.3670550847457625, "grad_norm": 1.7356295585632324, "learning_rate": 8.816604872881358e-06, "loss": 1.3013, "mean_token_accuracy": 0.6877656430006027, "num_tokens": 7194787.0, "step": 8938 }, { "epoch": 2.367584745762712, "grad_norm": 1.6514240503311157, "learning_rate": 8.816340042372883e-06, "loss": 1.4282, "mean_token_accuracy": 0.6826251596212387, "num_tokens": 7196369.0, "step": 8940 }, { "epoch": 2.368114406779661, "grad_norm": 1.720272183418274, "learning_rate": 8.816075211864407e-06, "loss": 1.2693, "mean_token_accuracy": 0.728011816740036, "num_tokens": 7197891.0, "step": 8942 }, { "epoch": 2.3686440677966103, "grad_norm": 1.9114383459091187, "learning_rate": 8.815810381355932e-06, "loss": 1.3634, "mean_token_accuracy": 0.6807639002799988, "num_tokens": 7199402.0, "step": 8944 }, { "epoch": 2.3691737288135593, "grad_norm": 1.26347017288208, "learning_rate": 8.815545550847459e-06, "loss": 1.2247, "mean_token_accuracy": 0.7200656533241272, "num_tokens": 7201121.0, "step": 8946 }, { "epoch": 2.3697033898305087, "grad_norm": 1.3362241983413696, "learning_rate": 8.815280720338984e-06, "loss": 0.8092, "mean_token_accuracy": 0.7908381894230843, "num_tokens": 7202624.0, "step": 8948 }, { "epoch": 2.3702330508474576, "grad_norm": 1.8694286346435547, "learning_rate": 8.815015889830509e-06, "loss": 1.6064, "mean_token_accuracy": 0.6286932714283466, "num_tokens": 7204474.0, "step": 8950 }, { "epoch": 2.3707627118644066, "grad_norm": 2.0184152126312256, "learning_rate": 8.814751059322034e-06, "loss": 1.1014, "mean_token_accuracy": 0.7732052877545357, "num_tokens": 7205956.0, "step": 8952 }, { "epoch": 2.371292372881356, "grad_norm": 1.6186667680740356, "learning_rate": 8.81448622881356e-06, "loss": 1.3202, "mean_token_accuracy": 0.7021990194916725, "num_tokens": 7207741.0, "step": 8954 }, { "epoch": 2.371822033898305, "grad_norm": 1.7427574396133423, "learning_rate": 8.814221398305085e-06, "loss": 1.7164, "mean_token_accuracy": 0.622536439448595, "num_tokens": 7209509.0, "step": 8956 }, { "epoch": 2.3723516949152543, "grad_norm": 1.4273967742919922, "learning_rate": 8.81395656779661e-06, "loss": 0.8286, "mean_token_accuracy": 0.7884825319051743, "num_tokens": 7211112.0, "step": 8958 }, { "epoch": 2.3728813559322033, "grad_norm": 1.3990774154663086, "learning_rate": 8.813691737288135e-06, "loss": 0.8466, "mean_token_accuracy": 0.7855470553040504, "num_tokens": 7212786.0, "step": 8960 }, { "epoch": 2.3734110169491527, "grad_norm": 1.5391066074371338, "learning_rate": 8.813426906779662e-06, "loss": 1.0593, "mean_token_accuracy": 0.7543461993336678, "num_tokens": 7214309.0, "step": 8962 }, { "epoch": 2.3739406779661016, "grad_norm": 1.8778090476989746, "learning_rate": 8.813162076271187e-06, "loss": 1.673, "mean_token_accuracy": 0.6432852074503899, "num_tokens": 7215873.0, "step": 8964 }, { "epoch": 2.374470338983051, "grad_norm": 1.6879597902297974, "learning_rate": 8.812897245762713e-06, "loss": 1.4839, "mean_token_accuracy": 0.6797243542969227, "num_tokens": 7217649.0, "step": 8966 }, { "epoch": 2.375, "grad_norm": 2.06026291847229, "learning_rate": 8.812632415254238e-06, "loss": 1.7302, "mean_token_accuracy": 0.6528142467141151, "num_tokens": 7219226.0, "step": 8968 }, { "epoch": 2.375529661016949, "grad_norm": 1.7980449199676514, "learning_rate": 8.812367584745763e-06, "loss": 1.4563, "mean_token_accuracy": 0.6720658093690872, "num_tokens": 7220781.0, "step": 8970 }, { "epoch": 2.3760593220338984, "grad_norm": 1.4397310018539429, "learning_rate": 8.812102754237288e-06, "loss": 1.1604, "mean_token_accuracy": 0.7382644936442375, "num_tokens": 7222752.0, "step": 8972 }, { "epoch": 2.3765889830508473, "grad_norm": 1.4599123001098633, "learning_rate": 8.811837923728815e-06, "loss": 1.2517, "mean_token_accuracy": 0.7152785137295723, "num_tokens": 7224501.0, "step": 8974 }, { "epoch": 2.3771186440677967, "grad_norm": 1.621370553970337, "learning_rate": 8.81157309322034e-06, "loss": 1.3493, "mean_token_accuracy": 0.7198576778173447, "num_tokens": 7226380.0, "step": 8976 }, { "epoch": 2.3776483050847457, "grad_norm": 1.8026922941207886, "learning_rate": 8.811308262711865e-06, "loss": 1.481, "mean_token_accuracy": 0.6812005043029785, "num_tokens": 7227812.0, "step": 8978 }, { "epoch": 2.378177966101695, "grad_norm": 1.4563242197036743, "learning_rate": 8.811043432203391e-06, "loss": 1.5196, "mean_token_accuracy": 0.6783224493265152, "num_tokens": 7229527.0, "step": 8980 }, { "epoch": 2.378707627118644, "grad_norm": 1.803198218345642, "learning_rate": 8.810778601694916e-06, "loss": 1.5124, "mean_token_accuracy": 0.6606050133705139, "num_tokens": 7231184.0, "step": 8982 }, { "epoch": 2.3792372881355934, "grad_norm": 1.751545786857605, "learning_rate": 8.810513771186441e-06, "loss": 1.4598, "mean_token_accuracy": 0.6617123261094093, "num_tokens": 7232800.0, "step": 8984 }, { "epoch": 2.3797669491525424, "grad_norm": 1.7387402057647705, "learning_rate": 8.810248940677966e-06, "loss": 1.3175, "mean_token_accuracy": 0.7045497074723244, "num_tokens": 7234510.0, "step": 8986 }, { "epoch": 2.3802966101694913, "grad_norm": 1.753838300704956, "learning_rate": 8.809984110169493e-06, "loss": 1.7114, "mean_token_accuracy": 0.6225738450884819, "num_tokens": 7236192.0, "step": 8988 }, { "epoch": 2.3808262711864407, "grad_norm": 1.6274380683898926, "learning_rate": 8.809719279661018e-06, "loss": 1.3683, "mean_token_accuracy": 0.6769137904047966, "num_tokens": 7237974.0, "step": 8990 }, { "epoch": 2.3813559322033897, "grad_norm": 1.8579097986221313, "learning_rate": 8.809454449152544e-06, "loss": 1.04, "mean_token_accuracy": 0.7561189606785774, "num_tokens": 7239284.0, "step": 8992 }, { "epoch": 2.381885593220339, "grad_norm": 1.724125862121582, "learning_rate": 8.809189618644069e-06, "loss": 1.4015, "mean_token_accuracy": 0.6926752924919128, "num_tokens": 7240902.0, "step": 8994 }, { "epoch": 2.382415254237288, "grad_norm": 1.607008695602417, "learning_rate": 8.808924788135594e-06, "loss": 1.2308, "mean_token_accuracy": 0.6979269608855247, "num_tokens": 7242465.0, "step": 8996 }, { "epoch": 2.3829449152542375, "grad_norm": 1.926734209060669, "learning_rate": 8.808659957627119e-06, "loss": 1.5512, "mean_token_accuracy": 0.6357419267296791, "num_tokens": 7243984.0, "step": 8998 }, { "epoch": 2.3834745762711864, "grad_norm": 1.4022859334945679, "learning_rate": 8.808395127118646e-06, "loss": 0.7449, "step": 9000 }, { "epoch": 2.3834745762711864, "eval_loss": 1.3137080669403076, "eval_mean_token_accuracy": 0.70009114293309, "eval_num_tokens": 7245420.0, "eval_runtime": 48.8117, "eval_samples_per_second": 6.31, "eval_steps_per_second": 6.31, "step": 9000 }, { "epoch": 2.384004237288136, "grad_norm": 1.8640352487564087, "learning_rate": 8.80813029661017e-06, "loss": 1.3769, "mean_token_accuracy": 0.7631059475243092, "num_tokens": 7246783.0, "step": 9002 }, { "epoch": 2.3845338983050848, "grad_norm": 1.2521706819534302, "learning_rate": 8.807865466101695e-06, "loss": 0.7951, "mean_token_accuracy": 0.7892717272043228, "num_tokens": 7248563.0, "step": 9004 }, { "epoch": 2.3850635593220337, "grad_norm": 1.7075283527374268, "learning_rate": 8.80760063559322e-06, "loss": 1.1698, "mean_token_accuracy": 0.7314861044287682, "num_tokens": 7249978.0, "step": 9006 }, { "epoch": 2.385593220338983, "grad_norm": 1.6009020805358887, "learning_rate": 8.807335805084747e-06, "loss": 1.2905, "mean_token_accuracy": 0.7117594853043556, "num_tokens": 7251531.0, "step": 9008 }, { "epoch": 2.386122881355932, "grad_norm": 1.972385048866272, "learning_rate": 8.807070974576272e-06, "loss": 1.1414, "mean_token_accuracy": 0.744790829718113, "num_tokens": 7253146.0, "step": 9010 }, { "epoch": 2.3866525423728815, "grad_norm": 2.024216413497925, "learning_rate": 8.806806144067797e-06, "loss": 1.53, "mean_token_accuracy": 0.6386132910847664, "num_tokens": 7254639.0, "step": 9012 }, { "epoch": 2.3871822033898304, "grad_norm": 2.199232816696167, "learning_rate": 8.806541313559322e-06, "loss": 1.15, "mean_token_accuracy": 0.738589458167553, "num_tokens": 7256059.0, "step": 9014 }, { "epoch": 2.38771186440678, "grad_norm": 2.0252907276153564, "learning_rate": 8.806276483050848e-06, "loss": 1.2895, "mean_token_accuracy": 0.6867802739143372, "num_tokens": 7257785.0, "step": 9016 }, { "epoch": 2.388241525423729, "grad_norm": 1.6241976022720337, "learning_rate": 8.806011652542373e-06, "loss": 1.3004, "mean_token_accuracy": 0.721725195646286, "num_tokens": 7259412.0, "step": 9018 }, { "epoch": 2.388771186440678, "grad_norm": 1.5118367671966553, "learning_rate": 8.8057468220339e-06, "loss": 1.071, "mean_token_accuracy": 0.7533910647034645, "num_tokens": 7260850.0, "step": 9020 }, { "epoch": 2.389300847457627, "grad_norm": 1.7016775608062744, "learning_rate": 8.805481991525425e-06, "loss": 1.3105, "mean_token_accuracy": 0.7386796399950981, "num_tokens": 7262367.0, "step": 9022 }, { "epoch": 2.389830508474576, "grad_norm": 1.249112606048584, "learning_rate": 8.80521716101695e-06, "loss": 1.0194, "mean_token_accuracy": 0.7558159232139587, "num_tokens": 7264007.0, "step": 9024 }, { "epoch": 2.3903601694915255, "grad_norm": 1.5341092348098755, "learning_rate": 8.804952330508475e-06, "loss": 1.3384, "mean_token_accuracy": 0.68722278252244, "num_tokens": 7265586.0, "step": 9026 }, { "epoch": 2.3908898305084745, "grad_norm": 1.5892159938812256, "learning_rate": 8.804687500000001e-06, "loss": 1.2097, "mean_token_accuracy": 0.7304667457938194, "num_tokens": 7267177.0, "step": 9028 }, { "epoch": 2.391419491525424, "grad_norm": 1.8197753429412842, "learning_rate": 8.804422669491526e-06, "loss": 1.8632, "mean_token_accuracy": 0.5918399430811405, "num_tokens": 7268855.0, "step": 9030 }, { "epoch": 2.391949152542373, "grad_norm": 1.8639951944351196, "learning_rate": 8.804157838983051e-06, "loss": 1.5698, "mean_token_accuracy": 0.6599434018135071, "num_tokens": 7270444.0, "step": 9032 }, { "epoch": 2.392478813559322, "grad_norm": 1.9279247522354126, "learning_rate": 8.803893008474576e-06, "loss": 1.3836, "mean_token_accuracy": 0.6872608289122581, "num_tokens": 7271966.0, "step": 9034 }, { "epoch": 2.393008474576271, "grad_norm": 1.718244194984436, "learning_rate": 8.803628177966103e-06, "loss": 1.5908, "mean_token_accuracy": 0.6559309512376785, "num_tokens": 7273790.0, "step": 9036 }, { "epoch": 2.39353813559322, "grad_norm": 1.6725010871887207, "learning_rate": 8.803363347457628e-06, "loss": 1.1799, "mean_token_accuracy": 0.7363463193178177, "num_tokens": 7275448.0, "step": 9038 }, { "epoch": 2.3940677966101696, "grad_norm": 1.357566475868225, "learning_rate": 8.803098516949153e-06, "loss": 1.2152, "mean_token_accuracy": 0.7123404368758202, "num_tokens": 7277044.0, "step": 9040 }, { "epoch": 2.3945974576271185, "grad_norm": 1.7438135147094727, "learning_rate": 8.802833686440678e-06, "loss": 1.2487, "mean_token_accuracy": 0.697079598903656, "num_tokens": 7278639.0, "step": 9042 }, { "epoch": 2.395127118644068, "grad_norm": 2.264047384262085, "learning_rate": 8.802568855932204e-06, "loss": 1.4948, "mean_token_accuracy": 0.6745948642492294, "num_tokens": 7280027.0, "step": 9044 }, { "epoch": 2.395656779661017, "grad_norm": 1.9194411039352417, "learning_rate": 8.80230402542373e-06, "loss": 1.625, "mean_token_accuracy": 0.6562379896640778, "num_tokens": 7281611.0, "step": 9046 }, { "epoch": 2.3961864406779663, "grad_norm": 2.1484830379486084, "learning_rate": 8.802039194915256e-06, "loss": 1.6169, "mean_token_accuracy": 0.6402079463005066, "num_tokens": 7283163.0, "step": 9048 }, { "epoch": 2.3967161016949152, "grad_norm": 1.7008589506149292, "learning_rate": 8.80177436440678e-06, "loss": 1.2904, "mean_token_accuracy": 0.6966385245323181, "num_tokens": 7284984.0, "step": 9050 }, { "epoch": 2.397245762711864, "grad_norm": 1.8577594757080078, "learning_rate": 8.801509533898306e-06, "loss": 1.4534, "mean_token_accuracy": 0.6948699131608009, "num_tokens": 7286478.0, "step": 9052 }, { "epoch": 2.3977754237288136, "grad_norm": 1.5302585363388062, "learning_rate": 8.80124470338983e-06, "loss": 1.0802, "mean_token_accuracy": 0.7448428645730019, "num_tokens": 7288006.0, "step": 9054 }, { "epoch": 2.3983050847457625, "grad_norm": 1.6397563219070435, "learning_rate": 8.800979872881357e-06, "loss": 1.0712, "mean_token_accuracy": 0.7441672906279564, "num_tokens": 7289542.0, "step": 9056 }, { "epoch": 2.398834745762712, "grad_norm": 1.5312492847442627, "learning_rate": 8.800715042372882e-06, "loss": 1.4821, "mean_token_accuracy": 0.6582143679261208, "num_tokens": 7291295.0, "step": 9058 }, { "epoch": 2.399364406779661, "grad_norm": 1.0705589056015015, "learning_rate": 8.800450211864407e-06, "loss": 1.0313, "mean_token_accuracy": 0.744579628109932, "num_tokens": 7294056.0, "step": 9060 }, { "epoch": 2.3998940677966103, "grad_norm": 1.914194107055664, "learning_rate": 8.800185381355934e-06, "loss": 1.2527, "mean_token_accuracy": 0.6916708722710609, "num_tokens": 7295645.0, "step": 9062 }, { "epoch": 2.4004237288135593, "grad_norm": 1.4700767993927002, "learning_rate": 8.799920550847459e-06, "loss": 1.4354, "mean_token_accuracy": 0.6625431925058365, "num_tokens": 7297278.0, "step": 9064 }, { "epoch": 2.4009533898305087, "grad_norm": 1.6508108377456665, "learning_rate": 8.799655720338984e-06, "loss": 1.1406, "mean_token_accuracy": 0.711589552462101, "num_tokens": 7298667.0, "step": 9066 }, { "epoch": 2.4014830508474576, "grad_norm": 1.7219974994659424, "learning_rate": 8.799390889830508e-06, "loss": 1.6592, "mean_token_accuracy": 0.6281329654157162, "num_tokens": 7300402.0, "step": 9068 }, { "epoch": 2.4020127118644066, "grad_norm": 1.4384974241256714, "learning_rate": 8.799126059322035e-06, "loss": 1.1992, "mean_token_accuracy": 0.7255934998393059, "num_tokens": 7302089.0, "step": 9070 }, { "epoch": 2.402542372881356, "grad_norm": 1.321983814239502, "learning_rate": 8.79886122881356e-06, "loss": 1.1475, "mean_token_accuracy": 0.7348652333021164, "num_tokens": 7303803.0, "step": 9072 }, { "epoch": 2.403072033898305, "grad_norm": 2.403853416442871, "learning_rate": 8.798596398305087e-06, "loss": 1.6558, "mean_token_accuracy": 0.636256568133831, "num_tokens": 7305085.0, "step": 9074 }, { "epoch": 2.4036016949152543, "grad_norm": 1.7761383056640625, "learning_rate": 8.798331567796612e-06, "loss": 1.1675, "mean_token_accuracy": 0.72158033400774, "num_tokens": 7306393.0, "step": 9076 }, { "epoch": 2.4041313559322033, "grad_norm": 1.080453634262085, "learning_rate": 8.798066737288136e-06, "loss": 0.8391, "mean_token_accuracy": 0.7867728695273399, "num_tokens": 7308367.0, "step": 9078 }, { "epoch": 2.4046610169491527, "grad_norm": 1.7554152011871338, "learning_rate": 8.797801906779661e-06, "loss": 1.49, "mean_token_accuracy": 0.6738824397325516, "num_tokens": 7310084.0, "step": 9080 }, { "epoch": 2.4051906779661016, "grad_norm": 1.5448050498962402, "learning_rate": 8.797537076271188e-06, "loss": 1.34, "mean_token_accuracy": 0.6859096586704254, "num_tokens": 7311936.0, "step": 9082 }, { "epoch": 2.405720338983051, "grad_norm": 1.5088167190551758, "learning_rate": 8.797272245762713e-06, "loss": 1.299, "mean_token_accuracy": 0.7038649618625641, "num_tokens": 7313685.0, "step": 9084 }, { "epoch": 2.40625, "grad_norm": 1.964133620262146, "learning_rate": 8.797007415254238e-06, "loss": 1.461, "mean_token_accuracy": 0.6656758710741997, "num_tokens": 7315280.0, "step": 9086 }, { "epoch": 2.406779661016949, "grad_norm": 1.8010270595550537, "learning_rate": 8.796742584745763e-06, "loss": 1.1725, "mean_token_accuracy": 0.7197775915265083, "num_tokens": 7316799.0, "step": 9088 }, { "epoch": 2.4073093220338984, "grad_norm": 1.6803970336914062, "learning_rate": 8.79647775423729e-06, "loss": 0.9652, "mean_token_accuracy": 0.786336749792099, "num_tokens": 7318364.0, "step": 9090 }, { "epoch": 2.4078389830508473, "grad_norm": 1.8660730123519897, "learning_rate": 8.796212923728814e-06, "loss": 1.6503, "mean_token_accuracy": 0.6516550406813622, "num_tokens": 7320142.0, "step": 9092 }, { "epoch": 2.4083686440677967, "grad_norm": 1.9108232259750366, "learning_rate": 8.79594809322034e-06, "loss": 1.0237, "mean_token_accuracy": 0.7373803332448006, "num_tokens": 7321822.0, "step": 9094 }, { "epoch": 2.4088983050847457, "grad_norm": 1.821352243423462, "learning_rate": 8.795683262711864e-06, "loss": 1.3363, "mean_token_accuracy": 0.7179295569658279, "num_tokens": 7323648.0, "step": 9096 }, { "epoch": 2.409427966101695, "grad_norm": 2.5606486797332764, "learning_rate": 8.795418432203391e-06, "loss": 1.6771, "mean_token_accuracy": 0.6432164162397385, "num_tokens": 7325142.0, "step": 9098 }, { "epoch": 2.409957627118644, "grad_norm": 1.5469461679458618, "learning_rate": 8.795153601694916e-06, "loss": 0.9798, "mean_token_accuracy": 0.7545370683073997, "num_tokens": 7327059.0, "step": 9100 }, { "epoch": 2.4104872881355934, "grad_norm": 1.6333445310592651, "learning_rate": 8.794888771186442e-06, "loss": 1.2243, "mean_token_accuracy": 0.717278003692627, "num_tokens": 7329037.0, "step": 9102 }, { "epoch": 2.4110169491525424, "grad_norm": 1.4432690143585205, "learning_rate": 8.794623940677967e-06, "loss": 1.4546, "mean_token_accuracy": 0.6693461276590824, "num_tokens": 7330925.0, "step": 9104 }, { "epoch": 2.4115466101694913, "grad_norm": 2.235945224761963, "learning_rate": 8.794359110169492e-06, "loss": 1.6753, "mean_token_accuracy": 0.6449730917811394, "num_tokens": 7332550.0, "step": 9106 }, { "epoch": 2.4120762711864407, "grad_norm": 1.916438341140747, "learning_rate": 8.794094279661017e-06, "loss": 1.4049, "mean_token_accuracy": 0.7021566182374954, "num_tokens": 7333882.0, "step": 9108 }, { "epoch": 2.4126059322033897, "grad_norm": 1.6631773710250854, "learning_rate": 8.793829449152544e-06, "loss": 1.2274, "mean_token_accuracy": 0.7083575055003166, "num_tokens": 7335337.0, "step": 9110 }, { "epoch": 2.413135593220339, "grad_norm": 2.8051364421844482, "learning_rate": 8.793564618644069e-06, "loss": 1.5201, "mean_token_accuracy": 0.6443722769618034, "num_tokens": 7336976.0, "step": 9112 }, { "epoch": 2.413665254237288, "grad_norm": 2.038419485092163, "learning_rate": 8.793299788135594e-06, "loss": 1.2969, "mean_token_accuracy": 0.7050881087779999, "num_tokens": 7338311.0, "step": 9114 }, { "epoch": 2.4141949152542375, "grad_norm": 1.8972703218460083, "learning_rate": 8.793034957627119e-06, "loss": 1.8042, "mean_token_accuracy": 0.610909715294838, "num_tokens": 7340018.0, "step": 9116 }, { "epoch": 2.4147245762711864, "grad_norm": 1.9830565452575684, "learning_rate": 8.792770127118645e-06, "loss": 1.2313, "mean_token_accuracy": 0.705268494784832, "num_tokens": 7341535.0, "step": 9118 }, { "epoch": 2.415254237288136, "grad_norm": 1.956041932106018, "learning_rate": 8.79250529661017e-06, "loss": 1.3294, "mean_token_accuracy": 0.7099040895700455, "num_tokens": 7342895.0, "step": 9120 }, { "epoch": 2.4157838983050848, "grad_norm": 1.6673728227615356, "learning_rate": 8.792240466101695e-06, "loss": 1.3311, "mean_token_accuracy": 0.6986021548509598, "num_tokens": 7344315.0, "step": 9122 }, { "epoch": 2.4163135593220337, "grad_norm": 1.76470947265625, "learning_rate": 8.79197563559322e-06, "loss": 1.1487, "mean_token_accuracy": 0.7182519286870956, "num_tokens": 7345901.0, "step": 9124 }, { "epoch": 2.416843220338983, "grad_norm": 1.738784670829773, "learning_rate": 8.791710805084747e-06, "loss": 1.488, "mean_token_accuracy": 0.6738742738962173, "num_tokens": 7347405.0, "step": 9126 }, { "epoch": 2.417372881355932, "grad_norm": 1.6400741338729858, "learning_rate": 8.791445974576272e-06, "loss": 1.0074, "mean_token_accuracy": 0.7399731054902077, "num_tokens": 7348971.0, "step": 9128 }, { "epoch": 2.4179025423728815, "grad_norm": 1.5314688682556152, "learning_rate": 8.791181144067798e-06, "loss": 1.0061, "mean_token_accuracy": 0.7557380497455597, "num_tokens": 7350482.0, "step": 9130 }, { "epoch": 2.4184322033898304, "grad_norm": 1.7143690586090088, "learning_rate": 8.790916313559321e-06, "loss": 1.1509, "mean_token_accuracy": 0.7357307001948357, "num_tokens": 7352091.0, "step": 9132 }, { "epoch": 2.41896186440678, "grad_norm": 1.4539202451705933, "learning_rate": 8.790651483050848e-06, "loss": 1.3569, "mean_token_accuracy": 0.6882645934820175, "num_tokens": 7353767.0, "step": 9134 }, { "epoch": 2.419491525423729, "grad_norm": 1.9988303184509277, "learning_rate": 8.790386652542373e-06, "loss": 1.3947, "mean_token_accuracy": 0.6749027818441391, "num_tokens": 7355209.0, "step": 9136 }, { "epoch": 2.420021186440678, "grad_norm": 1.7598204612731934, "learning_rate": 8.7901218220339e-06, "loss": 1.4382, "mean_token_accuracy": 0.709301769733429, "num_tokens": 7357186.0, "step": 9138 }, { "epoch": 2.420550847457627, "grad_norm": 1.5261937379837036, "learning_rate": 8.789856991525425e-06, "loss": 1.0384, "mean_token_accuracy": 0.7528772503137589, "num_tokens": 7358677.0, "step": 9140 }, { "epoch": 2.421080508474576, "grad_norm": 1.9792453050613403, "learning_rate": 8.78959216101695e-06, "loss": 1.1324, "mean_token_accuracy": 0.7420996502041817, "num_tokens": 7360088.0, "step": 9142 }, { "epoch": 2.4216101694915255, "grad_norm": 1.6520277261734009, "learning_rate": 8.789327330508474e-06, "loss": 1.2275, "mean_token_accuracy": 0.7141694873571396, "num_tokens": 7361834.0, "step": 9144 }, { "epoch": 2.4221398305084745, "grad_norm": 1.6274585723876953, "learning_rate": 8.789062500000001e-06, "loss": 1.4247, "mean_token_accuracy": 0.6799116656184196, "num_tokens": 7363323.0, "step": 9146 }, { "epoch": 2.422669491525424, "grad_norm": 1.6308015584945679, "learning_rate": 8.788797669491526e-06, "loss": 1.4672, "mean_token_accuracy": 0.7088970243930817, "num_tokens": 7364958.0, "step": 9148 }, { "epoch": 2.423199152542373, "grad_norm": 1.9409537315368652, "learning_rate": 8.788532838983051e-06, "loss": 1.607, "mean_token_accuracy": 0.6385196819901466, "num_tokens": 7366726.0, "step": 9150 }, { "epoch": 2.423728813559322, "grad_norm": 1.5446370840072632, "learning_rate": 8.788268008474578e-06, "loss": 1.2778, "mean_token_accuracy": 0.7045823782682419, "num_tokens": 7368259.0, "step": 9152 }, { "epoch": 2.424258474576271, "grad_norm": 1.5724154710769653, "learning_rate": 8.788003177966102e-06, "loss": 1.2711, "mean_token_accuracy": 0.7174918353557587, "num_tokens": 7369730.0, "step": 9154 }, { "epoch": 2.42478813559322, "grad_norm": 1.5203839540481567, "learning_rate": 8.787738347457629e-06, "loss": 1.1197, "mean_token_accuracy": 0.7402953207492828, "num_tokens": 7371492.0, "step": 9156 }, { "epoch": 2.4253177966101696, "grad_norm": 2.0046017169952393, "learning_rate": 8.787473516949154e-06, "loss": 1.4607, "mean_token_accuracy": 0.6721339486539364, "num_tokens": 7372920.0, "step": 9158 }, { "epoch": 2.4258474576271185, "grad_norm": 1.7613250017166138, "learning_rate": 8.787208686440679e-06, "loss": 1.3111, "mean_token_accuracy": 0.68891716375947, "num_tokens": 7374710.0, "step": 9160 }, { "epoch": 2.426377118644068, "grad_norm": 1.5274620056152344, "learning_rate": 8.786943855932204e-06, "loss": 1.2015, "mean_token_accuracy": 0.7229928895831108, "num_tokens": 7377070.0, "step": 9162 }, { "epoch": 2.426906779661017, "grad_norm": 1.5558377504348755, "learning_rate": 8.78667902542373e-06, "loss": 1.0715, "mean_token_accuracy": 0.7379346862435341, "num_tokens": 7378817.0, "step": 9164 }, { "epoch": 2.4274364406779663, "grad_norm": 1.569728970527649, "learning_rate": 8.786414194915255e-06, "loss": 0.9355, "mean_token_accuracy": 0.7694828733801842, "num_tokens": 7380321.0, "step": 9166 }, { "epoch": 2.4279661016949152, "grad_norm": 1.9258227348327637, "learning_rate": 8.78614936440678e-06, "loss": 1.1946, "mean_token_accuracy": 0.7381193712353706, "num_tokens": 7381607.0, "step": 9168 }, { "epoch": 2.428495762711864, "grad_norm": 1.5699975490570068, "learning_rate": 8.785884533898305e-06, "loss": 1.466, "mean_token_accuracy": 0.6323963478207588, "num_tokens": 7383450.0, "step": 9170 }, { "epoch": 2.4290254237288136, "grad_norm": 1.497679352760315, "learning_rate": 8.785619703389832e-06, "loss": 1.1688, "mean_token_accuracy": 0.7402315065264702, "num_tokens": 7384995.0, "step": 9172 }, { "epoch": 2.4295550847457625, "grad_norm": 1.582732915878296, "learning_rate": 8.785354872881357e-06, "loss": 1.1788, "mean_token_accuracy": 0.7465911209583282, "num_tokens": 7386519.0, "step": 9174 }, { "epoch": 2.430084745762712, "grad_norm": 1.4342297315597534, "learning_rate": 8.785090042372882e-06, "loss": 1.0788, "mean_token_accuracy": 0.7161101549863815, "num_tokens": 7388204.0, "step": 9176 }, { "epoch": 2.430614406779661, "grad_norm": 1.9652589559555054, "learning_rate": 8.784825211864407e-06, "loss": 1.7333, "mean_token_accuracy": 0.6330838017165661, "num_tokens": 7389684.0, "step": 9178 }, { "epoch": 2.4311440677966103, "grad_norm": 1.995063066482544, "learning_rate": 8.784560381355933e-06, "loss": 1.4267, "mean_token_accuracy": 0.6763480752706528, "num_tokens": 7391047.0, "step": 9180 }, { "epoch": 2.4316737288135593, "grad_norm": 1.4063646793365479, "learning_rate": 8.784295550847458e-06, "loss": 1.2747, "mean_token_accuracy": 0.7299267649650574, "num_tokens": 7392773.0, "step": 9182 }, { "epoch": 2.4322033898305087, "grad_norm": 2.1719534397125244, "learning_rate": 8.784030720338985e-06, "loss": 1.467, "mean_token_accuracy": 0.6650865375995636, "num_tokens": 7394179.0, "step": 9184 }, { "epoch": 2.4327330508474576, "grad_norm": 1.5848159790039062, "learning_rate": 8.783765889830508e-06, "loss": 1.2787, "mean_token_accuracy": 0.7238948121666908, "num_tokens": 7395795.0, "step": 9186 }, { "epoch": 2.4332627118644066, "grad_norm": 1.3812754154205322, "learning_rate": 8.783501059322035e-06, "loss": 0.788, "mean_token_accuracy": 0.7947943359613419, "num_tokens": 7397315.0, "step": 9188 }, { "epoch": 2.433792372881356, "grad_norm": 1.6983314752578735, "learning_rate": 8.78323622881356e-06, "loss": 1.3194, "mean_token_accuracy": 0.6917547285556793, "num_tokens": 7398901.0, "step": 9190 }, { "epoch": 2.434322033898305, "grad_norm": 1.7832790613174438, "learning_rate": 8.782971398305086e-06, "loss": 0.9572, "mean_token_accuracy": 0.7681620419025421, "num_tokens": 7400192.0, "step": 9192 }, { "epoch": 2.4348516949152543, "grad_norm": 1.7941200733184814, "learning_rate": 8.782706567796611e-06, "loss": 1.1969, "mean_token_accuracy": 0.7325832322239876, "num_tokens": 7401510.0, "step": 9194 }, { "epoch": 2.4353813559322033, "grad_norm": 2.0772786140441895, "learning_rate": 8.782441737288136e-06, "loss": 1.4595, "mean_token_accuracy": 0.678919829428196, "num_tokens": 7402828.0, "step": 9196 }, { "epoch": 2.4359110169491527, "grad_norm": 1.6257565021514893, "learning_rate": 8.782176906779661e-06, "loss": 1.1461, "mean_token_accuracy": 0.7202297300100327, "num_tokens": 7405098.0, "step": 9198 }, { "epoch": 2.4364406779661016, "grad_norm": 2.0994510650634766, "learning_rate": 8.781912076271188e-06, "loss": 1.7596, "mean_token_accuracy": 0.6100983954966068, "num_tokens": 7406618.0, "step": 9200 }, { "epoch": 2.436970338983051, "grad_norm": 1.6516836881637573, "learning_rate": 8.781647245762713e-06, "loss": 0.7846, "mean_token_accuracy": 0.8015391454100609, "num_tokens": 7408019.0, "step": 9202 }, { "epoch": 2.4375, "grad_norm": 1.8468360900878906, "learning_rate": 8.781382415254238e-06, "loss": 1.4301, "mean_token_accuracy": 0.6727654747664928, "num_tokens": 7409618.0, "step": 9204 }, { "epoch": 2.438029661016949, "grad_norm": 1.7288944721221924, "learning_rate": 8.781117584745762e-06, "loss": 1.08, "mean_token_accuracy": 0.7290955632925034, "num_tokens": 7411089.0, "step": 9206 }, { "epoch": 2.4385593220338984, "grad_norm": 2.0598042011260986, "learning_rate": 8.780852754237289e-06, "loss": 1.6124, "mean_token_accuracy": 0.6548548638820648, "num_tokens": 7412858.0, "step": 9208 }, { "epoch": 2.4390889830508473, "grad_norm": 1.6243774890899658, "learning_rate": 8.780587923728814e-06, "loss": 1.1017, "mean_token_accuracy": 0.739852711558342, "num_tokens": 7414582.0, "step": 9210 }, { "epoch": 2.4396186440677967, "grad_norm": 1.9736242294311523, "learning_rate": 8.78032309322034e-06, "loss": 1.9015, "mean_token_accuracy": 0.6208673715591431, "num_tokens": 7416107.0, "step": 9212 }, { "epoch": 2.4401483050847457, "grad_norm": 1.7024585008621216, "learning_rate": 8.780058262711864e-06, "loss": 1.1484, "mean_token_accuracy": 0.721815288066864, "num_tokens": 7417860.0, "step": 9214 }, { "epoch": 2.440677966101695, "grad_norm": 2.0716593265533447, "learning_rate": 8.77979343220339e-06, "loss": 1.6423, "mean_token_accuracy": 0.6423930637538433, "num_tokens": 7419405.0, "step": 9216 }, { "epoch": 2.441207627118644, "grad_norm": 1.6749173402786255, "learning_rate": 8.779528601694915e-06, "loss": 0.9881, "mean_token_accuracy": 0.7689717635512352, "num_tokens": 7420883.0, "step": 9218 }, { "epoch": 2.4417372881355934, "grad_norm": 1.3906705379486084, "learning_rate": 8.779263771186442e-06, "loss": 1.0071, "mean_token_accuracy": 0.7657370269298553, "num_tokens": 7422413.0, "step": 9220 }, { "epoch": 2.4422669491525424, "grad_norm": 1.5978810787200928, "learning_rate": 8.778998940677967e-06, "loss": 1.529, "mean_token_accuracy": 0.6603913903236389, "num_tokens": 7424095.0, "step": 9222 }, { "epoch": 2.4427966101694913, "grad_norm": 1.4035372734069824, "learning_rate": 8.778734110169492e-06, "loss": 1.1338, "mean_token_accuracy": 0.7292569354176521, "num_tokens": 7425640.0, "step": 9224 }, { "epoch": 2.4433262711864407, "grad_norm": 1.792243480682373, "learning_rate": 8.778469279661017e-06, "loss": 1.1364, "mean_token_accuracy": 0.7164763957262039, "num_tokens": 7427349.0, "step": 9226 }, { "epoch": 2.4438559322033897, "grad_norm": 1.7204669713974, "learning_rate": 8.778204449152543e-06, "loss": 1.3418, "mean_token_accuracy": 0.694894403219223, "num_tokens": 7428966.0, "step": 9228 }, { "epoch": 2.444385593220339, "grad_norm": 1.759978175163269, "learning_rate": 8.777939618644068e-06, "loss": 1.1864, "mean_token_accuracy": 0.7302655354142189, "num_tokens": 7430398.0, "step": 9230 }, { "epoch": 2.444915254237288, "grad_norm": 1.8133070468902588, "learning_rate": 8.777674788135593e-06, "loss": 1.6066, "mean_token_accuracy": 0.6746312081813812, "num_tokens": 7431992.0, "step": 9232 }, { "epoch": 2.4454449152542375, "grad_norm": 2.0216972827911377, "learning_rate": 8.77740995762712e-06, "loss": 1.005, "mean_token_accuracy": 0.7426271215081215, "num_tokens": 7433473.0, "step": 9234 }, { "epoch": 2.4459745762711864, "grad_norm": 1.9616189002990723, "learning_rate": 8.777145127118645e-06, "loss": 1.4317, "mean_token_accuracy": 0.6672087907791138, "num_tokens": 7435262.0, "step": 9236 }, { "epoch": 2.446504237288136, "grad_norm": 1.709226131439209, "learning_rate": 8.776880296610171e-06, "loss": 1.1077, "mean_token_accuracy": 0.7523331940174103, "num_tokens": 7436658.0, "step": 9238 }, { "epoch": 2.4470338983050848, "grad_norm": 1.642844557762146, "learning_rate": 8.776615466101695e-06, "loss": 1.3406, "mean_token_accuracy": 0.6973482295870781, "num_tokens": 7438226.0, "step": 9240 }, { "epoch": 2.4475635593220337, "grad_norm": 1.7024943828582764, "learning_rate": 8.776350635593221e-06, "loss": 1.4401, "mean_token_accuracy": 0.7000027000904083, "num_tokens": 7439517.0, "step": 9242 }, { "epoch": 2.448093220338983, "grad_norm": 1.8207288980484009, "learning_rate": 8.776085805084746e-06, "loss": 1.4847, "mean_token_accuracy": 0.6794462651014328, "num_tokens": 7441000.0, "step": 9244 }, { "epoch": 2.448622881355932, "grad_norm": 2.0299248695373535, "learning_rate": 8.775820974576273e-06, "loss": 1.4241, "mean_token_accuracy": 0.7052666395902634, "num_tokens": 7442470.0, "step": 9246 }, { "epoch": 2.4491525423728815, "grad_norm": 1.9088053703308105, "learning_rate": 8.775556144067798e-06, "loss": 1.5428, "mean_token_accuracy": 0.687858946621418, "num_tokens": 7443953.0, "step": 9248 }, { "epoch": 2.4496822033898304, "grad_norm": 1.695632815361023, "learning_rate": 8.775291313559323e-06, "loss": 1.4703, "step": 9250 }, { "epoch": 2.4496822033898304, "eval_loss": 1.3119611740112305, "eval_mean_token_accuracy": 0.6999636559517353, "eval_num_tokens": 7446274.0, "eval_runtime": 48.0692, "eval_samples_per_second": 6.407, "eval_steps_per_second": 6.407, "step": 9250 }, { "epoch": 2.45021186440678, "grad_norm": 2.12825345993042, "learning_rate": 8.775026483050848e-06, "loss": 1.6181, "mean_token_accuracy": 0.6536988448351622, "num_tokens": 7447887.0, "step": 9252 }, { "epoch": 2.450741525423729, "grad_norm": 2.033748149871826, "learning_rate": 8.774761652542374e-06, "loss": 1.3536, "mean_token_accuracy": 0.6706097051501274, "num_tokens": 7449299.0, "step": 9254 }, { "epoch": 2.451271186440678, "grad_norm": 1.637803077697754, "learning_rate": 8.7744968220339e-06, "loss": 1.3484, "mean_token_accuracy": 0.7056309431791306, "num_tokens": 7451096.0, "step": 9256 }, { "epoch": 2.451800847457627, "grad_norm": 1.6820180416107178, "learning_rate": 8.774231991525424e-06, "loss": 1.0954, "mean_token_accuracy": 0.7306715250015259, "num_tokens": 7452501.0, "step": 9258 }, { "epoch": 2.452330508474576, "grad_norm": 1.4107495546340942, "learning_rate": 8.773967161016949e-06, "loss": 0.9629, "mean_token_accuracy": 0.7572584450244904, "num_tokens": 7454032.0, "step": 9260 }, { "epoch": 2.4528601694915255, "grad_norm": 1.8259892463684082, "learning_rate": 8.773702330508476e-06, "loss": 0.8544, "mean_token_accuracy": 0.7901085019111633, "num_tokens": 7455330.0, "step": 9262 }, { "epoch": 2.4533898305084745, "grad_norm": 1.6397275924682617, "learning_rate": 8.7734375e-06, "loss": 1.1843, "mean_token_accuracy": 0.7193849012255669, "num_tokens": 7457050.0, "step": 9264 }, { "epoch": 2.453919491525424, "grad_norm": 1.8308525085449219, "learning_rate": 8.773172669491527e-06, "loss": 1.1724, "mean_token_accuracy": 0.7463957667350769, "num_tokens": 7458599.0, "step": 9266 }, { "epoch": 2.454449152542373, "grad_norm": 1.5585216283798218, "learning_rate": 8.77290783898305e-06, "loss": 1.4471, "mean_token_accuracy": 0.6822285503149033, "num_tokens": 7460304.0, "step": 9268 }, { "epoch": 2.454978813559322, "grad_norm": 2.2060601711273193, "learning_rate": 8.772643008474577e-06, "loss": 1.2594, "mean_token_accuracy": 0.711109071969986, "num_tokens": 7461596.0, "step": 9270 }, { "epoch": 2.455508474576271, "grad_norm": 2.121204137802124, "learning_rate": 8.772378177966102e-06, "loss": 1.7048, "mean_token_accuracy": 0.6396210864186287, "num_tokens": 7463059.0, "step": 9272 }, { "epoch": 2.45603813559322, "grad_norm": 1.6112642288208008, "learning_rate": 8.772113347457629e-06, "loss": 1.2782, "mean_token_accuracy": 0.7111798822879791, "num_tokens": 7464536.0, "step": 9274 }, { "epoch": 2.4565677966101696, "grad_norm": 1.807739019393921, "learning_rate": 8.771848516949154e-06, "loss": 1.4536, "mean_token_accuracy": 0.6792284846305847, "num_tokens": 7466282.0, "step": 9276 }, { "epoch": 2.4570974576271185, "grad_norm": 1.5143637657165527, "learning_rate": 8.771583686440679e-06, "loss": 1.4648, "mean_token_accuracy": 0.6788873076438904, "num_tokens": 7467933.0, "step": 9278 }, { "epoch": 2.457627118644068, "grad_norm": 1.5419963598251343, "learning_rate": 8.771318855932203e-06, "loss": 0.9867, "mean_token_accuracy": 0.7550051733851433, "num_tokens": 7469539.0, "step": 9280 }, { "epoch": 2.458156779661017, "grad_norm": 1.5587013959884644, "learning_rate": 8.77105402542373e-06, "loss": 0.9606, "mean_token_accuracy": 0.7640242502093315, "num_tokens": 7471262.0, "step": 9282 }, { "epoch": 2.4586864406779663, "grad_norm": 1.6174871921539307, "learning_rate": 8.770789194915255e-06, "loss": 1.0396, "mean_token_accuracy": 0.7569015249609947, "num_tokens": 7472749.0, "step": 9284 }, { "epoch": 2.4592161016949152, "grad_norm": 1.5159038305282593, "learning_rate": 8.77052436440678e-06, "loss": 0.9938, "mean_token_accuracy": 0.7443854212760925, "num_tokens": 7474104.0, "step": 9286 }, { "epoch": 2.459745762711864, "grad_norm": 1.6758556365966797, "learning_rate": 8.770259533898305e-06, "loss": 0.9618, "mean_token_accuracy": 0.7650464326143265, "num_tokens": 7475819.0, "step": 9288 }, { "epoch": 2.4602754237288136, "grad_norm": 1.6444064378738403, "learning_rate": 8.769994703389832e-06, "loss": 1.2556, "mean_token_accuracy": 0.712653785943985, "num_tokens": 7477558.0, "step": 9290 }, { "epoch": 2.4608050847457625, "grad_norm": 1.3479523658752441, "learning_rate": 8.769729872881356e-06, "loss": 1.4605, "mean_token_accuracy": 0.6884385496377945, "num_tokens": 7479803.0, "step": 9292 }, { "epoch": 2.461334745762712, "grad_norm": 1.6056143045425415, "learning_rate": 8.769465042372881e-06, "loss": 0.7213, "mean_token_accuracy": 0.8174151629209518, "num_tokens": 7481234.0, "step": 9294 }, { "epoch": 2.461864406779661, "grad_norm": 1.4508261680603027, "learning_rate": 8.769200211864406e-06, "loss": 1.2564, "mean_token_accuracy": 0.6886316984891891, "num_tokens": 7483208.0, "step": 9296 }, { "epoch": 2.4623940677966103, "grad_norm": 1.8890436887741089, "learning_rate": 8.768935381355933e-06, "loss": 1.3946, "mean_token_accuracy": 0.678131639957428, "num_tokens": 7484699.0, "step": 9298 }, { "epoch": 2.4629237288135593, "grad_norm": 1.4281662702560425, "learning_rate": 8.768670550847458e-06, "loss": 1.1477, "mean_token_accuracy": 0.7272488251328468, "num_tokens": 7486518.0, "step": 9300 }, { "epoch": 2.4634533898305087, "grad_norm": 2.1224586963653564, "learning_rate": 8.768405720338984e-06, "loss": 1.9932, "mean_token_accuracy": 0.5903731770813465, "num_tokens": 7488046.0, "step": 9302 }, { "epoch": 2.4639830508474576, "grad_norm": 1.2485631704330444, "learning_rate": 8.76814088983051e-06, "loss": 1.0316, "mean_token_accuracy": 0.7653576880693436, "num_tokens": 7489444.0, "step": 9304 }, { "epoch": 2.4645127118644066, "grad_norm": 1.6040109395980835, "learning_rate": 8.767876059322034e-06, "loss": 1.362, "mean_token_accuracy": 0.6649560779333115, "num_tokens": 7491230.0, "step": 9306 }, { "epoch": 2.465042372881356, "grad_norm": 2.1258718967437744, "learning_rate": 8.76761122881356e-06, "loss": 1.4974, "mean_token_accuracy": 0.6634852960705757, "num_tokens": 7492676.0, "step": 9308 }, { "epoch": 2.465572033898305, "grad_norm": 1.783362865447998, "learning_rate": 8.767346398305086e-06, "loss": 1.1991, "mean_token_accuracy": 0.70940200984478, "num_tokens": 7494262.0, "step": 9310 }, { "epoch": 2.4661016949152543, "grad_norm": 1.6353238821029663, "learning_rate": 8.76708156779661e-06, "loss": 1.2541, "mean_token_accuracy": 0.7108473628759384, "num_tokens": 7495746.0, "step": 9312 }, { "epoch": 2.4666313559322033, "grad_norm": 2.1311075687408447, "learning_rate": 8.766816737288136e-06, "loss": 1.3642, "mean_token_accuracy": 0.6919130682945251, "num_tokens": 7497311.0, "step": 9314 }, { "epoch": 2.4671610169491527, "grad_norm": 1.8725489377975464, "learning_rate": 8.766551906779662e-06, "loss": 1.3524, "mean_token_accuracy": 0.6940892264246941, "num_tokens": 7499020.0, "step": 9316 }, { "epoch": 2.4676906779661016, "grad_norm": 2.158827781677246, "learning_rate": 8.766287076271187e-06, "loss": 1.5662, "mean_token_accuracy": 0.6460311412811279, "num_tokens": 7500516.0, "step": 9318 }, { "epoch": 2.468220338983051, "grad_norm": 1.6446460485458374, "learning_rate": 8.766022245762714e-06, "loss": 1.1643, "mean_token_accuracy": 0.7329973429441452, "num_tokens": 7502261.0, "step": 9320 }, { "epoch": 2.46875, "grad_norm": 1.849440097808838, "learning_rate": 8.765757415254237e-06, "loss": 1.3207, "mean_token_accuracy": 0.7078996002674103, "num_tokens": 7503904.0, "step": 9322 }, { "epoch": 2.469279661016949, "grad_norm": 1.722598910331726, "learning_rate": 8.765492584745764e-06, "loss": 1.4891, "mean_token_accuracy": 0.6835031881928444, "num_tokens": 7505537.0, "step": 9324 }, { "epoch": 2.4698093220338984, "grad_norm": 1.9063835144042969, "learning_rate": 8.765227754237289e-06, "loss": 1.716, "mean_token_accuracy": 0.6439599096775055, "num_tokens": 7507008.0, "step": 9326 }, { "epoch": 2.4703389830508473, "grad_norm": 1.5210962295532227, "learning_rate": 8.764962923728815e-06, "loss": 1.3221, "mean_token_accuracy": 0.6833614557981491, "num_tokens": 7508607.0, "step": 9328 }, { "epoch": 2.4708686440677967, "grad_norm": 1.9455815553665161, "learning_rate": 8.76469809322034e-06, "loss": 1.6307, "mean_token_accuracy": 0.6227407902479172, "num_tokens": 7510283.0, "step": 9330 }, { "epoch": 2.4713983050847457, "grad_norm": 1.6293727159500122, "learning_rate": 8.764433262711865e-06, "loss": 1.0897, "mean_token_accuracy": 0.745950810611248, "num_tokens": 7512046.0, "step": 9332 }, { "epoch": 2.471927966101695, "grad_norm": 1.8165959119796753, "learning_rate": 8.76416843220339e-06, "loss": 1.708, "mean_token_accuracy": 0.6302641853690147, "num_tokens": 7513540.0, "step": 9334 }, { "epoch": 2.472457627118644, "grad_norm": 1.6210206747055054, "learning_rate": 8.763903601694917e-06, "loss": 1.2998, "mean_token_accuracy": 0.7097479030489922, "num_tokens": 7515266.0, "step": 9336 }, { "epoch": 2.4729872881355934, "grad_norm": 1.5883655548095703, "learning_rate": 8.763638771186442e-06, "loss": 1.0129, "mean_token_accuracy": 0.7543300315737724, "num_tokens": 7516868.0, "step": 9338 }, { "epoch": 2.4735169491525424, "grad_norm": 2.140820264816284, "learning_rate": 8.763373940677967e-06, "loss": 1.9631, "mean_token_accuracy": 0.6051263809204102, "num_tokens": 7518350.0, "step": 9340 }, { "epoch": 2.4740466101694913, "grad_norm": 1.8317855596542358, "learning_rate": 8.763109110169492e-06, "loss": 1.5134, "mean_token_accuracy": 0.6653696298599243, "num_tokens": 7519835.0, "step": 9342 }, { "epoch": 2.4745762711864407, "grad_norm": 1.5768119096755981, "learning_rate": 8.762844279661018e-06, "loss": 1.3612, "mean_token_accuracy": 0.7108829170465469, "num_tokens": 7521350.0, "step": 9344 }, { "epoch": 2.4751059322033897, "grad_norm": 1.65812087059021, "learning_rate": 8.762579449152543e-06, "loss": 1.1364, "mean_token_accuracy": 0.7347629517316818, "num_tokens": 7522907.0, "step": 9346 }, { "epoch": 2.475635593220339, "grad_norm": 1.5098309516906738, "learning_rate": 8.762314618644068e-06, "loss": 1.1972, "mean_token_accuracy": 0.7110578045248985, "num_tokens": 7524460.0, "step": 9348 }, { "epoch": 2.476165254237288, "grad_norm": 1.5924546718597412, "learning_rate": 8.762049788135593e-06, "loss": 1.1844, "mean_token_accuracy": 0.7118270993232727, "num_tokens": 7526227.0, "step": 9350 }, { "epoch": 2.4766949152542375, "grad_norm": 1.4561196565628052, "learning_rate": 8.76178495762712e-06, "loss": 1.2481, "mean_token_accuracy": 0.7106265649199486, "num_tokens": 7527687.0, "step": 9352 }, { "epoch": 2.4772245762711864, "grad_norm": 1.7330965995788574, "learning_rate": 8.761520127118644e-06, "loss": 1.2894, "mean_token_accuracy": 0.7143260985612869, "num_tokens": 7529291.0, "step": 9354 }, { "epoch": 2.477754237288136, "grad_norm": 1.4286483526229858, "learning_rate": 8.761255296610171e-06, "loss": 1.2776, "mean_token_accuracy": 0.7040386125445366, "num_tokens": 7531205.0, "step": 9356 }, { "epoch": 2.4782838983050848, "grad_norm": 2.2101311683654785, "learning_rate": 8.760990466101696e-06, "loss": 1.2939, "mean_token_accuracy": 0.7130190134048462, "num_tokens": 7532613.0, "step": 9358 }, { "epoch": 2.4788135593220337, "grad_norm": 2.067502498626709, "learning_rate": 8.760725635593221e-06, "loss": 1.5665, "mean_token_accuracy": 0.6770105883479118, "num_tokens": 7534071.0, "step": 9360 }, { "epoch": 2.479343220338983, "grad_norm": 2.5961508750915527, "learning_rate": 8.760460805084746e-06, "loss": 1.5198, "mean_token_accuracy": 0.6623163372278214, "num_tokens": 7535270.0, "step": 9362 }, { "epoch": 2.479872881355932, "grad_norm": 1.6027992963790894, "learning_rate": 8.760195974576273e-06, "loss": 1.1405, "mean_token_accuracy": 0.7500542998313904, "num_tokens": 7536626.0, "step": 9364 }, { "epoch": 2.4804025423728815, "grad_norm": 1.6799170970916748, "learning_rate": 8.759931144067797e-06, "loss": 0.9807, "mean_token_accuracy": 0.7486283928155899, "num_tokens": 7538322.0, "step": 9366 }, { "epoch": 2.4809322033898304, "grad_norm": 1.9509143829345703, "learning_rate": 8.759666313559322e-06, "loss": 1.3778, "mean_token_accuracy": 0.6857530996203423, "num_tokens": 7539593.0, "step": 9368 }, { "epoch": 2.48146186440678, "grad_norm": 1.5733839273452759, "learning_rate": 8.759401483050847e-06, "loss": 0.8537, "mean_token_accuracy": 0.7931017726659775, "num_tokens": 7540916.0, "step": 9370 }, { "epoch": 2.481991525423729, "grad_norm": 1.8120996952056885, "learning_rate": 8.759136652542374e-06, "loss": 1.1791, "mean_token_accuracy": 0.7094584107398987, "num_tokens": 7542413.0, "step": 9372 }, { "epoch": 2.482521186440678, "grad_norm": 1.450551986694336, "learning_rate": 8.758871822033899e-06, "loss": 1.2006, "mean_token_accuracy": 0.7079313360154629, "num_tokens": 7544209.0, "step": 9374 }, { "epoch": 2.483050847457627, "grad_norm": 1.524814248085022, "learning_rate": 8.758606991525424e-06, "loss": 0.8942, "mean_token_accuracy": 0.775192953646183, "num_tokens": 7546014.0, "step": 9376 }, { "epoch": 2.483580508474576, "grad_norm": 1.6641182899475098, "learning_rate": 8.758342161016949e-06, "loss": 1.4761, "mean_token_accuracy": 0.692670963704586, "num_tokens": 7547411.0, "step": 9378 }, { "epoch": 2.4841101694915255, "grad_norm": 1.7689942121505737, "learning_rate": 8.758077330508475e-06, "loss": 1.4208, "mean_token_accuracy": 0.6784215122461319, "num_tokens": 7549113.0, "step": 9380 }, { "epoch": 2.4846398305084745, "grad_norm": 1.390734076499939, "learning_rate": 8.7578125e-06, "loss": 1.3633, "mean_token_accuracy": 0.6953999623656273, "num_tokens": 7550760.0, "step": 9382 }, { "epoch": 2.485169491525424, "grad_norm": 2.1298811435699463, "learning_rate": 8.757547669491527e-06, "loss": 1.7273, "mean_token_accuracy": 0.6208637617528439, "num_tokens": 7552359.0, "step": 9384 }, { "epoch": 2.485699152542373, "grad_norm": 1.6921380758285522, "learning_rate": 8.757282838983052e-06, "loss": 1.609, "mean_token_accuracy": 0.6283173561096191, "num_tokens": 7553864.0, "step": 9386 }, { "epoch": 2.486228813559322, "grad_norm": 2.0080087184906006, "learning_rate": 8.757018008474577e-06, "loss": 1.4404, "mean_token_accuracy": 0.6745292246341705, "num_tokens": 7555407.0, "step": 9388 }, { "epoch": 2.486758474576271, "grad_norm": 1.8438173532485962, "learning_rate": 8.756753177966102e-06, "loss": 1.6104, "mean_token_accuracy": 0.6338898912072182, "num_tokens": 7557047.0, "step": 9390 }, { "epoch": 2.48728813559322, "grad_norm": 2.361114263534546, "learning_rate": 8.756488347457628e-06, "loss": 1.552, "mean_token_accuracy": 0.6536708138883114, "num_tokens": 7558546.0, "step": 9392 }, { "epoch": 2.4878177966101696, "grad_norm": 1.7693896293640137, "learning_rate": 8.756223516949153e-06, "loss": 0.9597, "mean_token_accuracy": 0.7730664387345314, "num_tokens": 7560000.0, "step": 9394 }, { "epoch": 2.4883474576271185, "grad_norm": 1.7422488927841187, "learning_rate": 8.755958686440678e-06, "loss": 1.6409, "mean_token_accuracy": 0.6494725272059441, "num_tokens": 7561950.0, "step": 9396 }, { "epoch": 2.488877118644068, "grad_norm": 1.60696280002594, "learning_rate": 8.755693855932203e-06, "loss": 1.2089, "mean_token_accuracy": 0.7169297710061073, "num_tokens": 7563576.0, "step": 9398 }, { "epoch": 2.489406779661017, "grad_norm": 1.509335994720459, "learning_rate": 8.75542902542373e-06, "loss": 1.3068, "mean_token_accuracy": 0.6992611661553383, "num_tokens": 7565056.0, "step": 9400 }, { "epoch": 2.4899364406779663, "grad_norm": 1.5904847383499146, "learning_rate": 8.755164194915255e-06, "loss": 1.3294, "mean_token_accuracy": 0.7039422281086445, "num_tokens": 7566520.0, "step": 9402 }, { "epoch": 2.4904661016949152, "grad_norm": 1.7481920719146729, "learning_rate": 8.75489936440678e-06, "loss": 1.3958, "mean_token_accuracy": 0.6952984035015106, "num_tokens": 7568267.0, "step": 9404 }, { "epoch": 2.490995762711864, "grad_norm": 1.8275566101074219, "learning_rate": 8.754634533898306e-06, "loss": 1.4989, "mean_token_accuracy": 0.684588335454464, "num_tokens": 7569736.0, "step": 9406 }, { "epoch": 2.4915254237288136, "grad_norm": 1.9775465726852417, "learning_rate": 8.754369703389831e-06, "loss": 1.4105, "mean_token_accuracy": 0.7036005184054375, "num_tokens": 7571134.0, "step": 9408 }, { "epoch": 2.4920550847457625, "grad_norm": 1.7660136222839355, "learning_rate": 8.754104872881358e-06, "loss": 1.6154, "mean_token_accuracy": 0.6453648135066032, "num_tokens": 7572730.0, "step": 9410 }, { "epoch": 2.492584745762712, "grad_norm": 1.7474510669708252, "learning_rate": 8.753840042372883e-06, "loss": 1.6499, "mean_token_accuracy": 0.6476164236664772, "num_tokens": 7574268.0, "step": 9412 }, { "epoch": 2.493114406779661, "grad_norm": 1.869032382965088, "learning_rate": 8.753575211864408e-06, "loss": 1.5186, "mean_token_accuracy": 0.6615736708045006, "num_tokens": 7575955.0, "step": 9414 }, { "epoch": 2.4936440677966103, "grad_norm": 1.4175177812576294, "learning_rate": 8.753310381355933e-06, "loss": 1.2112, "mean_token_accuracy": 0.7076563090085983, "num_tokens": 7577679.0, "step": 9416 }, { "epoch": 2.4941737288135593, "grad_norm": 1.8673211336135864, "learning_rate": 8.75304555084746e-06, "loss": 1.5026, "mean_token_accuracy": 0.6894452422857285, "num_tokens": 7579222.0, "step": 9418 }, { "epoch": 2.4947033898305087, "grad_norm": 1.492596983909607, "learning_rate": 8.752780720338984e-06, "loss": 1.1354, "mean_token_accuracy": 0.7267879396677017, "num_tokens": 7580770.0, "step": 9420 }, { "epoch": 2.4952330508474576, "grad_norm": 1.8466144800186157, "learning_rate": 8.752515889830509e-06, "loss": 1.2828, "mean_token_accuracy": 0.7185975983738899, "num_tokens": 7582332.0, "step": 9422 }, { "epoch": 2.4957627118644066, "grad_norm": 1.5543742179870605, "learning_rate": 8.752251059322034e-06, "loss": 1.1918, "mean_token_accuracy": 0.7163454443216324, "num_tokens": 7584016.0, "step": 9424 }, { "epoch": 2.496292372881356, "grad_norm": 1.6227480173110962, "learning_rate": 8.75198622881356e-06, "loss": 1.4894, "mean_token_accuracy": 0.6971356309950352, "num_tokens": 7585769.0, "step": 9426 }, { "epoch": 2.496822033898305, "grad_norm": 1.5631895065307617, "learning_rate": 8.751721398305085e-06, "loss": 1.0763, "mean_token_accuracy": 0.747778408229351, "num_tokens": 7587207.0, "step": 9428 }, { "epoch": 2.4973516949152543, "grad_norm": 1.3776137828826904, "learning_rate": 8.75145656779661e-06, "loss": 1.0242, "mean_token_accuracy": 0.7634067013859749, "num_tokens": 7588875.0, "step": 9430 }, { "epoch": 2.4978813559322033, "grad_norm": 1.8935202360153198, "learning_rate": 8.751191737288135e-06, "loss": 1.0527, "mean_token_accuracy": 0.7543874606490135, "num_tokens": 7590623.0, "step": 9432 }, { "epoch": 2.4984110169491527, "grad_norm": 1.8540138006210327, "learning_rate": 8.750926906779662e-06, "loss": 1.3994, "mean_token_accuracy": 0.6900995671749115, "num_tokens": 7592171.0, "step": 9434 }, { "epoch": 2.4989406779661016, "grad_norm": 1.5424145460128784, "learning_rate": 8.750662076271187e-06, "loss": 0.8094, "mean_token_accuracy": 0.7839893251657486, "num_tokens": 7593599.0, "step": 9436 }, { "epoch": 2.499470338983051, "grad_norm": 1.7403841018676758, "learning_rate": 8.750397245762714e-06, "loss": 1.3491, "mean_token_accuracy": 0.7273761257529259, "num_tokens": 7595102.0, "step": 9438 }, { "epoch": 2.5, "grad_norm": 1.3716604709625244, "learning_rate": 8.750132415254238e-06, "loss": 1.0752, "mean_token_accuracy": 0.7313388362526894, "num_tokens": 7596928.0, "step": 9440 }, { "epoch": 2.500529661016949, "grad_norm": 1.811231255531311, "learning_rate": 8.749867584745763e-06, "loss": 1.4457, "mean_token_accuracy": 0.6751245334744453, "num_tokens": 7598679.0, "step": 9442 }, { "epoch": 2.5010593220338984, "grad_norm": 1.936376690864563, "learning_rate": 8.749602754237288e-06, "loss": 1.1829, "mean_token_accuracy": 0.7356252148747444, "num_tokens": 7600008.0, "step": 9444 }, { "epoch": 2.5015889830508473, "grad_norm": 1.6560543775558472, "learning_rate": 8.749337923728815e-06, "loss": 1.1243, "mean_token_accuracy": 0.7333818897604942, "num_tokens": 7601594.0, "step": 9446 }, { "epoch": 2.5021186440677967, "grad_norm": 1.667930245399475, "learning_rate": 8.74907309322034e-06, "loss": 1.4132, "mean_token_accuracy": 0.6841906830668449, "num_tokens": 7603289.0, "step": 9448 }, { "epoch": 2.5026483050847457, "grad_norm": 1.6298836469650269, "learning_rate": 8.748808262711865e-06, "loss": 1.4507, "mean_token_accuracy": 0.6775838136672974, "num_tokens": 7604913.0, "step": 9450 }, { "epoch": 2.503177966101695, "grad_norm": 2.041975736618042, "learning_rate": 8.74854343220339e-06, "loss": 1.6766, "mean_token_accuracy": 0.6536596342921257, "num_tokens": 7606633.0, "step": 9452 }, { "epoch": 2.503707627118644, "grad_norm": 1.959952473640442, "learning_rate": 8.748278601694916e-06, "loss": 1.6844, "mean_token_accuracy": 0.6365855485200882, "num_tokens": 7607972.0, "step": 9454 }, { "epoch": 2.5042372881355934, "grad_norm": 1.4201518297195435, "learning_rate": 8.748013771186441e-06, "loss": 1.3774, "mean_token_accuracy": 0.7103384733200073, "num_tokens": 7609580.0, "step": 9456 }, { "epoch": 2.5047669491525424, "grad_norm": 1.7917307615280151, "learning_rate": 8.747748940677966e-06, "loss": 1.3756, "mean_token_accuracy": 0.6848414987325668, "num_tokens": 7611252.0, "step": 9458 }, { "epoch": 2.5052966101694913, "grad_norm": 1.6632508039474487, "learning_rate": 8.747484110169491e-06, "loss": 0.9957, "mean_token_accuracy": 0.7479902654886246, "num_tokens": 7612910.0, "step": 9460 }, { "epoch": 2.5058262711864407, "grad_norm": 1.726656436920166, "learning_rate": 8.747219279661018e-06, "loss": 1.1106, "mean_token_accuracy": 0.7163143903017044, "num_tokens": 7614461.0, "step": 9462 }, { "epoch": 2.5063559322033897, "grad_norm": 1.8599662780761719, "learning_rate": 8.746954449152543e-06, "loss": 1.2378, "mean_token_accuracy": 0.701957568526268, "num_tokens": 7615926.0, "step": 9464 }, { "epoch": 2.506885593220339, "grad_norm": 1.5950467586517334, "learning_rate": 8.74668961864407e-06, "loss": 1.4869, "mean_token_accuracy": 0.6453878059983253, "num_tokens": 7617435.0, "step": 9466 }, { "epoch": 2.507415254237288, "grad_norm": 1.7903631925582886, "learning_rate": 8.746424788135594e-06, "loss": 1.5293, "mean_token_accuracy": 0.6630916744470596, "num_tokens": 7618862.0, "step": 9468 }, { "epoch": 2.507944915254237, "grad_norm": 1.6326298713684082, "learning_rate": 8.74615995762712e-06, "loss": 1.4064, "mean_token_accuracy": 0.6776672527194023, "num_tokens": 7620454.0, "step": 9470 }, { "epoch": 2.5084745762711864, "grad_norm": 1.476845383644104, "learning_rate": 8.745895127118644e-06, "loss": 1.2261, "mean_token_accuracy": 0.7054696306586266, "num_tokens": 7622776.0, "step": 9472 }, { "epoch": 2.509004237288136, "grad_norm": 1.8690541982650757, "learning_rate": 8.74563029661017e-06, "loss": 1.4831, "mean_token_accuracy": 0.657558798789978, "num_tokens": 7624357.0, "step": 9474 }, { "epoch": 2.5095338983050848, "grad_norm": 2.047710418701172, "learning_rate": 8.745365466101696e-06, "loss": 1.6156, "mean_token_accuracy": 0.6447734013199806, "num_tokens": 7625992.0, "step": 9476 }, { "epoch": 2.5100635593220337, "grad_norm": 1.568171739578247, "learning_rate": 8.74510063559322e-06, "loss": 1.0226, "mean_token_accuracy": 0.7555339634418488, "num_tokens": 7627450.0, "step": 9478 }, { "epoch": 2.510593220338983, "grad_norm": 1.7138841152191162, "learning_rate": 8.744835805084746e-06, "loss": 1.2407, "mean_token_accuracy": 0.7402229458093643, "num_tokens": 7629132.0, "step": 9480 }, { "epoch": 2.511122881355932, "grad_norm": 1.755199909210205, "learning_rate": 8.744570974576272e-06, "loss": 1.4049, "mean_token_accuracy": 0.6806683987379074, "num_tokens": 7630628.0, "step": 9482 }, { "epoch": 2.5116525423728815, "grad_norm": 1.440342903137207, "learning_rate": 8.744306144067797e-06, "loss": 1.4432, "mean_token_accuracy": 0.6482350118458271, "num_tokens": 7632451.0, "step": 9484 }, { "epoch": 2.5121822033898304, "grad_norm": 1.8479291200637817, "learning_rate": 8.744041313559322e-06, "loss": 1.5139, "mean_token_accuracy": 0.6724726185202599, "num_tokens": 7634302.0, "step": 9486 }, { "epoch": 2.5127118644067794, "grad_norm": 1.639485478401184, "learning_rate": 8.743776483050849e-06, "loss": 1.3547, "mean_token_accuracy": 0.6985343173146248, "num_tokens": 7635975.0, "step": 9488 }, { "epoch": 2.513241525423729, "grad_norm": 1.3509420156478882, "learning_rate": 8.743511652542374e-06, "loss": 1.2839, "mean_token_accuracy": 0.7049970254302025, "num_tokens": 7637555.0, "step": 9490 }, { "epoch": 2.513771186440678, "grad_norm": 1.6856558322906494, "learning_rate": 8.7432468220339e-06, "loss": 1.6121, "mean_token_accuracy": 0.6280004531145096, "num_tokens": 7639259.0, "step": 9492 }, { "epoch": 2.514300847457627, "grad_norm": 2.044874668121338, "learning_rate": 8.742981991525425e-06, "loss": 1.3421, "mean_token_accuracy": 0.691457636654377, "num_tokens": 7640968.0, "step": 9494 }, { "epoch": 2.514830508474576, "grad_norm": 1.6399537324905396, "learning_rate": 8.74271716101695e-06, "loss": 1.2435, "mean_token_accuracy": 0.7272603735327721, "num_tokens": 7642670.0, "step": 9496 }, { "epoch": 2.5153601694915255, "grad_norm": 1.7522119283676147, "learning_rate": 8.742452330508475e-06, "loss": 1.6383, "mean_token_accuracy": 0.6472386568784714, "num_tokens": 7644173.0, "step": 9498 }, { "epoch": 2.5158898305084745, "grad_norm": 1.5632438659667969, "learning_rate": 8.742187500000002e-06, "loss": 0.9423, "step": 9500 }, { "epoch": 2.5158898305084745, "eval_loss": 1.3102375268936157, "eval_mean_token_accuracy": 0.700597571571926, "eval_num_tokens": 7645832.0, "eval_runtime": 48.0682, "eval_samples_per_second": 6.408, "eval_steps_per_second": 6.408, "step": 9500 }, { "epoch": 2.516419491525424, "grad_norm": 1.4685397148132324, "learning_rate": 8.741922669491527e-06, "loss": 0.7799, "mean_token_accuracy": 0.777363084256649, "num_tokens": 7647386.0, "step": 9502 }, { "epoch": 2.516949152542373, "grad_norm": 2.1181061267852783, "learning_rate": 8.741657838983051e-06, "loss": 1.4326, "mean_token_accuracy": 0.7060295045375824, "num_tokens": 7649214.0, "step": 9504 }, { "epoch": 2.517478813559322, "grad_norm": 1.5623632669448853, "learning_rate": 8.741393008474576e-06, "loss": 1.1664, "mean_token_accuracy": 0.7194086126983166, "num_tokens": 7650807.0, "step": 9506 }, { "epoch": 2.518008474576271, "grad_norm": 1.7923580408096313, "learning_rate": 8.741128177966103e-06, "loss": 1.0585, "mean_token_accuracy": 0.7394875027239323, "num_tokens": 7652418.0, "step": 9508 }, { "epoch": 2.5185381355932206, "grad_norm": 1.8625901937484741, "learning_rate": 8.740863347457628e-06, "loss": 1.2679, "mean_token_accuracy": 0.7211181372404099, "num_tokens": 7654171.0, "step": 9510 }, { "epoch": 2.5190677966101696, "grad_norm": 1.524707317352295, "learning_rate": 8.740598516949153e-06, "loss": 0.7716, "mean_token_accuracy": 0.7983533293008804, "num_tokens": 7655534.0, "step": 9512 }, { "epoch": 2.5195974576271185, "grad_norm": 2.145977258682251, "learning_rate": 8.740333686440678e-06, "loss": 1.4825, "mean_token_accuracy": 0.6836447417736053, "num_tokens": 7656933.0, "step": 9514 }, { "epoch": 2.520127118644068, "grad_norm": 1.6880519390106201, "learning_rate": 8.740068855932204e-06, "loss": 0.9415, "mean_token_accuracy": 0.7733772471547127, "num_tokens": 7658519.0, "step": 9516 }, { "epoch": 2.520656779661017, "grad_norm": 1.493484616279602, "learning_rate": 8.73980402542373e-06, "loss": 1.1004, "mean_token_accuracy": 0.7419897764921188, "num_tokens": 7659965.0, "step": 9518 }, { "epoch": 2.5211864406779663, "grad_norm": 1.9848968982696533, "learning_rate": 8.739539194915256e-06, "loss": 1.2563, "mean_token_accuracy": 0.7063976228237152, "num_tokens": 7661466.0, "step": 9520 }, { "epoch": 2.5217161016949152, "grad_norm": 1.6071925163269043, "learning_rate": 8.739274364406781e-06, "loss": 1.0324, "mean_token_accuracy": 0.7448461204767227, "num_tokens": 7663251.0, "step": 9522 }, { "epoch": 2.522245762711864, "grad_norm": 1.625754475593567, "learning_rate": 8.739009533898306e-06, "loss": 1.3008, "mean_token_accuracy": 0.692800872027874, "num_tokens": 7665006.0, "step": 9524 }, { "epoch": 2.5227754237288136, "grad_norm": 1.8235163688659668, "learning_rate": 8.73874470338983e-06, "loss": 1.4593, "mean_token_accuracy": 0.6668164618313313, "num_tokens": 7666811.0, "step": 9526 }, { "epoch": 2.523305084745763, "grad_norm": 2.180687427520752, "learning_rate": 8.738479872881357e-06, "loss": 1.5592, "mean_token_accuracy": 0.6662779375910759, "num_tokens": 7668140.0, "step": 9528 }, { "epoch": 2.523834745762712, "grad_norm": 1.2234094142913818, "learning_rate": 8.738215042372882e-06, "loss": 1.0842, "mean_token_accuracy": 0.7319173291325569, "num_tokens": 7670058.0, "step": 9530 }, { "epoch": 2.524364406779661, "grad_norm": 1.6608892679214478, "learning_rate": 8.737950211864407e-06, "loss": 1.3961, "mean_token_accuracy": 0.6705297417938709, "num_tokens": 7671554.0, "step": 9532 }, { "epoch": 2.5248940677966103, "grad_norm": 1.8911926746368408, "learning_rate": 8.737685381355932e-06, "loss": 1.4131, "mean_token_accuracy": 0.6826171800494194, "num_tokens": 7673146.0, "step": 9534 }, { "epoch": 2.5254237288135593, "grad_norm": 1.6786186695098877, "learning_rate": 8.737420550847459e-06, "loss": 1.4912, "mean_token_accuracy": 0.6514644809067249, "num_tokens": 7674749.0, "step": 9536 }, { "epoch": 2.5259533898305087, "grad_norm": 1.8344346284866333, "learning_rate": 8.737155720338984e-06, "loss": 1.4614, "mean_token_accuracy": 0.6638969406485558, "num_tokens": 7676288.0, "step": 9538 }, { "epoch": 2.5264830508474576, "grad_norm": 2.1278343200683594, "learning_rate": 8.736890889830509e-06, "loss": 1.6036, "mean_token_accuracy": 0.6390415802598, "num_tokens": 7678304.0, "step": 9540 }, { "epoch": 2.5270127118644066, "grad_norm": 1.6335206031799316, "learning_rate": 8.736626059322034e-06, "loss": 1.2499, "mean_token_accuracy": 0.707434818148613, "num_tokens": 7679958.0, "step": 9542 }, { "epoch": 2.527542372881356, "grad_norm": 1.8166970014572144, "learning_rate": 8.73636122881356e-06, "loss": 1.3513, "mean_token_accuracy": 0.691597692668438, "num_tokens": 7681637.0, "step": 9544 }, { "epoch": 2.528072033898305, "grad_norm": 1.8339639902114868, "learning_rate": 8.736096398305085e-06, "loss": 1.4027, "mean_token_accuracy": 0.7164143957197666, "num_tokens": 7683297.0, "step": 9546 }, { "epoch": 2.5286016949152543, "grad_norm": 1.4612113237380981, "learning_rate": 8.735831567796612e-06, "loss": 0.9487, "mean_token_accuracy": 0.7364264130592346, "num_tokens": 7684892.0, "step": 9548 }, { "epoch": 2.5291313559322033, "grad_norm": 1.8616214990615845, "learning_rate": 8.735566737288137e-06, "loss": 1.2242, "mean_token_accuracy": 0.7242445945739746, "num_tokens": 7686480.0, "step": 9550 }, { "epoch": 2.5296610169491527, "grad_norm": 2.086629629135132, "learning_rate": 8.735301906779662e-06, "loss": 1.8673, "mean_token_accuracy": 0.6326394900679588, "num_tokens": 7688010.0, "step": 9552 }, { "epoch": 2.5301906779661016, "grad_norm": 1.6401574611663818, "learning_rate": 8.735037076271187e-06, "loss": 1.5238, "mean_token_accuracy": 0.6630630195140839, "num_tokens": 7689547.0, "step": 9554 }, { "epoch": 2.530720338983051, "grad_norm": 1.7458157539367676, "learning_rate": 8.734772245762713e-06, "loss": 1.3151, "mean_token_accuracy": 0.7011857330799103, "num_tokens": 7691309.0, "step": 9556 }, { "epoch": 2.53125, "grad_norm": 1.791883111000061, "learning_rate": 8.734507415254238e-06, "loss": 1.6685, "mean_token_accuracy": 0.6251071915030479, "num_tokens": 7693156.0, "step": 9558 }, { "epoch": 2.531779661016949, "grad_norm": 1.8987442255020142, "learning_rate": 8.734242584745763e-06, "loss": 1.1526, "mean_token_accuracy": 0.7413547039031982, "num_tokens": 7694639.0, "step": 9560 }, { "epoch": 2.5323093220338984, "grad_norm": 1.7008644342422485, "learning_rate": 8.733977754237288e-06, "loss": 1.2787, "mean_token_accuracy": 0.7073230147361755, "num_tokens": 7696112.0, "step": 9562 }, { "epoch": 2.5328389830508473, "grad_norm": 1.5206146240234375, "learning_rate": 8.733712923728815e-06, "loss": 1.1648, "mean_token_accuracy": 0.7155484259128571, "num_tokens": 7698001.0, "step": 9564 }, { "epoch": 2.5333686440677967, "grad_norm": 1.6596494913101196, "learning_rate": 8.73344809322034e-06, "loss": 1.7743, "mean_token_accuracy": 0.6132197231054306, "num_tokens": 7699936.0, "step": 9566 }, { "epoch": 2.5338983050847457, "grad_norm": 1.635430097579956, "learning_rate": 8.733183262711864e-06, "loss": 1.0506, "mean_token_accuracy": 0.7493221238255501, "num_tokens": 7701302.0, "step": 9568 }, { "epoch": 2.534427966101695, "grad_norm": 1.6194837093353271, "learning_rate": 8.732918432203391e-06, "loss": 0.9739, "mean_token_accuracy": 0.7561537772417068, "num_tokens": 7703034.0, "step": 9570 }, { "epoch": 2.534957627118644, "grad_norm": 2.192042350769043, "learning_rate": 8.732653601694916e-06, "loss": 1.4505, "mean_token_accuracy": 0.6717679612338543, "num_tokens": 7704302.0, "step": 9572 }, { "epoch": 2.5354872881355934, "grad_norm": 1.5163462162017822, "learning_rate": 8.732388771186443e-06, "loss": 0.9772, "mean_token_accuracy": 0.7557749897241592, "num_tokens": 7705842.0, "step": 9574 }, { "epoch": 2.5360169491525424, "grad_norm": 0.9640731811523438, "learning_rate": 8.732123940677968e-06, "loss": 1.9387, "mean_token_accuracy": 0.5884385108947754, "num_tokens": 7708350.0, "step": 9576 }, { "epoch": 2.5365466101694913, "grad_norm": 1.501623511314392, "learning_rate": 8.731859110169492e-06, "loss": 0.8671, "mean_token_accuracy": 0.777910441160202, "num_tokens": 7709974.0, "step": 9578 }, { "epoch": 2.5370762711864407, "grad_norm": 1.797817349433899, "learning_rate": 8.731594279661017e-06, "loss": 1.2522, "mean_token_accuracy": 0.7241685651242733, "num_tokens": 7711553.0, "step": 9580 }, { "epoch": 2.5376059322033897, "grad_norm": 1.4297921657562256, "learning_rate": 8.731329449152544e-06, "loss": 1.2998, "mean_token_accuracy": 0.7276656702160835, "num_tokens": 7713241.0, "step": 9582 }, { "epoch": 2.538135593220339, "grad_norm": 1.7249456644058228, "learning_rate": 8.731064618644069e-06, "loss": 1.5954, "mean_token_accuracy": 0.6509330347180367, "num_tokens": 7714956.0, "step": 9584 }, { "epoch": 2.538665254237288, "grad_norm": 1.8042807579040527, "learning_rate": 8.730799788135594e-06, "loss": 1.6154, "mean_token_accuracy": 0.6907426118850708, "num_tokens": 7716457.0, "step": 9586 }, { "epoch": 2.539194915254237, "grad_norm": 1.6686969995498657, "learning_rate": 8.730534957627119e-06, "loss": 1.2975, "mean_token_accuracy": 0.7154041305184364, "num_tokens": 7717835.0, "step": 9588 }, { "epoch": 2.5397245762711864, "grad_norm": 1.9628998041152954, "learning_rate": 8.730270127118645e-06, "loss": 1.1911, "mean_token_accuracy": 0.718256488442421, "num_tokens": 7719179.0, "step": 9590 }, { "epoch": 2.540254237288136, "grad_norm": 1.311829686164856, "learning_rate": 8.73000529661017e-06, "loss": 1.0318, "mean_token_accuracy": 0.7651084326207638, "num_tokens": 7721062.0, "step": 9592 }, { "epoch": 2.5407838983050848, "grad_norm": 1.8857184648513794, "learning_rate": 8.729740466101695e-06, "loss": 1.3313, "mean_token_accuracy": 0.688728041946888, "num_tokens": 7722619.0, "step": 9594 }, { "epoch": 2.5413135593220337, "grad_norm": 1.8353315591812134, "learning_rate": 8.72947563559322e-06, "loss": 1.2801, "mean_token_accuracy": 0.6897332444787025, "num_tokens": 7724142.0, "step": 9596 }, { "epoch": 2.541843220338983, "grad_norm": 1.7211476564407349, "learning_rate": 8.729210805084747e-06, "loss": 1.3433, "mean_token_accuracy": 0.6910325661301613, "num_tokens": 7725617.0, "step": 9598 }, { "epoch": 2.542372881355932, "grad_norm": 1.4511016607284546, "learning_rate": 8.728945974576272e-06, "loss": 0.9858, "mean_token_accuracy": 0.7407326623797417, "num_tokens": 7727166.0, "step": 9600 }, { "epoch": 2.5429025423728815, "grad_norm": 1.690601110458374, "learning_rate": 8.728681144067798e-06, "loss": 1.3077, "mean_token_accuracy": 0.6925788074731827, "num_tokens": 7728822.0, "step": 9602 }, { "epoch": 2.5434322033898304, "grad_norm": 1.8883861303329468, "learning_rate": 8.728416313559323e-06, "loss": 1.6145, "mean_token_accuracy": 0.6532839313149452, "num_tokens": 7730308.0, "step": 9604 }, { "epoch": 2.5439618644067794, "grad_norm": 1.2569026947021484, "learning_rate": 8.728151483050848e-06, "loss": 1.0349, "mean_token_accuracy": 0.7485036700963974, "num_tokens": 7731835.0, "step": 9606 }, { "epoch": 2.544491525423729, "grad_norm": 1.6444299221038818, "learning_rate": 8.727886652542373e-06, "loss": 1.1129, "mean_token_accuracy": 0.7367011532187462, "num_tokens": 7733544.0, "step": 9608 }, { "epoch": 2.545021186440678, "grad_norm": 1.9396302700042725, "learning_rate": 8.7276218220339e-06, "loss": 1.2749, "mean_token_accuracy": 0.7023898065090179, "num_tokens": 7734986.0, "step": 9610 }, { "epoch": 2.545550847457627, "grad_norm": 2.070436477661133, "learning_rate": 8.727356991525425e-06, "loss": 1.4552, "mean_token_accuracy": 0.6840451583266258, "num_tokens": 7736632.0, "step": 9612 }, { "epoch": 2.546080508474576, "grad_norm": 1.6359832286834717, "learning_rate": 8.72709216101695e-06, "loss": 1.3431, "mean_token_accuracy": 0.6895551607012749, "num_tokens": 7738058.0, "step": 9614 }, { "epoch": 2.5466101694915255, "grad_norm": 1.7574577331542969, "learning_rate": 8.726827330508475e-06, "loss": 1.1346, "mean_token_accuracy": 0.7358538135886192, "num_tokens": 7739843.0, "step": 9616 }, { "epoch": 2.5471398305084745, "grad_norm": 1.485421895980835, "learning_rate": 8.726562500000001e-06, "loss": 1.1703, "mean_token_accuracy": 0.7450710646808147, "num_tokens": 7741729.0, "step": 9618 }, { "epoch": 2.547669491525424, "grad_norm": 1.7615373134613037, "learning_rate": 8.726297669491526e-06, "loss": 0.8845, "mean_token_accuracy": 0.7559923604130745, "num_tokens": 7743116.0, "step": 9620 }, { "epoch": 2.548199152542373, "grad_norm": 1.7842580080032349, "learning_rate": 8.726032838983051e-06, "loss": 1.395, "mean_token_accuracy": 0.7146699205040932, "num_tokens": 7744898.0, "step": 9622 }, { "epoch": 2.548728813559322, "grad_norm": 1.8040156364440918, "learning_rate": 8.725768008474576e-06, "loss": 1.4321, "mean_token_accuracy": 0.6982350721955299, "num_tokens": 7746439.0, "step": 9624 }, { "epoch": 2.549258474576271, "grad_norm": 1.6967554092407227, "learning_rate": 8.725503177966103e-06, "loss": 1.1401, "mean_token_accuracy": 0.7138812243938446, "num_tokens": 7747820.0, "step": 9626 }, { "epoch": 2.5497881355932206, "grad_norm": 1.8108909130096436, "learning_rate": 8.725238347457628e-06, "loss": 1.1204, "mean_token_accuracy": 0.7380436956882477, "num_tokens": 7749285.0, "step": 9628 }, { "epoch": 2.5503177966101696, "grad_norm": 2.152494430541992, "learning_rate": 8.724973516949154e-06, "loss": 1.5123, "mean_token_accuracy": 0.6497642174363136, "num_tokens": 7750639.0, "step": 9630 }, { "epoch": 2.5508474576271185, "grad_norm": 1.5455586910247803, "learning_rate": 8.724708686440679e-06, "loss": 1.4556, "mean_token_accuracy": 0.6746459826827049, "num_tokens": 7752254.0, "step": 9632 }, { "epoch": 2.551377118644068, "grad_norm": 1.6894363164901733, "learning_rate": 8.724443855932204e-06, "loss": 1.3679, "mean_token_accuracy": 0.6985880248248577, "num_tokens": 7754042.0, "step": 9634 }, { "epoch": 2.551906779661017, "grad_norm": 2.058838367462158, "learning_rate": 8.724179025423729e-06, "loss": 1.6239, "mean_token_accuracy": 0.657424695789814, "num_tokens": 7755360.0, "step": 9636 }, { "epoch": 2.5524364406779663, "grad_norm": 1.8648955821990967, "learning_rate": 8.723914194915256e-06, "loss": 1.4693, "mean_token_accuracy": 0.6561135053634644, "num_tokens": 7757093.0, "step": 9638 }, { "epoch": 2.5529661016949152, "grad_norm": 1.5427587032318115, "learning_rate": 8.72364936440678e-06, "loss": 1.2518, "mean_token_accuracy": 0.7181819155812263, "num_tokens": 7758843.0, "step": 9640 }, { "epoch": 2.553495762711864, "grad_norm": 2.001103162765503, "learning_rate": 8.723384533898305e-06, "loss": 1.4268, "mean_token_accuracy": 0.6743717715144157, "num_tokens": 7760342.0, "step": 9642 }, { "epoch": 2.5540254237288136, "grad_norm": 1.638906717300415, "learning_rate": 8.72311970338983e-06, "loss": 1.233, "mean_token_accuracy": 0.737056314945221, "num_tokens": 7761768.0, "step": 9644 }, { "epoch": 2.554555084745763, "grad_norm": 1.8572123050689697, "learning_rate": 8.722854872881357e-06, "loss": 1.1159, "mean_token_accuracy": 0.7263428270816803, "num_tokens": 7763243.0, "step": 9646 }, { "epoch": 2.555084745762712, "grad_norm": 1.722441554069519, "learning_rate": 8.722590042372882e-06, "loss": 1.6475, "mean_token_accuracy": 0.6589669063687325, "num_tokens": 7765022.0, "step": 9648 }, { "epoch": 2.555614406779661, "grad_norm": 1.9505220651626587, "learning_rate": 8.722325211864407e-06, "loss": 1.688, "mean_token_accuracy": 0.6171401962637901, "num_tokens": 7766750.0, "step": 9650 }, { "epoch": 2.5561440677966103, "grad_norm": 1.4210840463638306, "learning_rate": 8.722060381355933e-06, "loss": 1.0981, "mean_token_accuracy": 0.7158292420208454, "num_tokens": 7768637.0, "step": 9652 }, { "epoch": 2.5566737288135593, "grad_norm": 1.8978601694107056, "learning_rate": 8.721795550847458e-06, "loss": 1.2506, "mean_token_accuracy": 0.7129594162106514, "num_tokens": 7770179.0, "step": 9654 }, { "epoch": 2.5572033898305087, "grad_norm": 2.0600826740264893, "learning_rate": 8.721530720338985e-06, "loss": 1.3618, "mean_token_accuracy": 0.7033286765217781, "num_tokens": 7771783.0, "step": 9656 }, { "epoch": 2.5577330508474576, "grad_norm": 1.6236095428466797, "learning_rate": 8.72126588983051e-06, "loss": 1.4794, "mean_token_accuracy": 0.678352952003479, "num_tokens": 7773331.0, "step": 9658 }, { "epoch": 2.5582627118644066, "grad_norm": 1.513353705406189, "learning_rate": 8.721001059322035e-06, "loss": 1.2044, "mean_token_accuracy": 0.720636211335659, "num_tokens": 7774850.0, "step": 9660 }, { "epoch": 2.558792372881356, "grad_norm": 1.345509648323059, "learning_rate": 8.72073622881356e-06, "loss": 1.1967, "mean_token_accuracy": 0.7008682638406754, "num_tokens": 7776379.0, "step": 9662 }, { "epoch": 2.559322033898305, "grad_norm": 1.6679807901382446, "learning_rate": 8.720471398305086e-06, "loss": 1.6587, "mean_token_accuracy": 0.6173527613282204, "num_tokens": 7778143.0, "step": 9664 }, { "epoch": 2.5598516949152543, "grad_norm": 1.822401762008667, "learning_rate": 8.720206567796611e-06, "loss": 1.4501, "mean_token_accuracy": 0.688202440738678, "num_tokens": 7779800.0, "step": 9666 }, { "epoch": 2.5603813559322033, "grad_norm": 1.7672830820083618, "learning_rate": 8.719941737288136e-06, "loss": 1.3275, "mean_token_accuracy": 0.6983940228819847, "num_tokens": 7781323.0, "step": 9668 }, { "epoch": 2.5609110169491527, "grad_norm": 1.4999274015426636, "learning_rate": 8.719676906779661e-06, "loss": 1.2191, "mean_token_accuracy": 0.7305912673473358, "num_tokens": 7783025.0, "step": 9670 }, { "epoch": 2.5614406779661016, "grad_norm": 1.4151809215545654, "learning_rate": 8.719412076271188e-06, "loss": 1.5259, "mean_token_accuracy": 0.6635250225663185, "num_tokens": 7784625.0, "step": 9672 }, { "epoch": 2.561970338983051, "grad_norm": 1.6585800647735596, "learning_rate": 8.719147245762713e-06, "loss": 1.2752, "mean_token_accuracy": 0.7083050981163979, "num_tokens": 7786100.0, "step": 9674 }, { "epoch": 2.5625, "grad_norm": 1.591254472732544, "learning_rate": 8.718882415254238e-06, "loss": 0.9236, "mean_token_accuracy": 0.7808443903923035, "num_tokens": 7787695.0, "step": 9676 }, { "epoch": 2.563029661016949, "grad_norm": 1.7184027433395386, "learning_rate": 8.718617584745763e-06, "loss": 1.3461, "mean_token_accuracy": 0.694513700902462, "num_tokens": 7789352.0, "step": 9678 }, { "epoch": 2.5635593220338984, "grad_norm": 1.5739939212799072, "learning_rate": 8.71835275423729e-06, "loss": 1.2157, "mean_token_accuracy": 0.7010696530342102, "num_tokens": 7790914.0, "step": 9680 }, { "epoch": 2.5640889830508473, "grad_norm": 2.1549298763275146, "learning_rate": 8.718087923728814e-06, "loss": 1.5108, "mean_token_accuracy": 0.6636604815721512, "num_tokens": 7792332.0, "step": 9682 }, { "epoch": 2.5646186440677967, "grad_norm": 1.8576650619506836, "learning_rate": 8.71782309322034e-06, "loss": 1.388, "mean_token_accuracy": 0.6744981706142426, "num_tokens": 7793820.0, "step": 9684 }, { "epoch": 2.5651483050847457, "grad_norm": 1.936790108680725, "learning_rate": 8.717558262711866e-06, "loss": 1.6059, "mean_token_accuracy": 0.6404940895736217, "num_tokens": 7795354.0, "step": 9686 }, { "epoch": 2.565677966101695, "grad_norm": 1.7990840673446655, "learning_rate": 8.71729343220339e-06, "loss": 1.2746, "mean_token_accuracy": 0.7103467881679535, "num_tokens": 7796878.0, "step": 9688 }, { "epoch": 2.566207627118644, "grad_norm": 1.9226411581039429, "learning_rate": 8.717028601694916e-06, "loss": 1.2866, "mean_token_accuracy": 0.7110111229121685, "num_tokens": 7798545.0, "step": 9690 }, { "epoch": 2.5667372881355934, "grad_norm": 1.6896471977233887, "learning_rate": 8.716763771186442e-06, "loss": 1.1105, "mean_token_accuracy": 0.7215417325496674, "num_tokens": 7799949.0, "step": 9692 }, { "epoch": 2.5672669491525424, "grad_norm": 1.607656478881836, "learning_rate": 8.716498940677967e-06, "loss": 1.5285, "mean_token_accuracy": 0.6640746891498566, "num_tokens": 7801800.0, "step": 9694 }, { "epoch": 2.5677966101694913, "grad_norm": 1.251495599746704, "learning_rate": 8.716234110169492e-06, "loss": 1.016, "mean_token_accuracy": 0.7549855187535286, "num_tokens": 7803471.0, "step": 9696 }, { "epoch": 2.5683262711864407, "grad_norm": 1.4452812671661377, "learning_rate": 8.715969279661017e-06, "loss": 1.3241, "mean_token_accuracy": 0.708577424287796, "num_tokens": 7805013.0, "step": 9698 }, { "epoch": 2.5688559322033897, "grad_norm": 1.730847716331482, "learning_rate": 8.715704449152544e-06, "loss": 1.1678, "mean_token_accuracy": 0.7505901381373405, "num_tokens": 7806791.0, "step": 9700 }, { "epoch": 2.569385593220339, "grad_norm": 1.349043846130371, "learning_rate": 8.715439618644069e-06, "loss": 0.7318, "mean_token_accuracy": 0.788488894701004, "num_tokens": 7808323.0, "step": 9702 }, { "epoch": 2.569915254237288, "grad_norm": 1.4881343841552734, "learning_rate": 8.715174788135593e-06, "loss": 1.3199, "mean_token_accuracy": 0.6915945410728455, "num_tokens": 7810027.0, "step": 9704 }, { "epoch": 2.570444915254237, "grad_norm": 1.6326342821121216, "learning_rate": 8.714909957627118e-06, "loss": 1.3226, "mean_token_accuracy": 0.7014452815055847, "num_tokens": 7811893.0, "step": 9706 }, { "epoch": 2.5709745762711864, "grad_norm": 1.8899508714675903, "learning_rate": 8.714645127118645e-06, "loss": 1.5085, "mean_token_accuracy": 0.6461047530174255, "num_tokens": 7813568.0, "step": 9708 }, { "epoch": 2.571504237288136, "grad_norm": 2.1012566089630127, "learning_rate": 8.71438029661017e-06, "loss": 1.6329, "mean_token_accuracy": 0.6404945999383926, "num_tokens": 7815186.0, "step": 9710 }, { "epoch": 2.5720338983050848, "grad_norm": 1.8061128854751587, "learning_rate": 8.714115466101697e-06, "loss": 1.4079, "mean_token_accuracy": 0.6799566000699997, "num_tokens": 7816770.0, "step": 9712 }, { "epoch": 2.5725635593220337, "grad_norm": 1.7629036903381348, "learning_rate": 8.71385063559322e-06, "loss": 1.3793, "mean_token_accuracy": 0.6767646223306656, "num_tokens": 7818361.0, "step": 9714 }, { "epoch": 2.573093220338983, "grad_norm": 1.560542106628418, "learning_rate": 8.713585805084746e-06, "loss": 1.2143, "mean_token_accuracy": 0.7159314155578613, "num_tokens": 7820010.0, "step": 9716 }, { "epoch": 2.573622881355932, "grad_norm": 1.3504492044448853, "learning_rate": 8.713320974576271e-06, "loss": 1.0267, "mean_token_accuracy": 0.7390694320201874, "num_tokens": 7821736.0, "step": 9718 }, { "epoch": 2.5741525423728815, "grad_norm": 1.4879122972488403, "learning_rate": 8.713056144067798e-06, "loss": 1.0898, "mean_token_accuracy": 0.7121593952178955, "num_tokens": 7823245.0, "step": 9720 }, { "epoch": 2.5746822033898304, "grad_norm": 1.6163034439086914, "learning_rate": 8.712791313559323e-06, "loss": 0.927, "mean_token_accuracy": 0.7725045159459114, "num_tokens": 7824674.0, "step": 9722 }, { "epoch": 2.5752118644067794, "grad_norm": 1.7873634099960327, "learning_rate": 8.712526483050848e-06, "loss": 1.3724, "mean_token_accuracy": 0.6912676692008972, "num_tokens": 7826089.0, "step": 9724 }, { "epoch": 2.575741525423729, "grad_norm": 2.194344997406006, "learning_rate": 8.712261652542373e-06, "loss": 1.6111, "mean_token_accuracy": 0.6505795568227768, "num_tokens": 7827675.0, "step": 9726 }, { "epoch": 2.576271186440678, "grad_norm": 1.6271344423294067, "learning_rate": 8.7119968220339e-06, "loss": 1.2038, "mean_token_accuracy": 0.7286977544426918, "num_tokens": 7829152.0, "step": 9728 }, { "epoch": 2.576800847457627, "grad_norm": 1.6029081344604492, "learning_rate": 8.711731991525424e-06, "loss": 1.1244, "mean_token_accuracy": 0.7283727750182152, "num_tokens": 7830531.0, "step": 9730 }, { "epoch": 2.577330508474576, "grad_norm": 1.4717905521392822, "learning_rate": 8.71146716101695e-06, "loss": 1.4734, "mean_token_accuracy": 0.6639296114444733, "num_tokens": 7832437.0, "step": 9732 }, { "epoch": 2.5778601694915255, "grad_norm": 1.6762248277664185, "learning_rate": 8.711202330508474e-06, "loss": 1.8295, "mean_token_accuracy": 0.6283519193530083, "num_tokens": 7834062.0, "step": 9734 }, { "epoch": 2.5783898305084745, "grad_norm": 1.8066143989562988, "learning_rate": 8.7109375e-06, "loss": 1.4148, "mean_token_accuracy": 0.6827561929821968, "num_tokens": 7835828.0, "step": 9736 }, { "epoch": 2.578919491525424, "grad_norm": 1.1742185354232788, "learning_rate": 8.710672669491527e-06, "loss": 1.2178, "mean_token_accuracy": 0.7083275392651558, "num_tokens": 7838116.0, "step": 9738 }, { "epoch": 2.579449152542373, "grad_norm": 1.7272292375564575, "learning_rate": 8.710407838983052e-06, "loss": 1.2102, "mean_token_accuracy": 0.7184580862522125, "num_tokens": 7839491.0, "step": 9740 }, { "epoch": 2.579978813559322, "grad_norm": 1.572195291519165, "learning_rate": 8.710143008474577e-06, "loss": 1.3469, "mean_token_accuracy": 0.6975988671183586, "num_tokens": 7840980.0, "step": 9742 }, { "epoch": 2.580508474576271, "grad_norm": 1.7225035429000854, "learning_rate": 8.709878177966102e-06, "loss": 1.1927, "mean_token_accuracy": 0.7204391360282898, "num_tokens": 7842500.0, "step": 9744 }, { "epoch": 2.5810381355932206, "grad_norm": 1.3175296783447266, "learning_rate": 8.709613347457629e-06, "loss": 1.2971, "mean_token_accuracy": 0.6911330074071884, "num_tokens": 7845187.0, "step": 9746 }, { "epoch": 2.5815677966101696, "grad_norm": 1.7980388402938843, "learning_rate": 8.709348516949154e-06, "loss": 1.6116, "mean_token_accuracy": 0.6540760770440102, "num_tokens": 7846507.0, "step": 9748 }, { "epoch": 2.5820974576271185, "grad_norm": 1.8166934251785278, "learning_rate": 8.709083686440679e-06, "loss": 1.2606, "step": 9750 }, { "epoch": 2.5820974576271185, "eval_loss": 1.3106135129928589, "eval_mean_token_accuracy": 0.7007792992638303, "eval_num_tokens": 7848027.0, "eval_runtime": 48.6116, "eval_samples_per_second": 6.336, "eval_steps_per_second": 6.336, "step": 9750 }, { "epoch": 2.582627118644068, "grad_norm": 1.5389244556427002, "learning_rate": 8.708818855932204e-06, "loss": 1.2929, "mean_token_accuracy": 0.7086402326822281, "num_tokens": 7849949.0, "step": 9752 }, { "epoch": 2.583156779661017, "grad_norm": 1.5910552740097046, "learning_rate": 8.70855402542373e-06, "loss": 1.0751, "mean_token_accuracy": 0.767640195786953, "num_tokens": 7851502.0, "step": 9754 }, { "epoch": 2.5836864406779663, "grad_norm": 1.4062066078186035, "learning_rate": 8.708289194915255e-06, "loss": 1.4778, "mean_token_accuracy": 0.6781368441879749, "num_tokens": 7853371.0, "step": 9756 }, { "epoch": 2.5842161016949152, "grad_norm": 1.8040704727172852, "learning_rate": 8.70802436440678e-06, "loss": 0.9633, "mean_token_accuracy": 0.7531011030077934, "num_tokens": 7854784.0, "step": 9758 }, { "epoch": 2.584745762711864, "grad_norm": 1.228089451789856, "learning_rate": 8.707759533898305e-06, "loss": 1.2132, "mean_token_accuracy": 0.7382836639881134, "num_tokens": 7856649.0, "step": 9760 }, { "epoch": 2.5852754237288136, "grad_norm": 1.4310117959976196, "learning_rate": 8.707494703389832e-06, "loss": 0.8368, "mean_token_accuracy": 0.7945086359977722, "num_tokens": 7858143.0, "step": 9762 }, { "epoch": 2.585805084745763, "grad_norm": 1.9905954599380493, "learning_rate": 8.707229872881357e-06, "loss": 1.3102, "mean_token_accuracy": 0.6989005655050278, "num_tokens": 7859528.0, "step": 9764 }, { "epoch": 2.586334745762712, "grad_norm": 1.970848560333252, "learning_rate": 8.706965042372883e-06, "loss": 1.2845, "mean_token_accuracy": 0.6935681328177452, "num_tokens": 7861368.0, "step": 9766 }, { "epoch": 2.586864406779661, "grad_norm": 1.8644437789916992, "learning_rate": 8.706700211864406e-06, "loss": 1.0987, "mean_token_accuracy": 0.7763486057519913, "num_tokens": 7862805.0, "step": 9768 }, { "epoch": 2.5873940677966103, "grad_norm": 1.7856754064559937, "learning_rate": 8.706435381355933e-06, "loss": 1.5635, "mean_token_accuracy": 0.6683555953204632, "num_tokens": 7864296.0, "step": 9770 }, { "epoch": 2.5879237288135593, "grad_norm": 1.9315497875213623, "learning_rate": 8.706170550847458e-06, "loss": 1.6159, "mean_token_accuracy": 0.666929803788662, "num_tokens": 7865845.0, "step": 9772 }, { "epoch": 2.5884533898305087, "grad_norm": 1.7742680311203003, "learning_rate": 8.705905720338985e-06, "loss": 1.3853, "mean_token_accuracy": 0.6830865778028965, "num_tokens": 7867518.0, "step": 9774 }, { "epoch": 2.5889830508474576, "grad_norm": 1.9535447359085083, "learning_rate": 8.70564088983051e-06, "loss": 1.1839, "mean_token_accuracy": 0.7190019562840462, "num_tokens": 7868988.0, "step": 9776 }, { "epoch": 2.5895127118644066, "grad_norm": 1.7162415981292725, "learning_rate": 8.705376059322035e-06, "loss": 0.9712, "mean_token_accuracy": 0.7535669654607773, "num_tokens": 7870457.0, "step": 9778 }, { "epoch": 2.590042372881356, "grad_norm": 1.710999608039856, "learning_rate": 8.70511122881356e-06, "loss": 0.8642, "mean_token_accuracy": 0.7815884724259377, "num_tokens": 7871871.0, "step": 9780 }, { "epoch": 2.590572033898305, "grad_norm": 1.876312494277954, "learning_rate": 8.704846398305086e-06, "loss": 1.5536, "mean_token_accuracy": 0.6837264820933342, "num_tokens": 7873370.0, "step": 9782 }, { "epoch": 2.5911016949152543, "grad_norm": 1.966059684753418, "learning_rate": 8.704581567796611e-06, "loss": 1.5652, "mean_token_accuracy": 0.6722047924995422, "num_tokens": 7874896.0, "step": 9784 }, { "epoch": 2.5916313559322033, "grad_norm": 1.2458893060684204, "learning_rate": 8.704316737288136e-06, "loss": 0.8076, "mean_token_accuracy": 0.8010148331522942, "num_tokens": 7876556.0, "step": 9786 }, { "epoch": 2.5921610169491527, "grad_norm": 1.8188458681106567, "learning_rate": 8.70405190677966e-06, "loss": 1.6174, "mean_token_accuracy": 0.6433072276413441, "num_tokens": 7878655.0, "step": 9788 }, { "epoch": 2.5926906779661016, "grad_norm": 1.6105968952178955, "learning_rate": 8.703787076271187e-06, "loss": 1.1133, "mean_token_accuracy": 0.7492685317993164, "num_tokens": 7880298.0, "step": 9790 }, { "epoch": 2.593220338983051, "grad_norm": 1.5961073637008667, "learning_rate": 8.703522245762712e-06, "loss": 1.0659, "mean_token_accuracy": 0.7524695619940758, "num_tokens": 7882002.0, "step": 9792 }, { "epoch": 2.59375, "grad_norm": 1.8405544757843018, "learning_rate": 8.703257415254239e-06, "loss": 1.1646, "mean_token_accuracy": 0.7241050451993942, "num_tokens": 7883564.0, "step": 9794 }, { "epoch": 2.594279661016949, "grad_norm": 1.9637556076049805, "learning_rate": 8.702992584745762e-06, "loss": 1.2727, "mean_token_accuracy": 0.719888836145401, "num_tokens": 7885023.0, "step": 9796 }, { "epoch": 2.5948093220338984, "grad_norm": 1.7278833389282227, "learning_rate": 8.702727754237289e-06, "loss": 1.0576, "mean_token_accuracy": 0.7406596094369888, "num_tokens": 7886578.0, "step": 9798 }, { "epoch": 2.5953389830508473, "grad_norm": 2.012373447418213, "learning_rate": 8.702462923728814e-06, "loss": 1.4964, "mean_token_accuracy": 0.6789634488523006, "num_tokens": 7888177.0, "step": 9800 }, { "epoch": 2.5958686440677967, "grad_norm": 2.103879690170288, "learning_rate": 8.70219809322034e-06, "loss": 1.3899, "mean_token_accuracy": 0.6682403907179832, "num_tokens": 7889669.0, "step": 9802 }, { "epoch": 2.5963983050847457, "grad_norm": 1.8935911655426025, "learning_rate": 8.701933262711865e-06, "loss": 1.549, "mean_token_accuracy": 0.6602514237165451, "num_tokens": 7891466.0, "step": 9804 }, { "epoch": 2.596927966101695, "grad_norm": 1.8791793584823608, "learning_rate": 8.70166843220339e-06, "loss": 1.4352, "mean_token_accuracy": 0.6948423832654953, "num_tokens": 7892805.0, "step": 9806 }, { "epoch": 2.597457627118644, "grad_norm": 1.4763022661209106, "learning_rate": 8.701403601694915e-06, "loss": 1.1744, "mean_token_accuracy": 0.7427585050463676, "num_tokens": 7894324.0, "step": 9808 }, { "epoch": 2.5979872881355934, "grad_norm": 1.5625652074813843, "learning_rate": 8.701138771186442e-06, "loss": 1.4846, "mean_token_accuracy": 0.6601403579115868, "num_tokens": 7896014.0, "step": 9810 }, { "epoch": 2.5985169491525424, "grad_norm": 1.7891889810562134, "learning_rate": 8.700873940677967e-06, "loss": 1.0941, "mean_token_accuracy": 0.7347702607512474, "num_tokens": 7897465.0, "step": 9812 }, { "epoch": 2.5990466101694913, "grad_norm": 2.0794153213500977, "learning_rate": 8.700609110169492e-06, "loss": 1.1103, "mean_token_accuracy": 0.7375870794057846, "num_tokens": 7898957.0, "step": 9814 }, { "epoch": 2.5995762711864407, "grad_norm": 1.744626760482788, "learning_rate": 8.700344279661017e-06, "loss": 1.1979, "mean_token_accuracy": 0.7268076539039612, "num_tokens": 7900404.0, "step": 9816 }, { "epoch": 2.6001059322033897, "grad_norm": 1.8507219552993774, "learning_rate": 8.700079449152543e-06, "loss": 1.4989, "mean_token_accuracy": 0.6800781935453415, "num_tokens": 7901728.0, "step": 9818 }, { "epoch": 2.600635593220339, "grad_norm": 1.4526593685150146, "learning_rate": 8.699814618644068e-06, "loss": 1.111, "mean_token_accuracy": 0.7374938055872917, "num_tokens": 7903600.0, "step": 9820 }, { "epoch": 2.601165254237288, "grad_norm": 1.6226919889450073, "learning_rate": 8.699549788135593e-06, "loss": 1.455, "mean_token_accuracy": 0.6775200366973877, "num_tokens": 7905169.0, "step": 9822 }, { "epoch": 2.601694915254237, "grad_norm": 1.585963487625122, "learning_rate": 8.69928495762712e-06, "loss": 1.0581, "mean_token_accuracy": 0.750326007604599, "num_tokens": 7906625.0, "step": 9824 }, { "epoch": 2.6022245762711864, "grad_norm": 2.2727041244506836, "learning_rate": 8.699020127118645e-06, "loss": 1.5627, "mean_token_accuracy": 0.647147249430418, "num_tokens": 7908117.0, "step": 9826 }, { "epoch": 2.602754237288136, "grad_norm": 1.9739986658096313, "learning_rate": 8.698755296610171e-06, "loss": 1.5546, "mean_token_accuracy": 0.6469482257962227, "num_tokens": 7909500.0, "step": 9828 }, { "epoch": 2.6032838983050848, "grad_norm": 2.054687023162842, "learning_rate": 8.698490466101696e-06, "loss": 1.4889, "mean_token_accuracy": 0.6720533221960068, "num_tokens": 7910885.0, "step": 9830 }, { "epoch": 2.6038135593220337, "grad_norm": 1.8100988864898682, "learning_rate": 8.698225635593221e-06, "loss": 1.6144, "mean_token_accuracy": 0.657628308981657, "num_tokens": 7912450.0, "step": 9832 }, { "epoch": 2.604343220338983, "grad_norm": 2.2501375675201416, "learning_rate": 8.697960805084746e-06, "loss": 1.2742, "mean_token_accuracy": 0.7316997274756432, "num_tokens": 7913675.0, "step": 9834 }, { "epoch": 2.604872881355932, "grad_norm": 1.6021100282669067, "learning_rate": 8.697695974576273e-06, "loss": 1.0655, "mean_token_accuracy": 0.737608052790165, "num_tokens": 7915497.0, "step": 9836 }, { "epoch": 2.6054025423728815, "grad_norm": 1.4176156520843506, "learning_rate": 8.697431144067798e-06, "loss": 1.3322, "mean_token_accuracy": 0.7124183624982834, "num_tokens": 7917602.0, "step": 9838 }, { "epoch": 2.6059322033898304, "grad_norm": 2.118288993835449, "learning_rate": 8.697166313559323e-06, "loss": 1.5599, "mean_token_accuracy": 0.6511239036917686, "num_tokens": 7919199.0, "step": 9840 }, { "epoch": 2.6064618644067794, "grad_norm": 1.8328763246536255, "learning_rate": 8.696901483050847e-06, "loss": 1.051, "mean_token_accuracy": 0.7546561360359192, "num_tokens": 7920688.0, "step": 9842 }, { "epoch": 2.606991525423729, "grad_norm": 1.8990845680236816, "learning_rate": 8.696636652542374e-06, "loss": 1.2743, "mean_token_accuracy": 0.6994696706533432, "num_tokens": 7922269.0, "step": 9844 }, { "epoch": 2.607521186440678, "grad_norm": 1.920066237449646, "learning_rate": 8.696371822033899e-06, "loss": 1.66, "mean_token_accuracy": 0.6367221251130104, "num_tokens": 7923768.0, "step": 9846 }, { "epoch": 2.608050847457627, "grad_norm": 1.6735939979553223, "learning_rate": 8.696106991525426e-06, "loss": 1.3586, "mean_token_accuracy": 0.6844296380877495, "num_tokens": 7925124.0, "step": 9848 }, { "epoch": 2.608580508474576, "grad_norm": 1.3555442094802856, "learning_rate": 8.695842161016949e-06, "loss": 1.343, "mean_token_accuracy": 0.7178407609462738, "num_tokens": 7926695.0, "step": 9850 }, { "epoch": 2.6091101694915255, "grad_norm": 1.7499768733978271, "learning_rate": 8.695577330508476e-06, "loss": 1.4346, "mean_token_accuracy": 0.6644283905625343, "num_tokens": 7928458.0, "step": 9852 }, { "epoch": 2.6096398305084745, "grad_norm": 1.8231152296066284, "learning_rate": 8.6953125e-06, "loss": 1.6426, "mean_token_accuracy": 0.6544779390096664, "num_tokens": 7930045.0, "step": 9854 }, { "epoch": 2.610169491525424, "grad_norm": 2.0129270553588867, "learning_rate": 8.695047669491527e-06, "loss": 1.8209, "mean_token_accuracy": 0.6046009808778763, "num_tokens": 7931609.0, "step": 9856 }, { "epoch": 2.610699152542373, "grad_norm": 1.5067236423492432, "learning_rate": 8.694782838983052e-06, "loss": 1.2366, "mean_token_accuracy": 0.702252984046936, "num_tokens": 7933252.0, "step": 9858 }, { "epoch": 2.611228813559322, "grad_norm": 1.741356611251831, "learning_rate": 8.694518008474577e-06, "loss": 1.5701, "mean_token_accuracy": 0.6521699950098991, "num_tokens": 7935021.0, "step": 9860 }, { "epoch": 2.611758474576271, "grad_norm": 1.583255648612976, "learning_rate": 8.694253177966102e-06, "loss": 1.3574, "mean_token_accuracy": 0.7075208202004433, "num_tokens": 7936626.0, "step": 9862 }, { "epoch": 2.6122881355932206, "grad_norm": 1.6855701208114624, "learning_rate": 8.693988347457628e-06, "loss": 1.5148, "mean_token_accuracy": 0.7008310332894325, "num_tokens": 7938020.0, "step": 9864 }, { "epoch": 2.6128177966101696, "grad_norm": 1.5372352600097656, "learning_rate": 8.693723516949153e-06, "loss": 1.0522, "mean_token_accuracy": 0.7534588128328323, "num_tokens": 7939460.0, "step": 9866 }, { "epoch": 2.6133474576271185, "grad_norm": 1.7288384437561035, "learning_rate": 8.693458686440678e-06, "loss": 1.5319, "mean_token_accuracy": 0.6730305328965187, "num_tokens": 7941021.0, "step": 9868 }, { "epoch": 2.613877118644068, "grad_norm": 1.683380126953125, "learning_rate": 8.693193855932203e-06, "loss": 1.6314, "mean_token_accuracy": 0.6585988290607929, "num_tokens": 7942756.0, "step": 9870 }, { "epoch": 2.614406779661017, "grad_norm": 1.549720287322998, "learning_rate": 8.69292902542373e-06, "loss": 1.3318, "mean_token_accuracy": 0.6888143867254257, "num_tokens": 7944274.0, "step": 9872 }, { "epoch": 2.6149364406779663, "grad_norm": 1.7146580219268799, "learning_rate": 8.692664194915255e-06, "loss": 1.3033, "mean_token_accuracy": 0.7107258960604668, "num_tokens": 7945808.0, "step": 9874 }, { "epoch": 2.6154661016949152, "grad_norm": 1.5888336896896362, "learning_rate": 8.69239936440678e-06, "loss": 1.3031, "mean_token_accuracy": 0.67282585054636, "num_tokens": 7947687.0, "step": 9876 }, { "epoch": 2.615995762711864, "grad_norm": 1.6409693956375122, "learning_rate": 8.692134533898305e-06, "loss": 1.3048, "mean_token_accuracy": 0.7103915251791477, "num_tokens": 7949553.0, "step": 9878 }, { "epoch": 2.6165254237288136, "grad_norm": 1.7240514755249023, "learning_rate": 8.691869703389831e-06, "loss": 1.1821, "mean_token_accuracy": 0.7267134711146355, "num_tokens": 7951033.0, "step": 9880 }, { "epoch": 2.617055084745763, "grad_norm": 1.6130412817001343, "learning_rate": 8.691604872881356e-06, "loss": 1.1878, "mean_token_accuracy": 0.7179009318351746, "num_tokens": 7952591.0, "step": 9882 }, { "epoch": 2.617584745762712, "grad_norm": 1.643053650856018, "learning_rate": 8.691340042372883e-06, "loss": 1.3157, "mean_token_accuracy": 0.7199484631419182, "num_tokens": 7954271.0, "step": 9884 }, { "epoch": 2.618114406779661, "grad_norm": 1.7872308492660522, "learning_rate": 8.691075211864408e-06, "loss": 0.986, "mean_token_accuracy": 0.7527619749307632, "num_tokens": 7955870.0, "step": 9886 }, { "epoch": 2.6186440677966103, "grad_norm": 1.770321011543274, "learning_rate": 8.690810381355933e-06, "loss": 1.0448, "mean_token_accuracy": 0.7675196006894112, "num_tokens": 7957446.0, "step": 9888 }, { "epoch": 2.6191737288135593, "grad_norm": 1.6179563999176025, "learning_rate": 8.690545550847458e-06, "loss": 1.353, "mean_token_accuracy": 0.6830219030380249, "num_tokens": 7959049.0, "step": 9890 }, { "epoch": 2.6197033898305087, "grad_norm": 1.6544013023376465, "learning_rate": 8.690280720338984e-06, "loss": 1.0884, "mean_token_accuracy": 0.7484786733984947, "num_tokens": 7960502.0, "step": 9892 }, { "epoch": 2.6202330508474576, "grad_norm": 1.695677638053894, "learning_rate": 8.69001588983051e-06, "loss": 1.1002, "mean_token_accuracy": 0.7312058955430984, "num_tokens": 7962153.0, "step": 9894 }, { "epoch": 2.6207627118644066, "grad_norm": 1.519370675086975, "learning_rate": 8.689751059322034e-06, "loss": 1.2687, "mean_token_accuracy": 0.6879985630512238, "num_tokens": 7963760.0, "step": 9896 }, { "epoch": 2.621292372881356, "grad_norm": 1.6194531917572021, "learning_rate": 8.689486228813559e-06, "loss": 1.1303, "mean_token_accuracy": 0.7305069789290428, "num_tokens": 7965380.0, "step": 9898 }, { "epoch": 2.621822033898305, "grad_norm": 1.4183499813079834, "learning_rate": 8.689221398305086e-06, "loss": 1.1042, "mean_token_accuracy": 0.7208779975771904, "num_tokens": 7967098.0, "step": 9900 }, { "epoch": 2.6223516949152543, "grad_norm": 1.7795358896255493, "learning_rate": 8.68895656779661e-06, "loss": 1.0472, "mean_token_accuracy": 0.7625321075320244, "num_tokens": 7968539.0, "step": 9902 }, { "epoch": 2.6228813559322033, "grad_norm": 1.3949264287948608, "learning_rate": 8.688691737288136e-06, "loss": 1.1526, "mean_token_accuracy": 0.7281123176217079, "num_tokens": 7970483.0, "step": 9904 }, { "epoch": 2.6234110169491527, "grad_norm": 1.62412691116333, "learning_rate": 8.688426906779662e-06, "loss": 1.3026, "mean_token_accuracy": 0.6990257725119591, "num_tokens": 7972233.0, "step": 9906 }, { "epoch": 2.6239406779661016, "grad_norm": 2.0123863220214844, "learning_rate": 8.688162076271187e-06, "loss": 1.8062, "mean_token_accuracy": 0.6177646592259407, "num_tokens": 7973737.0, "step": 9908 }, { "epoch": 2.624470338983051, "grad_norm": 1.4891715049743652, "learning_rate": 8.687897245762714e-06, "loss": 1.3273, "mean_token_accuracy": 0.6867557466030121, "num_tokens": 7975365.0, "step": 9910 }, { "epoch": 2.625, "grad_norm": 1.652782917022705, "learning_rate": 8.687632415254239e-06, "loss": 1.0913, "mean_token_accuracy": 0.7183214575052261, "num_tokens": 7977091.0, "step": 9912 }, { "epoch": 2.625529661016949, "grad_norm": 2.3115220069885254, "learning_rate": 8.687367584745764e-06, "loss": 1.6994, "mean_token_accuracy": 0.6410889253020287, "num_tokens": 7978586.0, "step": 9914 }, { "epoch": 2.6260593220338984, "grad_norm": 2.320840835571289, "learning_rate": 8.687102754237288e-06, "loss": 1.6909, "mean_token_accuracy": 0.6103508248925209, "num_tokens": 7980175.0, "step": 9916 }, { "epoch": 2.6265889830508473, "grad_norm": 1.6634786128997803, "learning_rate": 8.686837923728815e-06, "loss": 1.2364, "mean_token_accuracy": 0.7160895317792892, "num_tokens": 7981841.0, "step": 9918 }, { "epoch": 2.6271186440677967, "grad_norm": 1.4356310367584229, "learning_rate": 8.68657309322034e-06, "loss": 1.3659, "mean_token_accuracy": 0.721622034907341, "num_tokens": 7983690.0, "step": 9920 }, { "epoch": 2.6276483050847457, "grad_norm": 1.8747612237930298, "learning_rate": 8.686308262711865e-06, "loss": 1.2696, "mean_token_accuracy": 0.6955154165625572, "num_tokens": 7984884.0, "step": 9922 }, { "epoch": 2.628177966101695, "grad_norm": 2.205676317214966, "learning_rate": 8.68604343220339e-06, "loss": 1.7717, "mean_token_accuracy": 0.6307660937309265, "num_tokens": 7986248.0, "step": 9924 }, { "epoch": 2.628707627118644, "grad_norm": 1.3614487648010254, "learning_rate": 8.685778601694917e-06, "loss": 0.9954, "mean_token_accuracy": 0.7551280781626701, "num_tokens": 7987782.0, "step": 9926 }, { "epoch": 2.6292372881355934, "grad_norm": 1.5534251928329468, "learning_rate": 8.685513771186441e-06, "loss": 1.2272, "mean_token_accuracy": 0.6931020691990852, "num_tokens": 7989490.0, "step": 9928 }, { "epoch": 2.6297669491525424, "grad_norm": 1.416520357131958, "learning_rate": 8.685248940677966e-06, "loss": 1.3234, "mean_token_accuracy": 0.6906210854649544, "num_tokens": 7991201.0, "step": 9930 }, { "epoch": 2.6302966101694913, "grad_norm": 1.5200387239456177, "learning_rate": 8.684984110169491e-06, "loss": 0.9625, "mean_token_accuracy": 0.7732688188552856, "num_tokens": 7992474.0, "step": 9932 }, { "epoch": 2.6308262711864407, "grad_norm": 1.8477765321731567, "learning_rate": 8.684719279661018e-06, "loss": 1.2245, "mean_token_accuracy": 0.6973055228590965, "num_tokens": 7994114.0, "step": 9934 }, { "epoch": 2.6313559322033897, "grad_norm": 1.5293666124343872, "learning_rate": 8.684454449152543e-06, "loss": 1.0271, "mean_token_accuracy": 0.7612203285098076, "num_tokens": 7995675.0, "step": 9936 }, { "epoch": 2.631885593220339, "grad_norm": 1.6647652387619019, "learning_rate": 8.68418961864407e-06, "loss": 0.9306, "mean_token_accuracy": 0.7501338720321655, "num_tokens": 7997978.0, "step": 9938 }, { "epoch": 2.632415254237288, "grad_norm": 1.7218042612075806, "learning_rate": 8.683924788135594e-06, "loss": 1.2549, "mean_token_accuracy": 0.7114415988326073, "num_tokens": 7999455.0, "step": 9940 }, { "epoch": 2.632944915254237, "grad_norm": 1.7235755920410156, "learning_rate": 8.68365995762712e-06, "loss": 0.9857, "mean_token_accuracy": 0.7556494027376175, "num_tokens": 8000772.0, "step": 9942 }, { "epoch": 2.6334745762711864, "grad_norm": 1.877532720565796, "learning_rate": 8.683395127118644e-06, "loss": 1.1262, "mean_token_accuracy": 0.7429039403796196, "num_tokens": 8002093.0, "step": 9944 }, { "epoch": 2.634004237288136, "grad_norm": 1.7123682498931885, "learning_rate": 8.683130296610171e-06, "loss": 1.2232, "mean_token_accuracy": 0.7140936926007271, "num_tokens": 8004198.0, "step": 9946 }, { "epoch": 2.6345338983050848, "grad_norm": 2.132087230682373, "learning_rate": 8.682865466101696e-06, "loss": 1.6416, "mean_token_accuracy": 0.6495334729552269, "num_tokens": 8006475.0, "step": 9948 }, { "epoch": 2.6350635593220337, "grad_norm": 1.5462251901626587, "learning_rate": 8.68260063559322e-06, "loss": 1.0702, "mean_token_accuracy": 0.7467874214053154, "num_tokens": 8008103.0, "step": 9950 }, { "epoch": 2.635593220338983, "grad_norm": 2.132737874984741, "learning_rate": 8.682335805084746e-06, "loss": 1.4267, "mean_token_accuracy": 0.6678985022008419, "num_tokens": 8009860.0, "step": 9952 }, { "epoch": 2.636122881355932, "grad_norm": 1.9371551275253296, "learning_rate": 8.682070974576272e-06, "loss": 1.3778, "mean_token_accuracy": 0.6745811253786087, "num_tokens": 8011596.0, "step": 9954 }, { "epoch": 2.6366525423728815, "grad_norm": 1.7830733060836792, "learning_rate": 8.681806144067797e-06, "loss": 1.1472, "mean_token_accuracy": 0.7322601452469826, "num_tokens": 8013197.0, "step": 9956 }, { "epoch": 2.6371822033898304, "grad_norm": 1.534204363822937, "learning_rate": 8.681541313559322e-06, "loss": 1.1167, "mean_token_accuracy": 0.7706973925232887, "num_tokens": 8014943.0, "step": 9958 }, { "epoch": 2.6377118644067794, "grad_norm": 1.6295119524002075, "learning_rate": 8.681276483050847e-06, "loss": 1.1541, "mean_token_accuracy": 0.7184012159705162, "num_tokens": 8016699.0, "step": 9960 }, { "epoch": 2.638241525423729, "grad_norm": 1.7601940631866455, "learning_rate": 8.681011652542374e-06, "loss": 1.3631, "mean_token_accuracy": 0.6917049326002598, "num_tokens": 8018376.0, "step": 9962 }, { "epoch": 2.638771186440678, "grad_norm": 1.628603219985962, "learning_rate": 8.680746822033899e-06, "loss": 1.0362, "mean_token_accuracy": 0.7169595211744308, "num_tokens": 8020333.0, "step": 9964 }, { "epoch": 2.639300847457627, "grad_norm": 1.6433252096176147, "learning_rate": 8.680481991525425e-06, "loss": 1.1778, "mean_token_accuracy": 0.7359837964177132, "num_tokens": 8022096.0, "step": 9966 }, { "epoch": 2.639830508474576, "grad_norm": 1.835318684577942, "learning_rate": 8.68021716101695e-06, "loss": 1.1857, "mean_token_accuracy": 0.7059992179274559, "num_tokens": 8023421.0, "step": 9968 }, { "epoch": 2.6403601694915255, "grad_norm": 1.455686330795288, "learning_rate": 8.679952330508475e-06, "loss": 1.5269, "mean_token_accuracy": 0.6795151680707932, "num_tokens": 8025742.0, "step": 9970 }, { "epoch": 2.6408898305084745, "grad_norm": 1.7966327667236328, "learning_rate": 8.6796875e-06, "loss": 1.3764, "mean_token_accuracy": 0.6626833379268646, "num_tokens": 8027189.0, "step": 9972 }, { "epoch": 2.641419491525424, "grad_norm": 1.7383441925048828, "learning_rate": 8.679422669491527e-06, "loss": 1.271, "mean_token_accuracy": 0.7120005637407303, "num_tokens": 8028748.0, "step": 9974 }, { "epoch": 2.641949152542373, "grad_norm": 1.6671305894851685, "learning_rate": 8.679157838983052e-06, "loss": 1.1283, "mean_token_accuracy": 0.7320584207773209, "num_tokens": 8030243.0, "step": 9976 }, { "epoch": 2.642478813559322, "grad_norm": 1.9414584636688232, "learning_rate": 8.678893008474577e-06, "loss": 1.4467, "mean_token_accuracy": 0.6674944758415222, "num_tokens": 8031608.0, "step": 9978 }, { "epoch": 2.643008474576271, "grad_norm": 1.8759820461273193, "learning_rate": 8.678628177966101e-06, "loss": 1.4552, "mean_token_accuracy": 0.6672435626387596, "num_tokens": 8033168.0, "step": 9980 }, { "epoch": 2.6435381355932206, "grad_norm": 1.8095993995666504, "learning_rate": 8.678363347457628e-06, "loss": 1.3669, "mean_token_accuracy": 0.6883385628461838, "num_tokens": 8034631.0, "step": 9982 }, { "epoch": 2.6440677966101696, "grad_norm": 1.8282006978988647, "learning_rate": 8.678098516949153e-06, "loss": 1.3511, "mean_token_accuracy": 0.6886277198791504, "num_tokens": 8036028.0, "step": 9984 }, { "epoch": 2.6445974576271185, "grad_norm": 2.015841484069824, "learning_rate": 8.677833686440678e-06, "loss": 1.3634, "mean_token_accuracy": 0.6799472644925117, "num_tokens": 8037654.0, "step": 9986 }, { "epoch": 2.645127118644068, "grad_norm": 1.6981173753738403, "learning_rate": 8.677568855932203e-06, "loss": 1.8963, "mean_token_accuracy": 0.5799699872732162, "num_tokens": 8039841.0, "step": 9988 }, { "epoch": 2.645656779661017, "grad_norm": 1.6198561191558838, "learning_rate": 8.67730402542373e-06, "loss": 1.6563, "mean_token_accuracy": 0.6390976309776306, "num_tokens": 8041581.0, "step": 9990 }, { "epoch": 2.6461864406779663, "grad_norm": 1.709079623222351, "learning_rate": 8.677039194915256e-06, "loss": 1.3756, "mean_token_accuracy": 0.6694738790392876, "num_tokens": 8043145.0, "step": 9992 }, { "epoch": 2.6467161016949152, "grad_norm": 1.3514152765274048, "learning_rate": 8.676774364406781e-06, "loss": 0.844, "mean_token_accuracy": 0.7720257937908173, "num_tokens": 8044950.0, "step": 9994 }, { "epoch": 2.647245762711864, "grad_norm": 1.637879490852356, "learning_rate": 8.676509533898306e-06, "loss": 1.4835, "mean_token_accuracy": 0.6600528135895729, "num_tokens": 8046674.0, "step": 9996 }, { "epoch": 2.6477754237288136, "grad_norm": 1.77649986743927, "learning_rate": 8.676244703389831e-06, "loss": 1.2169, "mean_token_accuracy": 0.7141066938638687, "num_tokens": 8048388.0, "step": 9998 }, { "epoch": 2.648305084745763, "grad_norm": 1.5818227529525757, "learning_rate": 8.675979872881358e-06, "loss": 1.315, "step": 10000 }, { "epoch": 2.648305084745763, "eval_loss": 1.3123456239700317, "eval_mean_token_accuracy": 0.7001676556545419, "eval_num_tokens": 8050102.0, "eval_runtime": 48.3548, "eval_samples_per_second": 6.37, "eval_steps_per_second": 6.37, "step": 10000 }, { "epoch": 2.648834745762712, "grad_norm": 1.8027914762496948, "learning_rate": 8.675715042372882e-06, "loss": 1.5763, "mean_token_accuracy": 0.6867944784462452, "num_tokens": 8051600.0, "step": 10002 }, { "epoch": 2.649364406779661, "grad_norm": 1.7471154928207397, "learning_rate": 8.675450211864407e-06, "loss": 1.6171, "mean_token_accuracy": 0.6405051648616791, "num_tokens": 8053280.0, "step": 10004 }, { "epoch": 2.6498940677966103, "grad_norm": 1.8636581897735596, "learning_rate": 8.675185381355932e-06, "loss": 1.2221, "mean_token_accuracy": 0.7156940475106239, "num_tokens": 8054718.0, "step": 10006 }, { "epoch": 2.6504237288135593, "grad_norm": 2.2164864540100098, "learning_rate": 8.674920550847459e-06, "loss": 1.4509, "mean_token_accuracy": 0.6919041946530342, "num_tokens": 8056161.0, "step": 10008 }, { "epoch": 2.6509533898305087, "grad_norm": 2.0304644107818604, "learning_rate": 8.674655720338984e-06, "loss": 1.2318, "mean_token_accuracy": 0.7082960084080696, "num_tokens": 8057908.0, "step": 10010 }, { "epoch": 2.6514830508474576, "grad_norm": 1.707732081413269, "learning_rate": 8.674390889830509e-06, "loss": 1.4095, "mean_token_accuracy": 0.6881234422326088, "num_tokens": 8059494.0, "step": 10012 }, { "epoch": 2.6520127118644066, "grad_norm": 1.5070968866348267, "learning_rate": 8.674126059322034e-06, "loss": 1.0632, "mean_token_accuracy": 0.731853298842907, "num_tokens": 8060991.0, "step": 10014 }, { "epoch": 2.652542372881356, "grad_norm": 2.3127307891845703, "learning_rate": 8.67386122881356e-06, "loss": 1.8595, "mean_token_accuracy": 0.5936630144715309, "num_tokens": 8062418.0, "step": 10016 }, { "epoch": 2.653072033898305, "grad_norm": 1.674817442893982, "learning_rate": 8.673596398305085e-06, "loss": 1.5401, "mean_token_accuracy": 0.6593150496482849, "num_tokens": 8063950.0, "step": 10018 }, { "epoch": 2.6536016949152543, "grad_norm": 1.8106391429901123, "learning_rate": 8.673331567796612e-06, "loss": 0.96, "mean_token_accuracy": 0.7385157719254494, "num_tokens": 8066190.0, "step": 10020 }, { "epoch": 2.6541313559322033, "grad_norm": 1.5332050323486328, "learning_rate": 8.673066737288137e-06, "loss": 0.914, "mean_token_accuracy": 0.7789040058851242, "num_tokens": 8067654.0, "step": 10022 }, { "epoch": 2.6546610169491527, "grad_norm": 1.7706222534179688, "learning_rate": 8.672801906779662e-06, "loss": 1.2589, "mean_token_accuracy": 0.7226581946015358, "num_tokens": 8068990.0, "step": 10024 }, { "epoch": 2.6551906779661016, "grad_norm": 1.901845097541809, "learning_rate": 8.672537076271187e-06, "loss": 1.6671, "mean_token_accuracy": 0.6363421082496643, "num_tokens": 8070667.0, "step": 10026 }, { "epoch": 2.655720338983051, "grad_norm": 1.8086283206939697, "learning_rate": 8.672272245762713e-06, "loss": 1.4505, "mean_token_accuracy": 0.6982966512441635, "num_tokens": 8072111.0, "step": 10028 }, { "epoch": 2.65625, "grad_norm": 1.8507959842681885, "learning_rate": 8.672007415254238e-06, "loss": 1.7423, "mean_token_accuracy": 0.6299820393323898, "num_tokens": 8073956.0, "step": 10030 }, { "epoch": 2.656779661016949, "grad_norm": 1.390748143196106, "learning_rate": 8.671742584745763e-06, "loss": 1.0512, "mean_token_accuracy": 0.7446142211556435, "num_tokens": 8075505.0, "step": 10032 }, { "epoch": 2.6573093220338984, "grad_norm": 1.6738475561141968, "learning_rate": 8.671477754237288e-06, "loss": 1.5459, "mean_token_accuracy": 0.6386293172836304, "num_tokens": 8077384.0, "step": 10034 }, { "epoch": 2.6578389830508473, "grad_norm": 1.4712129831314087, "learning_rate": 8.671212923728815e-06, "loss": 1.0103, "mean_token_accuracy": 0.7751767337322235, "num_tokens": 8078868.0, "step": 10036 }, { "epoch": 2.6583686440677967, "grad_norm": 2.324885845184326, "learning_rate": 8.67094809322034e-06, "loss": 1.3804, "mean_token_accuracy": 0.7059327065944672, "num_tokens": 8080202.0, "step": 10038 }, { "epoch": 2.6588983050847457, "grad_norm": 1.8089245557785034, "learning_rate": 8.670683262711865e-06, "loss": 1.2233, "mean_token_accuracy": 0.7143255546689034, "num_tokens": 8081846.0, "step": 10040 }, { "epoch": 2.659427966101695, "grad_norm": 1.7064435482025146, "learning_rate": 8.67041843220339e-06, "loss": 1.2033, "mean_token_accuracy": 0.7222323343157768, "num_tokens": 8083267.0, "step": 10042 }, { "epoch": 2.659957627118644, "grad_norm": 1.8220380544662476, "learning_rate": 8.670153601694916e-06, "loss": 1.5658, "mean_token_accuracy": 0.6575572527945042, "num_tokens": 8084862.0, "step": 10044 }, { "epoch": 2.6604872881355934, "grad_norm": 1.8451766967773438, "learning_rate": 8.669888771186441e-06, "loss": 1.4907, "mean_token_accuracy": 0.7010940611362457, "num_tokens": 8086366.0, "step": 10046 }, { "epoch": 2.6610169491525424, "grad_norm": 1.4824535846710205, "learning_rate": 8.669623940677968e-06, "loss": 1.1847, "mean_token_accuracy": 0.7311671376228333, "num_tokens": 8088047.0, "step": 10048 }, { "epoch": 2.6615466101694913, "grad_norm": 2.037848949432373, "learning_rate": 8.669359110169493e-06, "loss": 1.6308, "mean_token_accuracy": 0.6522414609789848, "num_tokens": 8089563.0, "step": 10050 }, { "epoch": 2.6620762711864407, "grad_norm": 1.2881981134414673, "learning_rate": 8.669094279661018e-06, "loss": 1.181, "mean_token_accuracy": 0.7120140343904495, "num_tokens": 8091727.0, "step": 10052 }, { "epoch": 2.6626059322033897, "grad_norm": 2.0453941822052, "learning_rate": 8.668829449152542e-06, "loss": 1.6972, "mean_token_accuracy": 0.643968366086483, "num_tokens": 8092993.0, "step": 10054 }, { "epoch": 2.663135593220339, "grad_norm": 1.2333197593688965, "learning_rate": 8.668564618644069e-06, "loss": 0.7682, "mean_token_accuracy": 0.8145990520715714, "num_tokens": 8094474.0, "step": 10056 }, { "epoch": 2.663665254237288, "grad_norm": 1.4749712944030762, "learning_rate": 8.668299788135594e-06, "loss": 1.4021, "mean_token_accuracy": 0.6873382031917572, "num_tokens": 8096119.0, "step": 10058 }, { "epoch": 2.664194915254237, "grad_norm": 2.0292680263519287, "learning_rate": 8.668034957627119e-06, "loss": 1.4743, "mean_token_accuracy": 0.6866162866353989, "num_tokens": 8097464.0, "step": 10060 }, { "epoch": 2.6647245762711864, "grad_norm": 1.904737949371338, "learning_rate": 8.667770127118644e-06, "loss": 1.5572, "mean_token_accuracy": 0.6759843230247498, "num_tokens": 8099010.0, "step": 10062 }, { "epoch": 2.665254237288136, "grad_norm": 1.8962454795837402, "learning_rate": 8.66750529661017e-06, "loss": 1.3487, "mean_token_accuracy": 0.6973877102136612, "num_tokens": 8100696.0, "step": 10064 }, { "epoch": 2.6657838983050848, "grad_norm": 1.9229329824447632, "learning_rate": 8.667240466101695e-06, "loss": 1.3076, "mean_token_accuracy": 0.7070222645998001, "num_tokens": 8102298.0, "step": 10066 }, { "epoch": 2.6663135593220337, "grad_norm": 1.5151307582855225, "learning_rate": 8.66697563559322e-06, "loss": 0.967, "mean_token_accuracy": 0.7819297537207603, "num_tokens": 8103805.0, "step": 10068 }, { "epoch": 2.666843220338983, "grad_norm": 1.361910104751587, "learning_rate": 8.666710805084745e-06, "loss": 1.1247, "mean_token_accuracy": 0.7563208788633347, "num_tokens": 8105840.0, "step": 10070 }, { "epoch": 2.667372881355932, "grad_norm": 1.6553412675857544, "learning_rate": 8.666445974576272e-06, "loss": 1.1335, "mean_token_accuracy": 0.7464451193809509, "num_tokens": 8107498.0, "step": 10072 }, { "epoch": 2.6679025423728815, "grad_norm": 1.7162691354751587, "learning_rate": 8.666181144067799e-06, "loss": 1.2525, "mean_token_accuracy": 0.7028416022658348, "num_tokens": 8109047.0, "step": 10074 }, { "epoch": 2.6684322033898304, "grad_norm": 1.9415552616119385, "learning_rate": 8.665916313559323e-06, "loss": 1.6053, "mean_token_accuracy": 0.628619559109211, "num_tokens": 8110627.0, "step": 10076 }, { "epoch": 2.6689618644067794, "grad_norm": 1.8681343793869019, "learning_rate": 8.665651483050848e-06, "loss": 0.932, "mean_token_accuracy": 0.7723725810647011, "num_tokens": 8112295.0, "step": 10078 }, { "epoch": 2.669491525423729, "grad_norm": 1.863884449005127, "learning_rate": 8.665386652542373e-06, "loss": 1.286, "mean_token_accuracy": 0.6993971019983292, "num_tokens": 8113907.0, "step": 10080 }, { "epoch": 2.670021186440678, "grad_norm": 1.231930136680603, "learning_rate": 8.6651218220339e-06, "loss": 1.072, "mean_token_accuracy": 0.7501471117138863, "num_tokens": 8115622.0, "step": 10082 }, { "epoch": 2.670550847457627, "grad_norm": 1.8023165464401245, "learning_rate": 8.664856991525425e-06, "loss": 1.4792, "mean_token_accuracy": 0.6794123873114586, "num_tokens": 8117155.0, "step": 10084 }, { "epoch": 2.671080508474576, "grad_norm": 1.5931504964828491, "learning_rate": 8.66459216101695e-06, "loss": 1.4647, "mean_token_accuracy": 0.674685038626194, "num_tokens": 8118733.0, "step": 10086 }, { "epoch": 2.6716101694915255, "grad_norm": 2.002429246902466, "learning_rate": 8.664327330508475e-06, "loss": 1.1896, "mean_token_accuracy": 0.7332537770271301, "num_tokens": 8120190.0, "step": 10088 }, { "epoch": 2.6721398305084745, "grad_norm": 1.323751449584961, "learning_rate": 8.664062500000001e-06, "loss": 0.9532, "mean_token_accuracy": 0.7698959335684776, "num_tokens": 8121884.0, "step": 10090 }, { "epoch": 2.672669491525424, "grad_norm": 1.944571852684021, "learning_rate": 8.663797669491526e-06, "loss": 1.4407, "mean_token_accuracy": 0.6748309694230556, "num_tokens": 8123230.0, "step": 10092 }, { "epoch": 2.673199152542373, "grad_norm": 1.951047420501709, "learning_rate": 8.663532838983051e-06, "loss": 1.5618, "mean_token_accuracy": 0.660735234618187, "num_tokens": 8124926.0, "step": 10094 }, { "epoch": 2.673728813559322, "grad_norm": 1.7831565141677856, "learning_rate": 8.663268008474576e-06, "loss": 1.4828, "mean_token_accuracy": 0.6703258454799652, "num_tokens": 8126386.0, "step": 10096 }, { "epoch": 2.674258474576271, "grad_norm": 1.6234439611434937, "learning_rate": 8.663003177966103e-06, "loss": 1.2742, "mean_token_accuracy": 0.7037405669689178, "num_tokens": 8127899.0, "step": 10098 }, { "epoch": 2.6747881355932206, "grad_norm": 1.8896981477737427, "learning_rate": 8.662738347457628e-06, "loss": 1.5225, "mean_token_accuracy": 0.6623575761914253, "num_tokens": 8129356.0, "step": 10100 }, { "epoch": 2.6753177966101696, "grad_norm": 1.7396423816680908, "learning_rate": 8.662473516949154e-06, "loss": 1.2829, "mean_token_accuracy": 0.6961225867271423, "num_tokens": 8130931.0, "step": 10102 }, { "epoch": 2.6758474576271185, "grad_norm": 1.596670389175415, "learning_rate": 8.66220868644068e-06, "loss": 1.0905, "mean_token_accuracy": 0.7388087213039398, "num_tokens": 8132363.0, "step": 10104 }, { "epoch": 2.676377118644068, "grad_norm": 1.7805086374282837, "learning_rate": 8.661943855932204e-06, "loss": 1.271, "mean_token_accuracy": 0.7145341522991657, "num_tokens": 8133997.0, "step": 10106 }, { "epoch": 2.676906779661017, "grad_norm": 1.6971633434295654, "learning_rate": 8.661679025423729e-06, "loss": 1.1923, "mean_token_accuracy": 0.7207980155944824, "num_tokens": 8135650.0, "step": 10108 }, { "epoch": 2.6774364406779663, "grad_norm": 1.8784403800964355, "learning_rate": 8.661414194915256e-06, "loss": 1.727, "mean_token_accuracy": 0.6316244378685951, "num_tokens": 8137433.0, "step": 10110 }, { "epoch": 2.6779661016949152, "grad_norm": 1.9864076375961304, "learning_rate": 8.66114936440678e-06, "loss": 1.7001, "mean_token_accuracy": 0.636248417198658, "num_tokens": 8138875.0, "step": 10112 }, { "epoch": 2.678495762711864, "grad_norm": 2.145434617996216, "learning_rate": 8.660884533898306e-06, "loss": 1.7895, "mean_token_accuracy": 0.6146926879882812, "num_tokens": 8140504.0, "step": 10114 }, { "epoch": 2.6790254237288136, "grad_norm": 1.4851078987121582, "learning_rate": 8.66061970338983e-06, "loss": 1.1273, "mean_token_accuracy": 0.7624158039689064, "num_tokens": 8142007.0, "step": 10116 }, { "epoch": 2.679555084745763, "grad_norm": 1.5609525442123413, "learning_rate": 8.660354872881357e-06, "loss": 1.046, "mean_token_accuracy": 0.7242034077644348, "num_tokens": 8143795.0, "step": 10118 }, { "epoch": 2.680084745762712, "grad_norm": 1.3911362886428833, "learning_rate": 8.660090042372882e-06, "loss": 1.2698, "mean_token_accuracy": 0.6899825818836689, "num_tokens": 8145586.0, "step": 10120 }, { "epoch": 2.680614406779661, "grad_norm": 1.9974079132080078, "learning_rate": 8.659825211864407e-06, "loss": 1.3447, "mean_token_accuracy": 0.6972046419978142, "num_tokens": 8147050.0, "step": 10122 }, { "epoch": 2.6811440677966103, "grad_norm": 1.351920247077942, "learning_rate": 8.659560381355932e-06, "loss": 0.9657, "mean_token_accuracy": 0.7659417912364006, "num_tokens": 8148715.0, "step": 10124 }, { "epoch": 2.6816737288135593, "grad_norm": 1.3563190698623657, "learning_rate": 8.659295550847459e-06, "loss": 1.0834, "mean_token_accuracy": 0.7422539815306664, "num_tokens": 8150659.0, "step": 10126 }, { "epoch": 2.6822033898305087, "grad_norm": 1.8842839002609253, "learning_rate": 8.659030720338984e-06, "loss": 1.206, "mean_token_accuracy": 0.7126815915107727, "num_tokens": 8152104.0, "step": 10128 }, { "epoch": 2.6827330508474576, "grad_norm": 1.5475437641143799, "learning_rate": 8.65876588983051e-06, "loss": 1.386, "mean_token_accuracy": 0.672192670404911, "num_tokens": 8153836.0, "step": 10130 }, { "epoch": 2.6832627118644066, "grad_norm": 2.142561435699463, "learning_rate": 8.658501059322035e-06, "loss": 1.2802, "mean_token_accuracy": 0.714536190032959, "num_tokens": 8155190.0, "step": 10132 }, { "epoch": 2.683792372881356, "grad_norm": 1.9095317125320435, "learning_rate": 8.65823622881356e-06, "loss": 1.3661, "mean_token_accuracy": 0.6940885558724403, "num_tokens": 8156731.0, "step": 10134 }, { "epoch": 2.684322033898305, "grad_norm": 1.7324143648147583, "learning_rate": 8.657971398305085e-06, "loss": 1.7671, "mean_token_accuracy": 0.6339847296476364, "num_tokens": 8158275.0, "step": 10136 }, { "epoch": 2.6848516949152543, "grad_norm": 1.6886128187179565, "learning_rate": 8.657706567796612e-06, "loss": 1.2788, "mean_token_accuracy": 0.6995366513729095, "num_tokens": 8159813.0, "step": 10138 }, { "epoch": 2.6853813559322033, "grad_norm": 1.8977638483047485, "learning_rate": 8.657441737288136e-06, "loss": 1.4365, "mean_token_accuracy": 0.6721867099404335, "num_tokens": 8161640.0, "step": 10140 }, { "epoch": 2.6859110169491527, "grad_norm": 1.7263157367706299, "learning_rate": 8.657176906779661e-06, "loss": 1.3317, "mean_token_accuracy": 0.7156793624162674, "num_tokens": 8163057.0, "step": 10142 }, { "epoch": 2.6864406779661016, "grad_norm": 2.163478136062622, "learning_rate": 8.656912076271186e-06, "loss": 1.0882, "mean_token_accuracy": 0.7392285987734795, "num_tokens": 8164483.0, "step": 10144 }, { "epoch": 2.686970338983051, "grad_norm": 1.7516371011734009, "learning_rate": 8.656647245762713e-06, "loss": 1.3002, "mean_token_accuracy": 0.6876059621572495, "num_tokens": 8166138.0, "step": 10146 }, { "epoch": 2.6875, "grad_norm": 1.991040587425232, "learning_rate": 8.656382415254238e-06, "loss": 1.062, "mean_token_accuracy": 0.7427110522985458, "num_tokens": 8167661.0, "step": 10148 }, { "epoch": 2.688029661016949, "grad_norm": 1.8445435762405396, "learning_rate": 8.656117584745763e-06, "loss": 1.2309, "mean_token_accuracy": 0.7249437719583511, "num_tokens": 8169266.0, "step": 10150 }, { "epoch": 2.6885593220338984, "grad_norm": 1.66487455368042, "learning_rate": 8.655852754237288e-06, "loss": 1.3723, "mean_token_accuracy": 0.6862817145884037, "num_tokens": 8171264.0, "step": 10152 }, { "epoch": 2.6890889830508473, "grad_norm": 1.4558682441711426, "learning_rate": 8.655587923728814e-06, "loss": 1.1292, "mean_token_accuracy": 0.7504077032208443, "num_tokens": 8173038.0, "step": 10154 }, { "epoch": 2.6896186440677967, "grad_norm": 1.8304743766784668, "learning_rate": 8.65532309322034e-06, "loss": 1.6229, "mean_token_accuracy": 0.6570954695343971, "num_tokens": 8174640.0, "step": 10156 }, { "epoch": 2.6901483050847457, "grad_norm": 2.0529727935791016, "learning_rate": 8.655058262711866e-06, "loss": 1.5176, "mean_token_accuracy": 0.659812718629837, "num_tokens": 8176317.0, "step": 10158 }, { "epoch": 2.690677966101695, "grad_norm": 1.8724563121795654, "learning_rate": 8.65479343220339e-06, "loss": 1.4219, "mean_token_accuracy": 0.6710312515497208, "num_tokens": 8178216.0, "step": 10160 }, { "epoch": 2.691207627118644, "grad_norm": 2.0367519855499268, "learning_rate": 8.654528601694916e-06, "loss": 1.6689, "mean_token_accuracy": 0.6429261714220047, "num_tokens": 8179730.0, "step": 10162 }, { "epoch": 2.6917372881355934, "grad_norm": 1.634157419204712, "learning_rate": 8.654263771186442e-06, "loss": 1.2957, "mean_token_accuracy": 0.7077913209795952, "num_tokens": 8181331.0, "step": 10164 }, { "epoch": 2.6922669491525424, "grad_norm": 2.3667070865631104, "learning_rate": 8.653998940677967e-06, "loss": 0.8948, "mean_token_accuracy": 0.784240335226059, "num_tokens": 8182499.0, "step": 10166 }, { "epoch": 2.6927966101694913, "grad_norm": 2.144083261489868, "learning_rate": 8.653734110169492e-06, "loss": 1.3295, "mean_token_accuracy": 0.6790506988763809, "num_tokens": 8184294.0, "step": 10168 }, { "epoch": 2.6933262711864407, "grad_norm": 1.4105370044708252, "learning_rate": 8.653469279661017e-06, "loss": 1.1451, "mean_token_accuracy": 0.7225934937596321, "num_tokens": 8186214.0, "step": 10170 }, { "epoch": 2.6938559322033897, "grad_norm": 2.126042127609253, "learning_rate": 8.653204449152544e-06, "loss": 1.4393, "mean_token_accuracy": 0.6697066873311996, "num_tokens": 8187662.0, "step": 10172 }, { "epoch": 2.694385593220339, "grad_norm": 1.8519244194030762, "learning_rate": 8.652939618644069e-06, "loss": 1.0663, "mean_token_accuracy": 0.732056275010109, "num_tokens": 8189395.0, "step": 10174 }, { "epoch": 2.694915254237288, "grad_norm": 1.7492729425430298, "learning_rate": 8.652674788135594e-06, "loss": 1.6738, "mean_token_accuracy": 0.6456721052527428, "num_tokens": 8190876.0, "step": 10176 }, { "epoch": 2.695444915254237, "grad_norm": 1.9363007545471191, "learning_rate": 8.652409957627119e-06, "loss": 1.5769, "mean_token_accuracy": 0.671797513961792, "num_tokens": 8192382.0, "step": 10178 }, { "epoch": 2.6959745762711864, "grad_norm": 1.7320387363433838, "learning_rate": 8.652145127118645e-06, "loss": 1.45, "mean_token_accuracy": 0.6821109503507614, "num_tokens": 8193957.0, "step": 10180 }, { "epoch": 2.696504237288136, "grad_norm": 1.674831509590149, "learning_rate": 8.65188029661017e-06, "loss": 1.3546, "mean_token_accuracy": 0.7016747668385506, "num_tokens": 8195638.0, "step": 10182 }, { "epoch": 2.6970338983050848, "grad_norm": 2.1577014923095703, "learning_rate": 8.651615466101697e-06, "loss": 1.5281, "mean_token_accuracy": 0.6719071567058563, "num_tokens": 8197144.0, "step": 10184 }, { "epoch": 2.6975635593220337, "grad_norm": 1.8956934213638306, "learning_rate": 8.651350635593222e-06, "loss": 1.4762, "mean_token_accuracy": 0.6844452917575836, "num_tokens": 8198592.0, "step": 10186 }, { "epoch": 2.698093220338983, "grad_norm": 1.7429332733154297, "learning_rate": 8.651085805084747e-06, "loss": 1.1809, "mean_token_accuracy": 0.6900874301791191, "num_tokens": 8200246.0, "step": 10188 }, { "epoch": 2.698622881355932, "grad_norm": 1.6370972394943237, "learning_rate": 8.650820974576272e-06, "loss": 0.8211, "mean_token_accuracy": 0.7725675329566002, "num_tokens": 8201639.0, "step": 10190 }, { "epoch": 2.6991525423728815, "grad_norm": 1.6732827425003052, "learning_rate": 8.650556144067798e-06, "loss": 1.1311, "mean_token_accuracy": 0.7325625121593475, "num_tokens": 8203237.0, "step": 10192 }, { "epoch": 2.6996822033898304, "grad_norm": 1.8444371223449707, "learning_rate": 8.650291313559323e-06, "loss": 1.4964, "mean_token_accuracy": 0.6539380550384521, "num_tokens": 8204899.0, "step": 10194 }, { "epoch": 2.7002118644067794, "grad_norm": 1.668429970741272, "learning_rate": 8.650026483050848e-06, "loss": 1.4747, "mean_token_accuracy": 0.7144151702523232, "num_tokens": 8206644.0, "step": 10196 }, { "epoch": 2.700741525423729, "grad_norm": 1.6821726560592651, "learning_rate": 8.649761652542373e-06, "loss": 1.2956, "mean_token_accuracy": 0.6942304521799088, "num_tokens": 8208094.0, "step": 10198 }, { "epoch": 2.701271186440678, "grad_norm": 1.652382254600525, "learning_rate": 8.6494968220339e-06, "loss": 1.1053, "mean_token_accuracy": 0.7249520421028137, "num_tokens": 8209750.0, "step": 10200 }, { "epoch": 2.701800847457627, "grad_norm": 1.572172999382019, "learning_rate": 8.649231991525425e-06, "loss": 0.7108, "mean_token_accuracy": 0.8110998496413231, "num_tokens": 8211565.0, "step": 10202 }, { "epoch": 2.702330508474576, "grad_norm": 1.511348009109497, "learning_rate": 8.64896716101695e-06, "loss": 1.1151, "mean_token_accuracy": 0.7294414564967155, "num_tokens": 8212977.0, "step": 10204 }, { "epoch": 2.7028601694915255, "grad_norm": 1.6675728559494019, "learning_rate": 8.648702330508474e-06, "loss": 1.6772, "mean_token_accuracy": 0.6544632464647293, "num_tokens": 8214762.0, "step": 10206 }, { "epoch": 2.7033898305084745, "grad_norm": 1.8502601385116577, "learning_rate": 8.648437500000001e-06, "loss": 1.0438, "mean_token_accuracy": 0.7626761719584465, "num_tokens": 8216286.0, "step": 10208 }, { "epoch": 2.703919491525424, "grad_norm": 2.1895434856414795, "learning_rate": 8.648172669491526e-06, "loss": 1.3559, "mean_token_accuracy": 0.6892651543021202, "num_tokens": 8217652.0, "step": 10210 }, { "epoch": 2.704449152542373, "grad_norm": 1.8285088539123535, "learning_rate": 8.647907838983053e-06, "loss": 1.2724, "mean_token_accuracy": 0.7071131691336632, "num_tokens": 8219020.0, "step": 10212 }, { "epoch": 2.704978813559322, "grad_norm": 1.6467108726501465, "learning_rate": 8.647643008474576e-06, "loss": 1.016, "mean_token_accuracy": 0.7516909018158913, "num_tokens": 8220330.0, "step": 10214 }, { "epoch": 2.705508474576271, "grad_norm": 1.8293668031692505, "learning_rate": 8.647378177966102e-06, "loss": 1.0614, "mean_token_accuracy": 0.7561639472842216, "num_tokens": 8221793.0, "step": 10216 }, { "epoch": 2.7060381355932206, "grad_norm": 1.3431679010391235, "learning_rate": 8.647113347457627e-06, "loss": 0.922, "mean_token_accuracy": 0.7842161878943443, "num_tokens": 8223673.0, "step": 10218 }, { "epoch": 2.7065677966101696, "grad_norm": 1.7094465494155884, "learning_rate": 8.646848516949154e-06, "loss": 1.3611, "mean_token_accuracy": 0.7005020156502724, "num_tokens": 8225455.0, "step": 10220 }, { "epoch": 2.7070974576271185, "grad_norm": 1.8729097843170166, "learning_rate": 8.646583686440679e-06, "loss": 1.3126, "mean_token_accuracy": 0.7110308259725571, "num_tokens": 8226844.0, "step": 10222 }, { "epoch": 2.707627118644068, "grad_norm": 1.8721227645874023, "learning_rate": 8.646318855932204e-06, "loss": 1.3771, "mean_token_accuracy": 0.6721112728118896, "num_tokens": 8228468.0, "step": 10224 }, { "epoch": 2.708156779661017, "grad_norm": 1.6130671501159668, "learning_rate": 8.646054025423729e-06, "loss": 1.345, "mean_token_accuracy": 0.7133773863315582, "num_tokens": 8230062.0, "step": 10226 }, { "epoch": 2.7086864406779663, "grad_norm": 1.5979636907577515, "learning_rate": 8.645789194915255e-06, "loss": 1.0411, "mean_token_accuracy": 0.7363651245832443, "num_tokens": 8231807.0, "step": 10228 }, { "epoch": 2.7092161016949152, "grad_norm": 1.34623122215271, "learning_rate": 8.64552436440678e-06, "loss": 0.8643, "mean_token_accuracy": 0.7947681322693825, "num_tokens": 8233485.0, "step": 10230 }, { "epoch": 2.709745762711864, "grad_norm": 1.9672139883041382, "learning_rate": 8.645259533898305e-06, "loss": 1.5608, "mean_token_accuracy": 0.6544933393597603, "num_tokens": 8235170.0, "step": 10232 }, { "epoch": 2.7102754237288136, "grad_norm": 1.821258544921875, "learning_rate": 8.64499470338983e-06, "loss": 1.4336, "mean_token_accuracy": 0.7148740738630295, "num_tokens": 8236782.0, "step": 10234 }, { "epoch": 2.710805084745763, "grad_norm": 2.1621170043945312, "learning_rate": 8.644729872881357e-06, "loss": 1.336, "mean_token_accuracy": 0.707102045416832, "num_tokens": 8238227.0, "step": 10236 }, { "epoch": 2.711334745762712, "grad_norm": 2.0732181072235107, "learning_rate": 8.644465042372882e-06, "loss": 1.2121, "mean_token_accuracy": 0.7388740815222263, "num_tokens": 8239829.0, "step": 10238 }, { "epoch": 2.711864406779661, "grad_norm": 1.3853341341018677, "learning_rate": 8.644200211864408e-06, "loss": 1.177, "mean_token_accuracy": 0.7377530857920647, "num_tokens": 8241413.0, "step": 10240 }, { "epoch": 2.7123940677966103, "grad_norm": 2.0264101028442383, "learning_rate": 8.643935381355933e-06, "loss": 1.3103, "mean_token_accuracy": 0.6975258737802505, "num_tokens": 8243100.0, "step": 10242 }, { "epoch": 2.7129237288135593, "grad_norm": 1.7508665323257446, "learning_rate": 8.643670550847458e-06, "loss": 1.4802, "mean_token_accuracy": 0.6522830724716187, "num_tokens": 8244675.0, "step": 10244 }, { "epoch": 2.7134533898305087, "grad_norm": 1.6847063302993774, "learning_rate": 8.643405720338985e-06, "loss": 1.3526, "mean_token_accuracy": 0.6999415084719658, "num_tokens": 8246325.0, "step": 10246 }, { "epoch": 2.7139830508474576, "grad_norm": 1.8825517892837524, "learning_rate": 8.64314088983051e-06, "loss": 1.4049, "mean_token_accuracy": 0.6895292326807976, "num_tokens": 8247789.0, "step": 10248 }, { "epoch": 2.7145127118644066, "grad_norm": 1.1869351863861084, "learning_rate": 8.642876059322035e-06, "loss": 1.073, "step": 10250 }, { "epoch": 2.7145127118644066, "eval_loss": 1.3094878196716309, "eval_mean_token_accuracy": 0.7008903845177068, "eval_num_tokens": 8249892.0, "eval_runtime": 48.3498, "eval_samples_per_second": 6.37, "eval_steps_per_second": 6.37, "step": 10250 }, { "epoch": 2.715042372881356, "grad_norm": 1.3522711992263794, "learning_rate": 8.64261122881356e-06, "loss": 1.0917, "mean_token_accuracy": 0.7322142608463764, "num_tokens": 8251677.0, "step": 10252 }, { "epoch": 2.715572033898305, "grad_norm": 1.9111194610595703, "learning_rate": 8.642346398305086e-06, "loss": 1.2538, "mean_token_accuracy": 0.7161968275904655, "num_tokens": 8252959.0, "step": 10254 }, { "epoch": 2.7161016949152543, "grad_norm": 1.799626350402832, "learning_rate": 8.642081567796611e-06, "loss": 1.3043, "mean_token_accuracy": 0.6785641983151436, "num_tokens": 8254651.0, "step": 10256 }, { "epoch": 2.7166313559322033, "grad_norm": 1.4237042665481567, "learning_rate": 8.641816737288136e-06, "loss": 1.3546, "mean_token_accuracy": 0.723558209836483, "num_tokens": 8257147.0, "step": 10258 }, { "epoch": 2.7171610169491527, "grad_norm": 1.5936977863311768, "learning_rate": 8.641551906779661e-06, "loss": 1.1493, "mean_token_accuracy": 0.7249807417392731, "num_tokens": 8258731.0, "step": 10260 }, { "epoch": 2.7176906779661016, "grad_norm": 1.849977970123291, "learning_rate": 8.641287076271188e-06, "loss": 1.4629, "mean_token_accuracy": 0.6516211554408073, "num_tokens": 8260468.0, "step": 10262 }, { "epoch": 2.718220338983051, "grad_norm": 1.3722707033157349, "learning_rate": 8.641022245762713e-06, "loss": 0.8927, "mean_token_accuracy": 0.7709872797131538, "num_tokens": 8262098.0, "step": 10264 }, { "epoch": 2.71875, "grad_norm": 1.9846515655517578, "learning_rate": 8.64075741525424e-06, "loss": 1.3317, "mean_token_accuracy": 0.6911876052618027, "num_tokens": 8263496.0, "step": 10266 }, { "epoch": 2.719279661016949, "grad_norm": 1.8250303268432617, "learning_rate": 8.640492584745762e-06, "loss": 1.4163, "mean_token_accuracy": 0.7024379819631577, "num_tokens": 8265226.0, "step": 10268 }, { "epoch": 2.7198093220338984, "grad_norm": 1.5991462469100952, "learning_rate": 8.640227754237289e-06, "loss": 1.1066, "mean_token_accuracy": 0.7546052634716034, "num_tokens": 8266738.0, "step": 10270 }, { "epoch": 2.7203389830508473, "grad_norm": 1.8122678995132446, "learning_rate": 8.639962923728814e-06, "loss": 1.5877, "mean_token_accuracy": 0.6708870157599449, "num_tokens": 8268383.0, "step": 10272 }, { "epoch": 2.7208686440677967, "grad_norm": 1.3602997064590454, "learning_rate": 8.63969809322034e-06, "loss": 1.3149, "mean_token_accuracy": 0.7012809216976166, "num_tokens": 8270155.0, "step": 10274 }, { "epoch": 2.7213983050847457, "grad_norm": 1.4780468940734863, "learning_rate": 8.639433262711866e-06, "loss": 1.2895, "mean_token_accuracy": 0.7157647907733917, "num_tokens": 8271759.0, "step": 10276 }, { "epoch": 2.721927966101695, "grad_norm": 1.4898617267608643, "learning_rate": 8.63916843220339e-06, "loss": 0.7737, "mean_token_accuracy": 0.7989837676286697, "num_tokens": 8273306.0, "step": 10278 }, { "epoch": 2.722457627118644, "grad_norm": 1.6520252227783203, "learning_rate": 8.638903601694915e-06, "loss": 1.1293, "mean_token_accuracy": 0.7321290522813797, "num_tokens": 8274838.0, "step": 10280 }, { "epoch": 2.7229872881355934, "grad_norm": 1.6954867839813232, "learning_rate": 8.638638771186442e-06, "loss": 1.2084, "mean_token_accuracy": 0.6946392506361008, "num_tokens": 8276584.0, "step": 10282 }, { "epoch": 2.7235169491525424, "grad_norm": 1.3739168643951416, "learning_rate": 8.638373940677967e-06, "loss": 1.1561, "mean_token_accuracy": 0.7361782714724541, "num_tokens": 8278077.0, "step": 10284 }, { "epoch": 2.7240466101694913, "grad_norm": 1.7030850648880005, "learning_rate": 8.638109110169492e-06, "loss": 1.608, "mean_token_accuracy": 0.6627460196614265, "num_tokens": 8279864.0, "step": 10286 }, { "epoch": 2.7245762711864407, "grad_norm": 1.7851486206054688, "learning_rate": 8.637844279661017e-06, "loss": 0.9981, "mean_token_accuracy": 0.7478209286928177, "num_tokens": 8281359.0, "step": 10288 }, { "epoch": 2.7251059322033897, "grad_norm": 1.6007646322250366, "learning_rate": 8.637579449152543e-06, "loss": 1.5802, "mean_token_accuracy": 0.6644623875617981, "num_tokens": 8283066.0, "step": 10290 }, { "epoch": 2.725635593220339, "grad_norm": 1.6708043813705444, "learning_rate": 8.637314618644068e-06, "loss": 1.362, "mean_token_accuracy": 0.6727028340101242, "num_tokens": 8284516.0, "step": 10292 }, { "epoch": 2.726165254237288, "grad_norm": 1.43290114402771, "learning_rate": 8.637049788135595e-06, "loss": 1.0118, "mean_token_accuracy": 0.7444554716348648, "num_tokens": 8286123.0, "step": 10294 }, { "epoch": 2.726694915254237, "grad_norm": 1.667506217956543, "learning_rate": 8.636784957627118e-06, "loss": 1.2087, "mean_token_accuracy": 0.7161686047911644, "num_tokens": 8287785.0, "step": 10296 }, { "epoch": 2.7272245762711864, "grad_norm": 1.290027379989624, "learning_rate": 8.636520127118645e-06, "loss": 0.9584, "mean_token_accuracy": 0.7595606371760368, "num_tokens": 8289521.0, "step": 10298 }, { "epoch": 2.727754237288136, "grad_norm": 1.7049434185028076, "learning_rate": 8.63625529661017e-06, "loss": 1.426, "mean_token_accuracy": 0.708835132420063, "num_tokens": 8291073.0, "step": 10300 }, { "epoch": 2.7282838983050848, "grad_norm": 1.324731707572937, "learning_rate": 8.635990466101696e-06, "loss": 1.0312, "mean_token_accuracy": 0.7560704424977303, "num_tokens": 8292858.0, "step": 10302 }, { "epoch": 2.7288135593220337, "grad_norm": 1.28013014793396, "learning_rate": 8.635725635593221e-06, "loss": 0.7109, "mean_token_accuracy": 0.8230857104063034, "num_tokens": 8294185.0, "step": 10304 }, { "epoch": 2.729343220338983, "grad_norm": 2.027247905731201, "learning_rate": 8.635460805084746e-06, "loss": 1.4528, "mean_token_accuracy": 0.700432576239109, "num_tokens": 8295710.0, "step": 10306 }, { "epoch": 2.729872881355932, "grad_norm": 1.5408109426498413, "learning_rate": 8.635195974576271e-06, "loss": 1.2904, "mean_token_accuracy": 0.6923792436718941, "num_tokens": 8297482.0, "step": 10308 }, { "epoch": 2.7304025423728815, "grad_norm": 1.7435261011123657, "learning_rate": 8.634931144067798e-06, "loss": 1.0979, "mean_token_accuracy": 0.7386001497507095, "num_tokens": 8299030.0, "step": 10310 }, { "epoch": 2.7309322033898304, "grad_norm": 2.1028695106506348, "learning_rate": 8.634666313559323e-06, "loss": 0.9826, "mean_token_accuracy": 0.7600930854678154, "num_tokens": 8300363.0, "step": 10312 }, { "epoch": 2.7314618644067794, "grad_norm": 1.696471095085144, "learning_rate": 8.634401483050848e-06, "loss": 1.5052, "mean_token_accuracy": 0.6650054752826691, "num_tokens": 8302000.0, "step": 10314 }, { "epoch": 2.731991525423729, "grad_norm": 1.4681808948516846, "learning_rate": 8.634136652542373e-06, "loss": 0.8202, "mean_token_accuracy": 0.7834112867712975, "num_tokens": 8303611.0, "step": 10316 }, { "epoch": 2.732521186440678, "grad_norm": 1.572788119316101, "learning_rate": 8.6338718220339e-06, "loss": 1.3531, "mean_token_accuracy": 0.6882327571511269, "num_tokens": 8305301.0, "step": 10318 }, { "epoch": 2.733050847457627, "grad_norm": 1.8974641561508179, "learning_rate": 8.633606991525424e-06, "loss": 1.6619, "mean_token_accuracy": 0.6606922820210457, "num_tokens": 8306765.0, "step": 10320 }, { "epoch": 2.733580508474576, "grad_norm": 1.7811856269836426, "learning_rate": 8.633342161016949e-06, "loss": 1.6493, "mean_token_accuracy": 0.6405050531029701, "num_tokens": 8308441.0, "step": 10322 }, { "epoch": 2.7341101694915255, "grad_norm": 2.1633472442626953, "learning_rate": 8.633077330508474e-06, "loss": 1.4547, "mean_token_accuracy": 0.7023903653025627, "num_tokens": 8309960.0, "step": 10324 }, { "epoch": 2.7346398305084745, "grad_norm": 1.1818325519561768, "learning_rate": 8.6328125e-06, "loss": 1.0303, "mean_token_accuracy": 0.7448550760746002, "num_tokens": 8311996.0, "step": 10326 }, { "epoch": 2.735169491525424, "grad_norm": 1.394652247428894, "learning_rate": 8.632547669491527e-06, "loss": 1.2118, "mean_token_accuracy": 0.7266807407140732, "num_tokens": 8313552.0, "step": 10328 }, { "epoch": 2.735699152542373, "grad_norm": 1.9748790264129639, "learning_rate": 8.632282838983052e-06, "loss": 1.3667, "mean_token_accuracy": 0.6970266178250313, "num_tokens": 8315151.0, "step": 10330 }, { "epoch": 2.736228813559322, "grad_norm": 1.9092628955841064, "learning_rate": 8.632018008474577e-06, "loss": 1.6514, "mean_token_accuracy": 0.6343919932842255, "num_tokens": 8316912.0, "step": 10332 }, { "epoch": 2.736758474576271, "grad_norm": 1.6034281253814697, "learning_rate": 8.631753177966102e-06, "loss": 1.1979, "mean_token_accuracy": 0.7387799173593521, "num_tokens": 8318402.0, "step": 10334 }, { "epoch": 2.7372881355932206, "grad_norm": 1.5271492004394531, "learning_rate": 8.631488347457629e-06, "loss": 1.1159, "mean_token_accuracy": 0.7404142916202545, "num_tokens": 8319947.0, "step": 10336 }, { "epoch": 2.7378177966101696, "grad_norm": 2.052593469619751, "learning_rate": 8.631223516949154e-06, "loss": 1.6978, "mean_token_accuracy": 0.6367986500263214, "num_tokens": 8321489.0, "step": 10338 }, { "epoch": 2.7383474576271185, "grad_norm": 1.9799811840057373, "learning_rate": 8.630958686440679e-06, "loss": 1.3388, "mean_token_accuracy": 0.7102203741669655, "num_tokens": 8323055.0, "step": 10340 }, { "epoch": 2.738877118644068, "grad_norm": 1.8857104778289795, "learning_rate": 8.630693855932203e-06, "loss": 1.7249, "mean_token_accuracy": 0.6539606228470802, "num_tokens": 8324588.0, "step": 10342 }, { "epoch": 2.739406779661017, "grad_norm": 1.318274974822998, "learning_rate": 8.63042902542373e-06, "loss": 1.0376, "mean_token_accuracy": 0.7506202459335327, "num_tokens": 8326542.0, "step": 10344 }, { "epoch": 2.7399364406779663, "grad_norm": 1.4671509265899658, "learning_rate": 8.630164194915255e-06, "loss": 0.9549, "mean_token_accuracy": 0.7273467034101486, "num_tokens": 8328406.0, "step": 10346 }, { "epoch": 2.7404661016949152, "grad_norm": 1.9281625747680664, "learning_rate": 8.629899364406782e-06, "loss": 1.5109, "mean_token_accuracy": 0.6677718758583069, "num_tokens": 8330141.0, "step": 10348 }, { "epoch": 2.740995762711864, "grad_norm": 1.667096495628357, "learning_rate": 8.629634533898305e-06, "loss": 1.1262, "mean_token_accuracy": 0.7413694337010384, "num_tokens": 8331716.0, "step": 10350 }, { "epoch": 2.7415254237288136, "grad_norm": 2.0032413005828857, "learning_rate": 8.629369703389831e-06, "loss": 1.4734, "mean_token_accuracy": 0.692886509001255, "num_tokens": 8333243.0, "step": 10352 }, { "epoch": 2.742055084745763, "grad_norm": 1.5842894315719604, "learning_rate": 8.629104872881356e-06, "loss": 1.6632, "mean_token_accuracy": 0.6458035185933113, "num_tokens": 8334826.0, "step": 10354 }, { "epoch": 2.742584745762712, "grad_norm": 1.503528118133545, "learning_rate": 8.628840042372883e-06, "loss": 1.3229, "mean_token_accuracy": 0.7000585161149502, "num_tokens": 8336667.0, "step": 10356 }, { "epoch": 2.743114406779661, "grad_norm": 1.5615891218185425, "learning_rate": 8.628575211864408e-06, "loss": 1.2796, "mean_token_accuracy": 0.6959140971302986, "num_tokens": 8338371.0, "step": 10358 }, { "epoch": 2.7436440677966103, "grad_norm": 1.4794882535934448, "learning_rate": 8.628310381355933e-06, "loss": 1.126, "mean_token_accuracy": 0.7518616020679474, "num_tokens": 8339935.0, "step": 10360 }, { "epoch": 2.7441737288135593, "grad_norm": 2.01926326751709, "learning_rate": 8.628045550847458e-06, "loss": 1.7705, "mean_token_accuracy": 0.6159859970211983, "num_tokens": 8341402.0, "step": 10362 }, { "epoch": 2.7447033898305087, "grad_norm": 1.792243480682373, "learning_rate": 8.627780720338984e-06, "loss": 1.2315, "mean_token_accuracy": 0.7060949727892876, "num_tokens": 8342964.0, "step": 10364 }, { "epoch": 2.7452330508474576, "grad_norm": 1.9114181995391846, "learning_rate": 8.62751588983051e-06, "loss": 1.722, "mean_token_accuracy": 0.6415062993764877, "num_tokens": 8344732.0, "step": 10366 }, { "epoch": 2.7457627118644066, "grad_norm": 1.2811155319213867, "learning_rate": 8.627251059322034e-06, "loss": 1.1894, "mean_token_accuracy": 0.7278032004833221, "num_tokens": 8346269.0, "step": 10368 }, { "epoch": 2.746292372881356, "grad_norm": 1.9433517456054688, "learning_rate": 8.62698622881356e-06, "loss": 1.2385, "mean_token_accuracy": 0.7266750857234001, "num_tokens": 8347611.0, "step": 10370 }, { "epoch": 2.746822033898305, "grad_norm": 1.857473611831665, "learning_rate": 8.626721398305086e-06, "loss": 1.4633, "mean_token_accuracy": 0.6943826302886009, "num_tokens": 8349356.0, "step": 10372 }, { "epoch": 2.7473516949152543, "grad_norm": 1.4579083919525146, "learning_rate": 8.62645656779661e-06, "loss": 1.4887, "mean_token_accuracy": 0.6774809509515762, "num_tokens": 8351108.0, "step": 10374 }, { "epoch": 2.7478813559322033, "grad_norm": 1.8349785804748535, "learning_rate": 8.626191737288136e-06, "loss": 1.3063, "mean_token_accuracy": 0.6979600712656975, "num_tokens": 8352714.0, "step": 10376 }, { "epoch": 2.7484110169491527, "grad_norm": 1.3245805501937866, "learning_rate": 8.62592690677966e-06, "loss": 0.8533, "mean_token_accuracy": 0.7712437361478806, "num_tokens": 8354266.0, "step": 10378 }, { "epoch": 2.7489406779661016, "grad_norm": 1.651287317276001, "learning_rate": 8.625662076271187e-06, "loss": 1.2929, "mean_token_accuracy": 0.7115250155329704, "num_tokens": 8355970.0, "step": 10380 }, { "epoch": 2.749470338983051, "grad_norm": 1.8378468751907349, "learning_rate": 8.625397245762712e-06, "loss": 1.1311, "mean_token_accuracy": 0.7173692807555199, "num_tokens": 8357308.0, "step": 10382 }, { "epoch": 2.75, "grad_norm": 1.8989192247390747, "learning_rate": 8.625132415254239e-06, "loss": 0.8898, "mean_token_accuracy": 0.7985298708081245, "num_tokens": 8358698.0, "step": 10384 }, { "epoch": 2.750529661016949, "grad_norm": 1.7143864631652832, "learning_rate": 8.624867584745764e-06, "loss": 1.3545, "mean_token_accuracy": 0.6879820302128792, "num_tokens": 8360279.0, "step": 10386 }, { "epoch": 2.7510593220338984, "grad_norm": 1.7120819091796875, "learning_rate": 8.624602754237289e-06, "loss": 1.3358, "mean_token_accuracy": 0.6801240369677544, "num_tokens": 8362126.0, "step": 10388 }, { "epoch": 2.7515889830508473, "grad_norm": 1.9343544244766235, "learning_rate": 8.624337923728814e-06, "loss": 1.4682, "mean_token_accuracy": 0.6955795884132385, "num_tokens": 8363618.0, "step": 10390 }, { "epoch": 2.7521186440677967, "grad_norm": 2.034236192703247, "learning_rate": 8.62407309322034e-06, "loss": 1.1967, "mean_token_accuracy": 0.7356615513563156, "num_tokens": 8365179.0, "step": 10392 }, { "epoch": 2.7526483050847457, "grad_norm": 1.6419216394424438, "learning_rate": 8.623808262711865e-06, "loss": 0.9981, "mean_token_accuracy": 0.738392561674118, "num_tokens": 8366699.0, "step": 10394 }, { "epoch": 2.753177966101695, "grad_norm": 1.409232497215271, "learning_rate": 8.62354343220339e-06, "loss": 0.9083, "mean_token_accuracy": 0.7987825646996498, "num_tokens": 8368175.0, "step": 10396 }, { "epoch": 2.753707627118644, "grad_norm": 1.4995858669281006, "learning_rate": 8.623278601694915e-06, "loss": 1.4348, "mean_token_accuracy": 0.6714676320552826, "num_tokens": 8369674.0, "step": 10398 }, { "epoch": 2.7542372881355934, "grad_norm": 1.680084466934204, "learning_rate": 8.623013771186442e-06, "loss": 1.0618, "mean_token_accuracy": 0.7409526258707047, "num_tokens": 8371494.0, "step": 10400 }, { "epoch": 2.7547669491525424, "grad_norm": 1.8976531028747559, "learning_rate": 8.622748940677967e-06, "loss": 1.3316, "mean_token_accuracy": 0.7372350618243217, "num_tokens": 8373047.0, "step": 10402 }, { "epoch": 2.7552966101694913, "grad_norm": 1.945424199104309, "learning_rate": 8.622484110169491e-06, "loss": 1.5037, "mean_token_accuracy": 0.6513557732105255, "num_tokens": 8374667.0, "step": 10404 }, { "epoch": 2.7558262711864407, "grad_norm": 1.5870949029922485, "learning_rate": 8.622219279661016e-06, "loss": 1.61, "mean_token_accuracy": 0.6575387567281723, "num_tokens": 8376506.0, "step": 10406 }, { "epoch": 2.7563559322033897, "grad_norm": 1.6314902305603027, "learning_rate": 8.621954449152543e-06, "loss": 0.8959, "mean_token_accuracy": 0.7856741920113564, "num_tokens": 8377954.0, "step": 10408 }, { "epoch": 2.756885593220339, "grad_norm": 1.707472562789917, "learning_rate": 8.621689618644068e-06, "loss": 1.2136, "mean_token_accuracy": 0.7409515082836151, "num_tokens": 8379459.0, "step": 10410 }, { "epoch": 2.757415254237288, "grad_norm": 1.7135579586029053, "learning_rate": 8.621424788135595e-06, "loss": 1.1852, "mean_token_accuracy": 0.7247519567608833, "num_tokens": 8380960.0, "step": 10412 }, { "epoch": 2.757944915254237, "grad_norm": 1.9916619062423706, "learning_rate": 8.62115995762712e-06, "loss": 0.8825, "mean_token_accuracy": 0.7841001600027084, "num_tokens": 8382371.0, "step": 10414 }, { "epoch": 2.7584745762711864, "grad_norm": 1.5791956186294556, "learning_rate": 8.620895127118644e-06, "loss": 1.3094, "mean_token_accuracy": 0.7061763107776642, "num_tokens": 8383826.0, "step": 10416 }, { "epoch": 2.759004237288136, "grad_norm": 1.8577364683151245, "learning_rate": 8.620630296610171e-06, "loss": 1.1991, "mean_token_accuracy": 0.7269325405359268, "num_tokens": 8385483.0, "step": 10418 }, { "epoch": 2.7595338983050848, "grad_norm": 2.346008539199829, "learning_rate": 8.620365466101696e-06, "loss": 1.2645, "mean_token_accuracy": 0.7123191505670547, "num_tokens": 8386770.0, "step": 10420 }, { "epoch": 2.7600635593220337, "grad_norm": 1.321252703666687, "learning_rate": 8.620100635593221e-06, "loss": 1.5353, "mean_token_accuracy": 0.6405942887067795, "num_tokens": 8388652.0, "step": 10422 }, { "epoch": 2.760593220338983, "grad_norm": 1.5551398992538452, "learning_rate": 8.619835805084746e-06, "loss": 1.438, "mean_token_accuracy": 0.6652289964258671, "num_tokens": 8390199.0, "step": 10424 }, { "epoch": 2.761122881355932, "grad_norm": 1.6050686836242676, "learning_rate": 8.619570974576272e-06, "loss": 1.0534, "mean_token_accuracy": 0.7457046434283257, "num_tokens": 8391516.0, "step": 10426 }, { "epoch": 2.7616525423728815, "grad_norm": 1.9186052083969116, "learning_rate": 8.619306144067797e-06, "loss": 1.253, "mean_token_accuracy": 0.7211616709828377, "num_tokens": 8393344.0, "step": 10428 }, { "epoch": 2.7621822033898304, "grad_norm": 1.7599077224731445, "learning_rate": 8.619041313559322e-06, "loss": 1.2649, "mean_token_accuracy": 0.7228932231664658, "num_tokens": 8395007.0, "step": 10430 }, { "epoch": 2.7627118644067794, "grad_norm": 1.9283061027526855, "learning_rate": 8.618776483050847e-06, "loss": 0.8128, "mean_token_accuracy": 0.7997271865606308, "num_tokens": 8396366.0, "step": 10432 }, { "epoch": 2.763241525423729, "grad_norm": 1.8555490970611572, "learning_rate": 8.618511652542374e-06, "loss": 1.5498, "mean_token_accuracy": 0.6867103576660156, "num_tokens": 8398002.0, "step": 10434 }, { "epoch": 2.763771186440678, "grad_norm": 1.4049022197723389, "learning_rate": 8.618246822033899e-06, "loss": 1.1072, "mean_token_accuracy": 0.7493914216756821, "num_tokens": 8399410.0, "step": 10436 }, { "epoch": 2.764300847457627, "grad_norm": 2.2944908142089844, "learning_rate": 8.617981991525425e-06, "loss": 1.3524, "mean_token_accuracy": 0.709491953253746, "num_tokens": 8400861.0, "step": 10438 }, { "epoch": 2.764830508474576, "grad_norm": 1.5316379070281982, "learning_rate": 8.61771716101695e-06, "loss": 1.3027, "mean_token_accuracy": 0.7074144035577774, "num_tokens": 8402213.0, "step": 10440 }, { "epoch": 2.7653601694915255, "grad_norm": 1.4486528635025024, "learning_rate": 8.617452330508475e-06, "loss": 0.9529, "mean_token_accuracy": 0.7763093784451485, "num_tokens": 8404052.0, "step": 10442 }, { "epoch": 2.7658898305084745, "grad_norm": 1.5871800184249878, "learning_rate": 8.6171875e-06, "loss": 1.4972, "mean_token_accuracy": 0.6633592247962952, "num_tokens": 8405814.0, "step": 10444 }, { "epoch": 2.766419491525424, "grad_norm": 1.514140248298645, "learning_rate": 8.616922669491527e-06, "loss": 1.2441, "mean_token_accuracy": 0.7207202725112438, "num_tokens": 8407457.0, "step": 10446 }, { "epoch": 2.766949152542373, "grad_norm": 2.0588808059692383, "learning_rate": 8.616657838983052e-06, "loss": 1.5178, "mean_token_accuracy": 0.6661819517612457, "num_tokens": 8409044.0, "step": 10448 }, { "epoch": 2.767478813559322, "grad_norm": 1.774047613143921, "learning_rate": 8.616393008474577e-06, "loss": 1.2126, "mean_token_accuracy": 0.7350526973605156, "num_tokens": 8410503.0, "step": 10450 }, { "epoch": 2.768008474576271, "grad_norm": 1.8328070640563965, "learning_rate": 8.616128177966102e-06, "loss": 1.4673, "mean_token_accuracy": 0.6647455915808678, "num_tokens": 8412070.0, "step": 10452 }, { "epoch": 2.7685381355932206, "grad_norm": 1.7509132623672485, "learning_rate": 8.615863347457628e-06, "loss": 1.0664, "mean_token_accuracy": 0.7373932227492332, "num_tokens": 8413589.0, "step": 10454 }, { "epoch": 2.7690677966101696, "grad_norm": 1.756513237953186, "learning_rate": 8.615598516949153e-06, "loss": 1.2021, "mean_token_accuracy": 0.7088072896003723, "num_tokens": 8415342.0, "step": 10456 }, { "epoch": 2.7695974576271185, "grad_norm": 1.6843628883361816, "learning_rate": 8.615333686440678e-06, "loss": 1.1477, "mean_token_accuracy": 0.729596421122551, "num_tokens": 8416738.0, "step": 10458 }, { "epoch": 2.770127118644068, "grad_norm": 1.6983559131622314, "learning_rate": 8.615068855932203e-06, "loss": 1.2791, "mean_token_accuracy": 0.7292235866189003, "num_tokens": 8418359.0, "step": 10460 }, { "epoch": 2.770656779661017, "grad_norm": 2.224135160446167, "learning_rate": 8.61480402542373e-06, "loss": 1.8227, "mean_token_accuracy": 0.6073842570185661, "num_tokens": 8419794.0, "step": 10462 }, { "epoch": 2.7711864406779663, "grad_norm": 2.6286325454711914, "learning_rate": 8.614539194915255e-06, "loss": 1.3157, "mean_token_accuracy": 0.7120077088475227, "num_tokens": 8421268.0, "step": 10464 }, { "epoch": 2.7717161016949152, "grad_norm": 1.4565435647964478, "learning_rate": 8.614274364406781e-06, "loss": 0.9448, "mean_token_accuracy": 0.765392079949379, "num_tokens": 8423055.0, "step": 10466 }, { "epoch": 2.772245762711864, "grad_norm": 1.6218810081481934, "learning_rate": 8.614009533898306e-06, "loss": 1.1599, "mean_token_accuracy": 0.7288219928741455, "num_tokens": 8424578.0, "step": 10468 }, { "epoch": 2.7727754237288136, "grad_norm": 1.6329988241195679, "learning_rate": 8.613744703389831e-06, "loss": 1.5359, "mean_token_accuracy": 0.6792244873940945, "num_tokens": 8426251.0, "step": 10470 }, { "epoch": 2.773305084745763, "grad_norm": 1.6410244703292847, "learning_rate": 8.613479872881356e-06, "loss": 1.1882, "mean_token_accuracy": 0.7557130008935928, "num_tokens": 8427836.0, "step": 10472 }, { "epoch": 2.773834745762712, "grad_norm": 1.5418704748153687, "learning_rate": 8.613215042372883e-06, "loss": 1.6756, "mean_token_accuracy": 0.6215669885277748, "num_tokens": 8430312.0, "step": 10474 }, { "epoch": 2.774364406779661, "grad_norm": 1.9190078973770142, "learning_rate": 8.612950211864408e-06, "loss": 1.4994, "mean_token_accuracy": 0.6379512995481491, "num_tokens": 8431845.0, "step": 10476 }, { "epoch": 2.7748940677966103, "grad_norm": 1.324937343597412, "learning_rate": 8.612685381355933e-06, "loss": 0.839, "mean_token_accuracy": 0.7953475788235664, "num_tokens": 8433212.0, "step": 10478 }, { "epoch": 2.7754237288135593, "grad_norm": 2.0714428424835205, "learning_rate": 8.612420550847457e-06, "loss": 1.6339, "mean_token_accuracy": 0.6419214904308319, "num_tokens": 8434727.0, "step": 10480 }, { "epoch": 2.7759533898305087, "grad_norm": 1.6382617950439453, "learning_rate": 8.612155720338984e-06, "loss": 1.1263, "mean_token_accuracy": 0.747610330581665, "num_tokens": 8436285.0, "step": 10482 }, { "epoch": 2.7764830508474576, "grad_norm": 1.9740465879440308, "learning_rate": 8.611890889830509e-06, "loss": 1.5295, "mean_token_accuracy": 0.6842191517353058, "num_tokens": 8437730.0, "step": 10484 }, { "epoch": 2.7770127118644066, "grad_norm": 1.468372106552124, "learning_rate": 8.611626059322034e-06, "loss": 1.0738, "mean_token_accuracy": 0.7269777059555054, "num_tokens": 8439518.0, "step": 10486 }, { "epoch": 2.777542372881356, "grad_norm": 1.5804345607757568, "learning_rate": 8.611361228813559e-06, "loss": 1.0963, "mean_token_accuracy": 0.7505155578255653, "num_tokens": 8440944.0, "step": 10488 }, { "epoch": 2.778072033898305, "grad_norm": 1.7810420989990234, "learning_rate": 8.611096398305085e-06, "loss": 1.1109, "mean_token_accuracy": 0.7396164834499359, "num_tokens": 8442508.0, "step": 10490 }, { "epoch": 2.7786016949152543, "grad_norm": 1.7289272546768188, "learning_rate": 8.61083156779661e-06, "loss": 1.2664, "mean_token_accuracy": 0.7027486488223076, "num_tokens": 8444109.0, "step": 10492 }, { "epoch": 2.7791313559322033, "grad_norm": 1.7153124809265137, "learning_rate": 8.610566737288137e-06, "loss": 1.499, "mean_token_accuracy": 0.6744050234556198, "num_tokens": 8445560.0, "step": 10494 }, { "epoch": 2.7796610169491527, "grad_norm": 1.678137183189392, "learning_rate": 8.610301906779662e-06, "loss": 1.276, "mean_token_accuracy": 0.7166552022099495, "num_tokens": 8447200.0, "step": 10496 }, { "epoch": 2.7801906779661016, "grad_norm": 1.3948153257369995, "learning_rate": 8.610037076271187e-06, "loss": 1.2264, "mean_token_accuracy": 0.699286051094532, "num_tokens": 8448914.0, "step": 10498 }, { "epoch": 2.780720338983051, "grad_norm": 1.367941975593567, "learning_rate": 8.609772245762714e-06, "loss": 1.206, "step": 10500 }, { "epoch": 2.780720338983051, "eval_loss": 1.3090553283691406, "eval_mean_token_accuracy": 0.7013760750943964, "eval_num_tokens": 8451234.0, "eval_runtime": 48.5321, "eval_samples_per_second": 6.346, "eval_steps_per_second": 6.346, "step": 10500 }, { "epoch": 2.78125, "grad_norm": 1.7474255561828613, "learning_rate": 8.609507415254238e-06, "loss": 1.2083, "mean_token_accuracy": 0.7332862168550491, "num_tokens": 8452559.0, "step": 10502 }, { "epoch": 2.781779661016949, "grad_norm": 1.632373332977295, "learning_rate": 8.609242584745763e-06, "loss": 1.3056, "mean_token_accuracy": 0.7076364010572433, "num_tokens": 8454243.0, "step": 10504 }, { "epoch": 2.7823093220338984, "grad_norm": 1.9505070447921753, "learning_rate": 8.608977754237288e-06, "loss": 1.494, "mean_token_accuracy": 0.6605501398444176, "num_tokens": 8455700.0, "step": 10506 }, { "epoch": 2.7828389830508473, "grad_norm": 1.6622045040130615, "learning_rate": 8.608712923728815e-06, "loss": 1.781, "mean_token_accuracy": 0.6240998730063438, "num_tokens": 8457272.0, "step": 10508 }, { "epoch": 2.7833686440677967, "grad_norm": 1.5861815214157104, "learning_rate": 8.60844809322034e-06, "loss": 1.1908, "mean_token_accuracy": 0.7266937419772148, "num_tokens": 8458964.0, "step": 10510 }, { "epoch": 2.7838983050847457, "grad_norm": 1.6714131832122803, "learning_rate": 8.608183262711865e-06, "loss": 1.1676, "mean_token_accuracy": 0.7477807998657227, "num_tokens": 8460419.0, "step": 10512 }, { "epoch": 2.784427966101695, "grad_norm": 1.6854618787765503, "learning_rate": 8.60791843220339e-06, "loss": 0.885, "mean_token_accuracy": 0.7825224176049232, "num_tokens": 8461965.0, "step": 10514 }, { "epoch": 2.784957627118644, "grad_norm": 1.759851336479187, "learning_rate": 8.607653601694916e-06, "loss": 1.4714, "mean_token_accuracy": 0.687301829457283, "num_tokens": 8463584.0, "step": 10516 }, { "epoch": 2.7854872881355934, "grad_norm": 1.47037672996521, "learning_rate": 8.607388771186441e-06, "loss": 1.157, "mean_token_accuracy": 0.7374267429113388, "num_tokens": 8465629.0, "step": 10518 }, { "epoch": 2.7860169491525424, "grad_norm": 1.7975114583969116, "learning_rate": 8.607123940677968e-06, "loss": 0.8145, "mean_token_accuracy": 0.7915939912199974, "num_tokens": 8467057.0, "step": 10520 }, { "epoch": 2.7865466101694913, "grad_norm": 1.6343963146209717, "learning_rate": 8.606859110169493e-06, "loss": 1.3355, "mean_token_accuracy": 0.7112138122320175, "num_tokens": 8468589.0, "step": 10522 }, { "epoch": 2.7870762711864407, "grad_norm": 1.6743707656860352, "learning_rate": 8.606594279661018e-06, "loss": 1.0839, "mean_token_accuracy": 0.7307076081633568, "num_tokens": 8470146.0, "step": 10524 }, { "epoch": 2.7876059322033897, "grad_norm": 1.8558177947998047, "learning_rate": 8.606329449152543e-06, "loss": 1.2793, "mean_token_accuracy": 0.6895685717463493, "num_tokens": 8471816.0, "step": 10526 }, { "epoch": 2.788135593220339, "grad_norm": 1.7986899614334106, "learning_rate": 8.60606461864407e-06, "loss": 1.0562, "mean_token_accuracy": 0.7453114911913872, "num_tokens": 8473437.0, "step": 10528 }, { "epoch": 2.788665254237288, "grad_norm": 1.7961972951889038, "learning_rate": 8.605799788135594e-06, "loss": 1.3014, "mean_token_accuracy": 0.7196089923381805, "num_tokens": 8474922.0, "step": 10530 }, { "epoch": 2.789194915254237, "grad_norm": 1.7604060173034668, "learning_rate": 8.605534957627119e-06, "loss": 1.2937, "mean_token_accuracy": 0.6926335096359253, "num_tokens": 8476568.0, "step": 10532 }, { "epoch": 2.7897245762711864, "grad_norm": 1.3600428104400635, "learning_rate": 8.605270127118644e-06, "loss": 0.9019, "mean_token_accuracy": 0.7742876410484314, "num_tokens": 8478096.0, "step": 10534 }, { "epoch": 2.790254237288136, "grad_norm": 1.1935402154922485, "learning_rate": 8.60500529661017e-06, "loss": 1.2432, "mean_token_accuracy": 0.6619851775467396, "num_tokens": 8480536.0, "step": 10536 }, { "epoch": 2.7907838983050848, "grad_norm": 1.8320934772491455, "learning_rate": 8.604740466101696e-06, "loss": 1.4767, "mean_token_accuracy": 0.6798962540924549, "num_tokens": 8481913.0, "step": 10538 }, { "epoch": 2.7913135593220337, "grad_norm": 1.4372316598892212, "learning_rate": 8.60447563559322e-06, "loss": 1.1061, "mean_token_accuracy": 0.7426073029637337, "num_tokens": 8483487.0, "step": 10540 }, { "epoch": 2.791843220338983, "grad_norm": 1.8136330842971802, "learning_rate": 8.604210805084745e-06, "loss": 1.3705, "mean_token_accuracy": 0.7027605324983597, "num_tokens": 8484948.0, "step": 10542 }, { "epoch": 2.792372881355932, "grad_norm": 1.9089914560317993, "learning_rate": 8.603945974576272e-06, "loss": 1.4796, "mean_token_accuracy": 0.6785353496670723, "num_tokens": 8486570.0, "step": 10544 }, { "epoch": 2.7929025423728815, "grad_norm": 1.4184688329696655, "learning_rate": 8.603681144067797e-06, "loss": 1.0334, "mean_token_accuracy": 0.7605973556637764, "num_tokens": 8488087.0, "step": 10546 }, { "epoch": 2.7934322033898304, "grad_norm": 1.4290589094161987, "learning_rate": 8.603416313559324e-06, "loss": 0.9036, "mean_token_accuracy": 0.7774649932980537, "num_tokens": 8489542.0, "step": 10548 }, { "epoch": 2.7939618644067794, "grad_norm": 1.7858856916427612, "learning_rate": 8.603151483050849e-06, "loss": 0.9856, "mean_token_accuracy": 0.7475552558898926, "num_tokens": 8491121.0, "step": 10550 }, { "epoch": 2.794491525423729, "grad_norm": 1.3475253582000732, "learning_rate": 8.602886652542374e-06, "loss": 1.3914, "mean_token_accuracy": 0.6902167275547981, "num_tokens": 8492744.0, "step": 10552 }, { "epoch": 2.795021186440678, "grad_norm": 1.8284510374069214, "learning_rate": 8.602621822033898e-06, "loss": 1.5011, "mean_token_accuracy": 0.6749813035130501, "num_tokens": 8494286.0, "step": 10554 }, { "epoch": 2.795550847457627, "grad_norm": 1.7491616010665894, "learning_rate": 8.602356991525425e-06, "loss": 0.8977, "mean_token_accuracy": 0.760154016315937, "num_tokens": 8495841.0, "step": 10556 }, { "epoch": 2.796080508474576, "grad_norm": 1.6107141971588135, "learning_rate": 8.60209216101695e-06, "loss": 1.0628, "mean_token_accuracy": 0.7431936338543892, "num_tokens": 8497473.0, "step": 10558 }, { "epoch": 2.7966101694915255, "grad_norm": 1.757981300354004, "learning_rate": 8.601827330508475e-06, "loss": 1.4313, "mean_token_accuracy": 0.6975200474262238, "num_tokens": 8498990.0, "step": 10560 }, { "epoch": 2.7971398305084745, "grad_norm": 1.795925259590149, "learning_rate": 8.6015625e-06, "loss": 1.3487, "mean_token_accuracy": 0.7178583741188049, "num_tokens": 8500368.0, "step": 10562 }, { "epoch": 2.797669491525424, "grad_norm": 1.8872624635696411, "learning_rate": 8.601297669491526e-06, "loss": 1.5169, "mean_token_accuracy": 0.6622330546379089, "num_tokens": 8502258.0, "step": 10564 }, { "epoch": 2.798199152542373, "grad_norm": 1.7623664140701294, "learning_rate": 8.601032838983051e-06, "loss": 1.2646, "mean_token_accuracy": 0.7152320072054863, "num_tokens": 8503983.0, "step": 10566 }, { "epoch": 2.798728813559322, "grad_norm": 2.0306339263916016, "learning_rate": 8.600768008474576e-06, "loss": 1.166, "mean_token_accuracy": 0.7341010123491287, "num_tokens": 8505343.0, "step": 10568 }, { "epoch": 2.799258474576271, "grad_norm": 1.769252896308899, "learning_rate": 8.600503177966101e-06, "loss": 1.3629, "mean_token_accuracy": 0.6902765110135078, "num_tokens": 8506736.0, "step": 10570 }, { "epoch": 2.7997881355932206, "grad_norm": 1.8169416189193726, "learning_rate": 8.600238347457628e-06, "loss": 1.6663, "mean_token_accuracy": 0.6239303573966026, "num_tokens": 8508544.0, "step": 10572 }, { "epoch": 2.8003177966101696, "grad_norm": 1.8449300527572632, "learning_rate": 8.599973516949153e-06, "loss": 1.4611, "mean_token_accuracy": 0.6742641553282738, "num_tokens": 8509983.0, "step": 10574 }, { "epoch": 2.8008474576271185, "grad_norm": 1.8883057832717896, "learning_rate": 8.59970868644068e-06, "loss": 1.1008, "mean_token_accuracy": 0.7478493675589561, "num_tokens": 8511707.0, "step": 10576 }, { "epoch": 2.801377118644068, "grad_norm": 1.7552253007888794, "learning_rate": 8.599443855932204e-06, "loss": 1.8454, "mean_token_accuracy": 0.5905129909515381, "num_tokens": 8513273.0, "step": 10578 }, { "epoch": 2.801906779661017, "grad_norm": 1.4464471340179443, "learning_rate": 8.59917902542373e-06, "loss": 0.9168, "mean_token_accuracy": 0.7629233300685883, "num_tokens": 8514887.0, "step": 10580 }, { "epoch": 2.8024364406779663, "grad_norm": 1.5854109525680542, "learning_rate": 8.598914194915256e-06, "loss": 1.5938, "mean_token_accuracy": 0.6600034981966019, "num_tokens": 8516767.0, "step": 10582 }, { "epoch": 2.8029661016949152, "grad_norm": 1.8508358001708984, "learning_rate": 8.598649364406781e-06, "loss": 1.7334, "mean_token_accuracy": 0.6152865141630173, "num_tokens": 8518233.0, "step": 10584 }, { "epoch": 2.803495762711864, "grad_norm": 1.6318359375, "learning_rate": 8.598384533898306e-06, "loss": 1.4509, "mean_token_accuracy": 0.6806862577795982, "num_tokens": 8520039.0, "step": 10586 }, { "epoch": 2.8040254237288136, "grad_norm": 1.7253482341766357, "learning_rate": 8.59811970338983e-06, "loss": 1.3519, "mean_token_accuracy": 0.7168690003454685, "num_tokens": 8521548.0, "step": 10588 }, { "epoch": 2.804555084745763, "grad_norm": 1.8749477863311768, "learning_rate": 8.597854872881357e-06, "loss": 1.2304, "mean_token_accuracy": 0.7098672240972519, "num_tokens": 8523132.0, "step": 10590 }, { "epoch": 2.805084745762712, "grad_norm": 1.5849854946136475, "learning_rate": 8.597590042372882e-06, "loss": 1.0637, "mean_token_accuracy": 0.7406995818018913, "num_tokens": 8524486.0, "step": 10592 }, { "epoch": 2.805614406779661, "grad_norm": 1.6948246955871582, "learning_rate": 8.597325211864407e-06, "loss": 1.1218, "mean_token_accuracy": 0.7447976842522621, "num_tokens": 8525947.0, "step": 10594 }, { "epoch": 2.8061440677966103, "grad_norm": 1.5313823223114014, "learning_rate": 8.597060381355932e-06, "loss": 1.3065, "mean_token_accuracy": 0.7101522535085678, "num_tokens": 8527495.0, "step": 10596 }, { "epoch": 2.8066737288135593, "grad_norm": 2.1012914180755615, "learning_rate": 8.596795550847459e-06, "loss": 1.2759, "mean_token_accuracy": 0.6994592472910881, "num_tokens": 8528924.0, "step": 10598 }, { "epoch": 2.8072033898305087, "grad_norm": 1.354817509651184, "learning_rate": 8.596530720338984e-06, "loss": 1.2785, "mean_token_accuracy": 0.721453208476305, "num_tokens": 8530736.0, "step": 10600 }, { "epoch": 2.8077330508474576, "grad_norm": 1.858219027519226, "learning_rate": 8.59626588983051e-06, "loss": 1.7349, "mean_token_accuracy": 0.6245744526386261, "num_tokens": 8532299.0, "step": 10602 }, { "epoch": 2.8082627118644066, "grad_norm": 1.644585132598877, "learning_rate": 8.596001059322035e-06, "loss": 1.3237, "mean_token_accuracy": 0.727170180529356, "num_tokens": 8533888.0, "step": 10604 }, { "epoch": 2.808792372881356, "grad_norm": 1.6470134258270264, "learning_rate": 8.59573622881356e-06, "loss": 1.2808, "mean_token_accuracy": 0.6805369481444359, "num_tokens": 8535564.0, "step": 10606 }, { "epoch": 2.809322033898305, "grad_norm": 1.3644696474075317, "learning_rate": 8.595471398305085e-06, "loss": 1.2285, "mean_token_accuracy": 0.7298530340194702, "num_tokens": 8537311.0, "step": 10608 }, { "epoch": 2.8098516949152543, "grad_norm": 1.6530460119247437, "learning_rate": 8.595206567796612e-06, "loss": 1.3612, "mean_token_accuracy": 0.668747715651989, "num_tokens": 8538894.0, "step": 10610 }, { "epoch": 2.8103813559322033, "grad_norm": 1.3141924142837524, "learning_rate": 8.594941737288137e-06, "loss": 1.1339, "mean_token_accuracy": 0.7456530854105949, "num_tokens": 8540472.0, "step": 10612 }, { "epoch": 2.8109110169491527, "grad_norm": 1.8504213094711304, "learning_rate": 8.594676906779662e-06, "loss": 1.1842, "mean_token_accuracy": 0.7189308553934097, "num_tokens": 8542124.0, "step": 10614 }, { "epoch": 2.8114406779661016, "grad_norm": 1.8155066967010498, "learning_rate": 8.594412076271187e-06, "loss": 1.6378, "mean_token_accuracy": 0.6484022736549377, "num_tokens": 8543688.0, "step": 10616 }, { "epoch": 2.811970338983051, "grad_norm": 1.4062436819076538, "learning_rate": 8.594147245762713e-06, "loss": 1.2989, "mean_token_accuracy": 0.7053695172071457, "num_tokens": 8545358.0, "step": 10618 }, { "epoch": 2.8125, "grad_norm": 1.9033491611480713, "learning_rate": 8.593882415254238e-06, "loss": 1.5934, "mean_token_accuracy": 0.6647096201777458, "num_tokens": 8546990.0, "step": 10620 }, { "epoch": 2.813029661016949, "grad_norm": 1.5893104076385498, "learning_rate": 8.593617584745763e-06, "loss": 1.2596, "mean_token_accuracy": 0.7174297720193863, "num_tokens": 8548467.0, "step": 10622 }, { "epoch": 2.8135593220338984, "grad_norm": 1.5899978876113892, "learning_rate": 8.593352754237288e-06, "loss": 0.9442, "mean_token_accuracy": 0.7456287294626236, "num_tokens": 8549948.0, "step": 10624 }, { "epoch": 2.8140889830508473, "grad_norm": 1.9134984016418457, "learning_rate": 8.593087923728815e-06, "loss": 1.6423, "mean_token_accuracy": 0.6475328579545021, "num_tokens": 8551898.0, "step": 10626 }, { "epoch": 2.8146186440677967, "grad_norm": 1.8683804273605347, "learning_rate": 8.59282309322034e-06, "loss": 1.0778, "mean_token_accuracy": 0.7523404061794281, "num_tokens": 8553424.0, "step": 10628 }, { "epoch": 2.8151483050847457, "grad_norm": 1.737514615058899, "learning_rate": 8.592558262711866e-06, "loss": 0.941, "mean_token_accuracy": 0.7746342048048973, "num_tokens": 8555241.0, "step": 10630 }, { "epoch": 2.815677966101695, "grad_norm": 1.8182883262634277, "learning_rate": 8.592293432203391e-06, "loss": 1.1629, "mean_token_accuracy": 0.7237274199724197, "num_tokens": 8556868.0, "step": 10632 }, { "epoch": 2.816207627118644, "grad_norm": 1.7951687574386597, "learning_rate": 8.592028601694916e-06, "loss": 1.6145, "mean_token_accuracy": 0.643159881234169, "num_tokens": 8558429.0, "step": 10634 }, { "epoch": 2.8167372881355934, "grad_norm": 1.8396706581115723, "learning_rate": 8.591763771186441e-06, "loss": 1.6906, "mean_token_accuracy": 0.6275242269039154, "num_tokens": 8560098.0, "step": 10636 }, { "epoch": 2.8172669491525424, "grad_norm": 1.83406400680542, "learning_rate": 8.591498940677967e-06, "loss": 1.4618, "mean_token_accuracy": 0.6635538712143898, "num_tokens": 8561603.0, "step": 10638 }, { "epoch": 2.8177966101694913, "grad_norm": 1.675143837928772, "learning_rate": 8.591234110169492e-06, "loss": 1.1828, "mean_token_accuracy": 0.7233400344848633, "num_tokens": 8563069.0, "step": 10640 }, { "epoch": 2.8183262711864407, "grad_norm": 1.8409291505813599, "learning_rate": 8.590969279661017e-06, "loss": 1.4168, "mean_token_accuracy": 0.6948782205581665, "num_tokens": 8564495.0, "step": 10642 }, { "epoch": 2.8188559322033897, "grad_norm": 1.720816731452942, "learning_rate": 8.590704449152542e-06, "loss": 1.0751, "mean_token_accuracy": 0.7630570605397224, "num_tokens": 8566018.0, "step": 10644 }, { "epoch": 2.819385593220339, "grad_norm": 1.8969053030014038, "learning_rate": 8.590439618644069e-06, "loss": 1.0373, "mean_token_accuracy": 0.7625932171940804, "num_tokens": 8567409.0, "step": 10646 }, { "epoch": 2.819915254237288, "grad_norm": 1.5052595138549805, "learning_rate": 8.590174788135594e-06, "loss": 1.2959, "mean_token_accuracy": 0.7204787731170654, "num_tokens": 8568890.0, "step": 10648 }, { "epoch": 2.820444915254237, "grad_norm": 1.5511045455932617, "learning_rate": 8.589909957627119e-06, "loss": 1.1994, "mean_token_accuracy": 0.7100897133350372, "num_tokens": 8570591.0, "step": 10650 }, { "epoch": 2.8209745762711864, "grad_norm": 1.7556225061416626, "learning_rate": 8.589645127118644e-06, "loss": 1.3088, "mean_token_accuracy": 0.6942583322525024, "num_tokens": 8572320.0, "step": 10652 }, { "epoch": 2.821504237288136, "grad_norm": 1.7081832885742188, "learning_rate": 8.58938029661017e-06, "loss": 1.4302, "mean_token_accuracy": 0.6617555096745491, "num_tokens": 8573780.0, "step": 10654 }, { "epoch": 2.8220338983050848, "grad_norm": 1.6999216079711914, "learning_rate": 8.589115466101695e-06, "loss": 1.0052, "mean_token_accuracy": 0.7754732295870781, "num_tokens": 8575082.0, "step": 10656 }, { "epoch": 2.8225635593220337, "grad_norm": 1.7359923124313354, "learning_rate": 8.588850635593222e-06, "loss": 1.2896, "mean_token_accuracy": 0.707259476184845, "num_tokens": 8576713.0, "step": 10658 }, { "epoch": 2.823093220338983, "grad_norm": 1.6101794242858887, "learning_rate": 8.588585805084745e-06, "loss": 0.9278, "mean_token_accuracy": 0.7559974491596222, "num_tokens": 8578357.0, "step": 10660 }, { "epoch": 2.823622881355932, "grad_norm": 1.8099948167800903, "learning_rate": 8.588320974576272e-06, "loss": 1.7841, "mean_token_accuracy": 0.6176184713840485, "num_tokens": 8579964.0, "step": 10662 }, { "epoch": 2.8241525423728815, "grad_norm": 1.6212323904037476, "learning_rate": 8.588056144067798e-06, "loss": 1.3786, "mean_token_accuracy": 0.7044181153178215, "num_tokens": 8581516.0, "step": 10664 }, { "epoch": 2.8246822033898304, "grad_norm": 2.070335865020752, "learning_rate": 8.587791313559323e-06, "loss": 1.7066, "mean_token_accuracy": 0.649487741291523, "num_tokens": 8582787.0, "step": 10666 }, { "epoch": 2.8252118644067794, "grad_norm": 2.1562862396240234, "learning_rate": 8.587526483050848e-06, "loss": 1.0852, "mean_token_accuracy": 0.7615067362785339, "num_tokens": 8584100.0, "step": 10668 }, { "epoch": 2.825741525423729, "grad_norm": 1.4484063386917114, "learning_rate": 8.587261652542373e-06, "loss": 0.9368, "mean_token_accuracy": 0.7803579121828079, "num_tokens": 8585566.0, "step": 10670 }, { "epoch": 2.826271186440678, "grad_norm": 1.7466284036636353, "learning_rate": 8.5869968220339e-06, "loss": 1.3737, "mean_token_accuracy": 0.6915844976902008, "num_tokens": 8587052.0, "step": 10672 }, { "epoch": 2.826800847457627, "grad_norm": 1.6136410236358643, "learning_rate": 8.586731991525425e-06, "loss": 1.1954, "mean_token_accuracy": 0.7460020706057549, "num_tokens": 8588416.0, "step": 10674 }, { "epoch": 2.827330508474576, "grad_norm": 1.5636271238327026, "learning_rate": 8.58646716101695e-06, "loss": 1.155, "mean_token_accuracy": 0.7527130842208862, "num_tokens": 8590319.0, "step": 10676 }, { "epoch": 2.8278601694915255, "grad_norm": 1.3274147510528564, "learning_rate": 8.586202330508475e-06, "loss": 1.1005, "mean_token_accuracy": 0.7524131685495377, "num_tokens": 8592004.0, "step": 10678 }, { "epoch": 2.8283898305084745, "grad_norm": 1.5673587322235107, "learning_rate": 8.585937500000001e-06, "loss": 1.1542, "mean_token_accuracy": 0.7528040856122971, "num_tokens": 8593383.0, "step": 10680 }, { "epoch": 2.828919491525424, "grad_norm": 1.888454794883728, "learning_rate": 8.585672669491526e-06, "loss": 1.6138, "mean_token_accuracy": 0.6155823022127151, "num_tokens": 8595111.0, "step": 10682 }, { "epoch": 2.829449152542373, "grad_norm": 1.9665895700454712, "learning_rate": 8.585407838983053e-06, "loss": 1.474, "mean_token_accuracy": 0.686926607042551, "num_tokens": 8596744.0, "step": 10684 }, { "epoch": 2.829978813559322, "grad_norm": 2.0908429622650146, "learning_rate": 8.585143008474578e-06, "loss": 1.3074, "mean_token_accuracy": 0.6894397139549255, "num_tokens": 8598355.0, "step": 10686 }, { "epoch": 2.830508474576271, "grad_norm": 1.7892531156539917, "learning_rate": 8.584878177966103e-06, "loss": 1.274, "mean_token_accuracy": 0.7294379621744156, "num_tokens": 8599897.0, "step": 10688 }, { "epoch": 2.8310381355932206, "grad_norm": 1.6153727769851685, "learning_rate": 8.584613347457628e-06, "loss": 0.9552, "mean_token_accuracy": 0.7751408517360687, "num_tokens": 8601421.0, "step": 10690 }, { "epoch": 2.8315677966101696, "grad_norm": 1.540390968322754, "learning_rate": 8.584348516949154e-06, "loss": 1.3354, "mean_token_accuracy": 0.7096364907920361, "num_tokens": 8603135.0, "step": 10692 }, { "epoch": 2.8320974576271185, "grad_norm": 1.4172077178955078, "learning_rate": 8.584083686440679e-06, "loss": 1.5426, "mean_token_accuracy": 0.6629315949976444, "num_tokens": 8604949.0, "step": 10694 }, { "epoch": 2.832627118644068, "grad_norm": 1.6566715240478516, "learning_rate": 8.583818855932204e-06, "loss": 1.262, "mean_token_accuracy": 0.7225133702158928, "num_tokens": 8606603.0, "step": 10696 }, { "epoch": 2.833156779661017, "grad_norm": 1.9780950546264648, "learning_rate": 8.583554025423729e-06, "loss": 1.236, "mean_token_accuracy": 0.6987394988536835, "num_tokens": 8607989.0, "step": 10698 }, { "epoch": 2.8336864406779663, "grad_norm": 2.038935899734497, "learning_rate": 8.583289194915256e-06, "loss": 1.4483, "mean_token_accuracy": 0.6775545626878738, "num_tokens": 8609445.0, "step": 10700 }, { "epoch": 2.8342161016949152, "grad_norm": 1.7861871719360352, "learning_rate": 8.58302436440678e-06, "loss": 1.578, "mean_token_accuracy": 0.6528407298028469, "num_tokens": 8611362.0, "step": 10702 }, { "epoch": 2.834745762711864, "grad_norm": 1.6842745542526245, "learning_rate": 8.582759533898305e-06, "loss": 1.5361, "mean_token_accuracy": 0.6576796472072601, "num_tokens": 8612948.0, "step": 10704 }, { "epoch": 2.8352754237288136, "grad_norm": 1.5045689344406128, "learning_rate": 8.58249470338983e-06, "loss": 1.2109, "mean_token_accuracy": 0.7183509990572929, "num_tokens": 8614593.0, "step": 10706 }, { "epoch": 2.835805084745763, "grad_norm": 1.9268074035644531, "learning_rate": 8.582229872881357e-06, "loss": 0.8385, "mean_token_accuracy": 0.7848267704248428, "num_tokens": 8615766.0, "step": 10708 }, { "epoch": 2.836334745762712, "grad_norm": 1.7664947509765625, "learning_rate": 8.581965042372882e-06, "loss": 1.0996, "mean_token_accuracy": 0.7187050580978394, "num_tokens": 8617632.0, "step": 10710 }, { "epoch": 2.836864406779661, "grad_norm": 1.6768980026245117, "learning_rate": 8.581700211864409e-06, "loss": 1.0497, "mean_token_accuracy": 0.7351494506001472, "num_tokens": 8619141.0, "step": 10712 }, { "epoch": 2.8373940677966103, "grad_norm": 1.6786799430847168, "learning_rate": 8.581435381355932e-06, "loss": 1.3081, "mean_token_accuracy": 0.722547821700573, "num_tokens": 8620723.0, "step": 10714 }, { "epoch": 2.8379237288135593, "grad_norm": 1.7074064016342163, "learning_rate": 8.581170550847458e-06, "loss": 1.3349, "mean_token_accuracy": 0.6783963665366173, "num_tokens": 8622291.0, "step": 10716 }, { "epoch": 2.8384533898305087, "grad_norm": 1.661134123802185, "learning_rate": 8.580905720338983e-06, "loss": 1.3055, "mean_token_accuracy": 0.708023201674223, "num_tokens": 8624018.0, "step": 10718 }, { "epoch": 2.8389830508474576, "grad_norm": 1.9817970991134644, "learning_rate": 8.58064088983051e-06, "loss": 1.6513, "mean_token_accuracy": 0.6490191705524921, "num_tokens": 8625662.0, "step": 10720 }, { "epoch": 2.8395127118644066, "grad_norm": 1.667187213897705, "learning_rate": 8.580376059322035e-06, "loss": 1.1929, "mean_token_accuracy": 0.7293088808655739, "num_tokens": 8627171.0, "step": 10722 }, { "epoch": 2.840042372881356, "grad_norm": 1.3765437602996826, "learning_rate": 8.58011122881356e-06, "loss": 1.2253, "mean_token_accuracy": 0.710851289331913, "num_tokens": 8628683.0, "step": 10724 }, { "epoch": 2.840572033898305, "grad_norm": 1.7790141105651855, "learning_rate": 8.579846398305085e-06, "loss": 1.4207, "mean_token_accuracy": 0.6931621059775352, "num_tokens": 8630323.0, "step": 10726 }, { "epoch": 2.8411016949152543, "grad_norm": 1.8179798126220703, "learning_rate": 8.579581567796611e-06, "loss": 1.4899, "mean_token_accuracy": 0.6898733526468277, "num_tokens": 8631926.0, "step": 10728 }, { "epoch": 2.8416313559322033, "grad_norm": 1.4935961961746216, "learning_rate": 8.579316737288136e-06, "loss": 1.3991, "mean_token_accuracy": 0.6935219019651413, "num_tokens": 8633728.0, "step": 10730 }, { "epoch": 2.8421610169491527, "grad_norm": 1.7943925857543945, "learning_rate": 8.579051906779661e-06, "loss": 1.2936, "mean_token_accuracy": 0.6980056613683701, "num_tokens": 8635030.0, "step": 10732 }, { "epoch": 2.8426906779661016, "grad_norm": 1.5289582014083862, "learning_rate": 8.578787076271186e-06, "loss": 1.1036, "mean_token_accuracy": 0.7071779295802116, "num_tokens": 8636701.0, "step": 10734 }, { "epoch": 2.843220338983051, "grad_norm": 1.492268681526184, "learning_rate": 8.578522245762713e-06, "loss": 1.1258, "mean_token_accuracy": 0.7437061443924904, "num_tokens": 8638576.0, "step": 10736 }, { "epoch": 2.84375, "grad_norm": 1.697190523147583, "learning_rate": 8.578257415254238e-06, "loss": 1.1732, "mean_token_accuracy": 0.7333952486515045, "num_tokens": 8640030.0, "step": 10738 }, { "epoch": 2.844279661016949, "grad_norm": 1.526818871498108, "learning_rate": 8.577992584745764e-06, "loss": 1.3172, "mean_token_accuracy": 0.7112894505262375, "num_tokens": 8641557.0, "step": 10740 }, { "epoch": 2.8448093220338984, "grad_norm": 1.8292979001998901, "learning_rate": 8.577727754237288e-06, "loss": 1.4258, "mean_token_accuracy": 0.680344469845295, "num_tokens": 8643325.0, "step": 10742 }, { "epoch": 2.8453389830508473, "grad_norm": 2.02634596824646, "learning_rate": 8.577462923728814e-06, "loss": 1.5146, "mean_token_accuracy": 0.6621270403265953, "num_tokens": 8644768.0, "step": 10744 }, { "epoch": 2.8458686440677967, "grad_norm": 1.5307600498199463, "learning_rate": 8.577198093220339e-06, "loss": 1.2412, "mean_token_accuracy": 0.7214826717972755, "num_tokens": 8646367.0, "step": 10746 }, { "epoch": 2.8463983050847457, "grad_norm": 1.7533214092254639, "learning_rate": 8.576933262711866e-06, "loss": 0.9977, "mean_token_accuracy": 0.7576958984136581, "num_tokens": 8648053.0, "step": 10748 }, { "epoch": 2.846927966101695, "grad_norm": 1.657272219657898, "learning_rate": 8.57666843220339e-06, "loss": 1.0792, "step": 10750 }, { "epoch": 2.846927966101695, "eval_loss": 1.3084203004837036, "eval_mean_token_accuracy": 0.7011879174546762, "eval_num_tokens": 8649697.0, "eval_runtime": 48.3655, "eval_samples_per_second": 6.368, "eval_steps_per_second": 6.368, "step": 10750 }, { "epoch": 2.847457627118644, "grad_norm": 1.4649407863616943, "learning_rate": 8.576403601694916e-06, "loss": 1.4764, "mean_token_accuracy": 0.7146118469536304, "num_tokens": 8651294.0, "step": 10752 }, { "epoch": 2.8479872881355934, "grad_norm": 1.8979346752166748, "learning_rate": 8.576138771186442e-06, "loss": 1.5372, "mean_token_accuracy": 0.6805263459682465, "num_tokens": 8652821.0, "step": 10754 }, { "epoch": 2.8485169491525424, "grad_norm": 1.847886323928833, "learning_rate": 8.575873940677967e-06, "loss": 1.2945, "mean_token_accuracy": 0.7088804021477699, "num_tokens": 8654328.0, "step": 10756 }, { "epoch": 2.8490466101694913, "grad_norm": 1.7102916240692139, "learning_rate": 8.575609110169492e-06, "loss": 1.5155, "mean_token_accuracy": 0.6666109375655651, "num_tokens": 8655990.0, "step": 10758 }, { "epoch": 2.8495762711864407, "grad_norm": 1.7702205181121826, "learning_rate": 8.575344279661017e-06, "loss": 1.2048, "mean_token_accuracy": 0.6966387256979942, "num_tokens": 8657770.0, "step": 10760 }, { "epoch": 2.8501059322033897, "grad_norm": 1.9265024662017822, "learning_rate": 8.575079449152544e-06, "loss": 1.1898, "mean_token_accuracy": 0.7168862372636795, "num_tokens": 8659318.0, "step": 10762 }, { "epoch": 2.850635593220339, "grad_norm": 1.9415483474731445, "learning_rate": 8.574814618644069e-06, "loss": 1.2131, "mean_token_accuracy": 0.7051974534988403, "num_tokens": 8660703.0, "step": 10764 }, { "epoch": 2.851165254237288, "grad_norm": 1.3092329502105713, "learning_rate": 8.574549788135595e-06, "loss": 1.2448, "mean_token_accuracy": 0.7292209938168526, "num_tokens": 8662474.0, "step": 10766 }, { "epoch": 2.851694915254237, "grad_norm": 1.7892200946807861, "learning_rate": 8.574284957627118e-06, "loss": 1.0509, "mean_token_accuracy": 0.7548419237136841, "num_tokens": 8663897.0, "step": 10768 }, { "epoch": 2.8522245762711864, "grad_norm": 1.8808796405792236, "learning_rate": 8.574020127118645e-06, "loss": 1.7165, "mean_token_accuracy": 0.6524816825985909, "num_tokens": 8665535.0, "step": 10770 }, { "epoch": 2.852754237288136, "grad_norm": 1.7037972211837769, "learning_rate": 8.57375529661017e-06, "loss": 1.5064, "mean_token_accuracy": 0.6870884150266647, "num_tokens": 8667245.0, "step": 10772 }, { "epoch": 2.8532838983050848, "grad_norm": 1.3579179048538208, "learning_rate": 8.573490466101697e-06, "loss": 1.3489, "mean_token_accuracy": 0.7226606607437134, "num_tokens": 8668955.0, "step": 10774 }, { "epoch": 2.8538135593220337, "grad_norm": 1.5367921590805054, "learning_rate": 8.573225635593221e-06, "loss": 1.3986, "mean_token_accuracy": 0.6930413246154785, "num_tokens": 8670717.0, "step": 10776 }, { "epoch": 2.854343220338983, "grad_norm": 2.0937092304229736, "learning_rate": 8.572960805084746e-06, "loss": 1.7403, "mean_token_accuracy": 0.6264596097171307, "num_tokens": 8672045.0, "step": 10778 }, { "epoch": 2.854872881355932, "grad_norm": 1.9914296865463257, "learning_rate": 8.572695974576271e-06, "loss": 1.1742, "mean_token_accuracy": 0.7279501035809517, "num_tokens": 8673328.0, "step": 10780 }, { "epoch": 2.8554025423728815, "grad_norm": 1.7299330234527588, "learning_rate": 8.572431144067798e-06, "loss": 0.941, "mean_token_accuracy": 0.7640436142683029, "num_tokens": 8674994.0, "step": 10782 }, { "epoch": 2.8559322033898304, "grad_norm": 2.223212480545044, "learning_rate": 8.572166313559323e-06, "loss": 1.7814, "mean_token_accuracy": 0.6325211748480797, "num_tokens": 8676294.0, "step": 10784 }, { "epoch": 2.8564618644067794, "grad_norm": 1.8173526525497437, "learning_rate": 8.571901483050848e-06, "loss": 1.2509, "mean_token_accuracy": 0.6851014085114002, "num_tokens": 8677899.0, "step": 10786 }, { "epoch": 2.856991525423729, "grad_norm": 1.8302204608917236, "learning_rate": 8.571636652542373e-06, "loss": 1.4452, "mean_token_accuracy": 0.6619445905089378, "num_tokens": 8679175.0, "step": 10788 }, { "epoch": 2.857521186440678, "grad_norm": 1.2557387351989746, "learning_rate": 8.5713718220339e-06, "loss": 0.764, "mean_token_accuracy": 0.8014320582151413, "num_tokens": 8681090.0, "step": 10790 }, { "epoch": 2.858050847457627, "grad_norm": 1.9092514514923096, "learning_rate": 8.571106991525424e-06, "loss": 1.5982, "mean_token_accuracy": 0.665063388645649, "num_tokens": 8682700.0, "step": 10792 }, { "epoch": 2.858580508474576, "grad_norm": 1.8069292306900024, "learning_rate": 8.570842161016951e-06, "loss": 1.3384, "mean_token_accuracy": 0.7093072086572647, "num_tokens": 8684148.0, "step": 10794 }, { "epoch": 2.8591101694915255, "grad_norm": 1.8130072355270386, "learning_rate": 8.570577330508474e-06, "loss": 1.663, "mean_token_accuracy": 0.648804560303688, "num_tokens": 8685827.0, "step": 10796 }, { "epoch": 2.8596398305084745, "grad_norm": 1.7344447374343872, "learning_rate": 8.5703125e-06, "loss": 1.2405, "mean_token_accuracy": 0.7142388075590134, "num_tokens": 8687367.0, "step": 10798 }, { "epoch": 2.860169491525424, "grad_norm": 1.5459139347076416, "learning_rate": 8.570047669491526e-06, "loss": 1.1564, "mean_token_accuracy": 0.7138645648956299, "num_tokens": 8689115.0, "step": 10800 }, { "epoch": 2.860699152542373, "grad_norm": 1.6078850030899048, "learning_rate": 8.569782838983052e-06, "loss": 0.9725, "mean_token_accuracy": 0.7604802213609219, "num_tokens": 8690782.0, "step": 10802 }, { "epoch": 2.861228813559322, "grad_norm": 1.3662387132644653, "learning_rate": 8.569518008474577e-06, "loss": 1.2124, "mean_token_accuracy": 0.709532730281353, "num_tokens": 8692440.0, "step": 10804 }, { "epoch": 2.861758474576271, "grad_norm": 1.7893006801605225, "learning_rate": 8.569253177966102e-06, "loss": 1.7618, "mean_token_accuracy": 0.5975250825285912, "num_tokens": 8694251.0, "step": 10806 }, { "epoch": 2.8622881355932206, "grad_norm": 1.8616975545883179, "learning_rate": 8.568988347457627e-06, "loss": 1.3249, "mean_token_accuracy": 0.700500376522541, "num_tokens": 8695713.0, "step": 10808 }, { "epoch": 2.8628177966101696, "grad_norm": 1.6147667169570923, "learning_rate": 8.568723516949154e-06, "loss": 1.5793, "mean_token_accuracy": 0.6337868422269821, "num_tokens": 8697369.0, "step": 10810 }, { "epoch": 2.8633474576271185, "grad_norm": 1.58595609664917, "learning_rate": 8.568458686440679e-06, "loss": 1.4537, "mean_token_accuracy": 0.6711708679795265, "num_tokens": 8699303.0, "step": 10812 }, { "epoch": 2.863877118644068, "grad_norm": 1.6647688150405884, "learning_rate": 8.568193855932204e-06, "loss": 1.2486, "mean_token_accuracy": 0.7129455357789993, "num_tokens": 8700737.0, "step": 10814 }, { "epoch": 2.864406779661017, "grad_norm": 2.410896062850952, "learning_rate": 8.567929025423729e-06, "loss": 1.3426, "mean_token_accuracy": 0.6985915005207062, "num_tokens": 8702323.0, "step": 10816 }, { "epoch": 2.8649364406779663, "grad_norm": 1.800171136856079, "learning_rate": 8.567664194915255e-06, "loss": 1.2502, "mean_token_accuracy": 0.6919736862182617, "num_tokens": 8704064.0, "step": 10818 }, { "epoch": 2.8654661016949152, "grad_norm": 1.8819043636322021, "learning_rate": 8.56739936440678e-06, "loss": 1.1516, "mean_token_accuracy": 0.7428476512432098, "num_tokens": 8705417.0, "step": 10820 }, { "epoch": 2.865995762711864, "grad_norm": 1.6396958827972412, "learning_rate": 8.567134533898305e-06, "loss": 1.2299, "mean_token_accuracy": 0.6964505463838577, "num_tokens": 8707168.0, "step": 10822 }, { "epoch": 2.8665254237288136, "grad_norm": 1.8185538053512573, "learning_rate": 8.56686970338983e-06, "loss": 1.7122, "mean_token_accuracy": 0.6511615961790085, "num_tokens": 8708709.0, "step": 10824 }, { "epoch": 2.867055084745763, "grad_norm": 1.7775770425796509, "learning_rate": 8.566604872881357e-06, "loss": 1.254, "mean_token_accuracy": 0.7060649618506432, "num_tokens": 8710306.0, "step": 10826 }, { "epoch": 2.867584745762712, "grad_norm": 1.766649842262268, "learning_rate": 8.566340042372882e-06, "loss": 1.7285, "mean_token_accuracy": 0.61187694221735, "num_tokens": 8712009.0, "step": 10828 }, { "epoch": 2.868114406779661, "grad_norm": 1.3728687763214111, "learning_rate": 8.566075211864408e-06, "loss": 1.0298, "mean_token_accuracy": 0.7827984988689423, "num_tokens": 8713508.0, "step": 10830 }, { "epoch": 2.8686440677966103, "grad_norm": 2.0059378147125244, "learning_rate": 8.565810381355933e-06, "loss": 1.4322, "mean_token_accuracy": 0.6783008873462677, "num_tokens": 8715000.0, "step": 10832 }, { "epoch": 2.8691737288135593, "grad_norm": 1.4036083221435547, "learning_rate": 8.565545550847458e-06, "loss": 1.2154, "mean_token_accuracy": 0.7177482843399048, "num_tokens": 8716654.0, "step": 10834 }, { "epoch": 2.8697033898305087, "grad_norm": 1.6960878372192383, "learning_rate": 8.565280720338985e-06, "loss": 1.2892, "mean_token_accuracy": 0.7178556099534035, "num_tokens": 8718213.0, "step": 10836 }, { "epoch": 2.8702330508474576, "grad_norm": 1.2325239181518555, "learning_rate": 8.56501588983051e-06, "loss": 1.0028, "mean_token_accuracy": 0.7203871086239815, "num_tokens": 8720352.0, "step": 10838 }, { "epoch": 2.8707627118644066, "grad_norm": 1.3881579637527466, "learning_rate": 8.564751059322034e-06, "loss": 1.4185, "mean_token_accuracy": 0.6597358658909798, "num_tokens": 8722380.0, "step": 10840 }, { "epoch": 2.871292372881356, "grad_norm": 2.2788543701171875, "learning_rate": 8.56448622881356e-06, "loss": 1.408, "mean_token_accuracy": 0.6707604005932808, "num_tokens": 8723552.0, "step": 10842 }, { "epoch": 2.871822033898305, "grad_norm": 1.6956003904342651, "learning_rate": 8.564221398305086e-06, "loss": 1.1318, "mean_token_accuracy": 0.7243356630206108, "num_tokens": 8724983.0, "step": 10844 }, { "epoch": 2.8723516949152543, "grad_norm": 1.631677508354187, "learning_rate": 8.563956567796611e-06, "loss": 1.3123, "mean_token_accuracy": 0.7061871141195297, "num_tokens": 8726445.0, "step": 10846 }, { "epoch": 2.8728813559322033, "grad_norm": 1.6886072158813477, "learning_rate": 8.563691737288138e-06, "loss": 1.3679, "mean_token_accuracy": 0.6897560134530067, "num_tokens": 8728236.0, "step": 10848 }, { "epoch": 2.8734110169491527, "grad_norm": 1.6508868932724, "learning_rate": 8.56342690677966e-06, "loss": 1.3649, "mean_token_accuracy": 0.7007014080882072, "num_tokens": 8729793.0, "step": 10850 }, { "epoch": 2.8739406779661016, "grad_norm": 1.7004088163375854, "learning_rate": 8.563162076271187e-06, "loss": 1.5637, "mean_token_accuracy": 0.6661757826805115, "num_tokens": 8731237.0, "step": 10852 }, { "epoch": 2.874470338983051, "grad_norm": 1.7557262182235718, "learning_rate": 8.562897245762712e-06, "loss": 1.2022, "mean_token_accuracy": 0.7177650183439255, "num_tokens": 8732877.0, "step": 10854 }, { "epoch": 2.875, "grad_norm": 1.9229377508163452, "learning_rate": 8.562632415254239e-06, "loss": 1.3972, "mean_token_accuracy": 0.6895378232002258, "num_tokens": 8734599.0, "step": 10856 }, { "epoch": 2.875529661016949, "grad_norm": 1.366172432899475, "learning_rate": 8.562367584745764e-06, "loss": 1.1923, "mean_token_accuracy": 0.7235536724328995, "num_tokens": 8736193.0, "step": 10858 }, { "epoch": 2.8760593220338984, "grad_norm": 1.1266065835952759, "learning_rate": 8.562102754237289e-06, "loss": 0.8769, "mean_token_accuracy": 0.7700311616063118, "num_tokens": 8738769.0, "step": 10860 }, { "epoch": 2.8765889830508473, "grad_norm": 1.5702407360076904, "learning_rate": 8.561837923728814e-06, "loss": 1.1307, "mean_token_accuracy": 0.7243494391441345, "num_tokens": 8740580.0, "step": 10862 }, { "epoch": 2.8771186440677967, "grad_norm": 1.8243281841278076, "learning_rate": 8.56157309322034e-06, "loss": 1.7571, "mean_token_accuracy": 0.6444510109722614, "num_tokens": 8742142.0, "step": 10864 }, { "epoch": 2.8776483050847457, "grad_norm": 1.5765920877456665, "learning_rate": 8.561308262711865e-06, "loss": 1.0148, "mean_token_accuracy": 0.7470299154520035, "num_tokens": 8743483.0, "step": 10866 }, { "epoch": 2.878177966101695, "grad_norm": 1.6097412109375, "learning_rate": 8.56104343220339e-06, "loss": 1.8157, "mean_token_accuracy": 0.6143028661608696, "num_tokens": 8745280.0, "step": 10868 }, { "epoch": 2.878707627118644, "grad_norm": 1.6631649732589722, "learning_rate": 8.560778601694915e-06, "loss": 1.4731, "mean_token_accuracy": 0.6892671585083008, "num_tokens": 8746964.0, "step": 10870 }, { "epoch": 2.8792372881355934, "grad_norm": 1.6111252307891846, "learning_rate": 8.560513771186442e-06, "loss": 1.2532, "mean_token_accuracy": 0.7231753282248974, "num_tokens": 8748860.0, "step": 10872 }, { "epoch": 2.8797669491525424, "grad_norm": 1.8664748668670654, "learning_rate": 8.560248940677967e-06, "loss": 1.4149, "mean_token_accuracy": 0.6977668479084969, "num_tokens": 8750318.0, "step": 10874 }, { "epoch": 2.8802966101694913, "grad_norm": 1.6946229934692383, "learning_rate": 8.559984110169492e-06, "loss": 1.2918, "mean_token_accuracy": 0.7038031220436096, "num_tokens": 8752104.0, "step": 10876 }, { "epoch": 2.8808262711864407, "grad_norm": 1.7426905632019043, "learning_rate": 8.559719279661017e-06, "loss": 1.1743, "mean_token_accuracy": 0.7191590219736099, "num_tokens": 8753568.0, "step": 10878 }, { "epoch": 2.8813559322033897, "grad_norm": 1.9842472076416016, "learning_rate": 8.559454449152543e-06, "loss": 1.1685, "mean_token_accuracy": 0.7258270531892776, "num_tokens": 8755305.0, "step": 10880 }, { "epoch": 2.881885593220339, "grad_norm": 1.5934122800827026, "learning_rate": 8.559189618644068e-06, "loss": 1.2678, "mean_token_accuracy": 0.720105804502964, "num_tokens": 8756745.0, "step": 10882 }, { "epoch": 2.882415254237288, "grad_norm": 1.7248977422714233, "learning_rate": 8.558924788135595e-06, "loss": 1.5785, "mean_token_accuracy": 0.6520396173000336, "num_tokens": 8758290.0, "step": 10884 }, { "epoch": 2.882944915254237, "grad_norm": 1.8532971143722534, "learning_rate": 8.55865995762712e-06, "loss": 1.2343, "mean_token_accuracy": 0.7119725421071053, "num_tokens": 8759669.0, "step": 10886 }, { "epoch": 2.8834745762711864, "grad_norm": 1.3747014999389648, "learning_rate": 8.558395127118645e-06, "loss": 0.9283, "mean_token_accuracy": 0.758670799434185, "num_tokens": 8761617.0, "step": 10888 }, { "epoch": 2.884004237288136, "grad_norm": 1.526872992515564, "learning_rate": 8.55813029661017e-06, "loss": 1.2268, "mean_token_accuracy": 0.7367947176098824, "num_tokens": 8763316.0, "step": 10890 }, { "epoch": 2.8845338983050848, "grad_norm": 1.9313169717788696, "learning_rate": 8.557865466101696e-06, "loss": 1.3345, "mean_token_accuracy": 0.7318636700510979, "num_tokens": 8764806.0, "step": 10892 }, { "epoch": 2.8850635593220337, "grad_norm": 1.8791784048080444, "learning_rate": 8.557600635593221e-06, "loss": 1.3912, "mean_token_accuracy": 0.6850984394550323, "num_tokens": 8766279.0, "step": 10894 }, { "epoch": 2.885593220338983, "grad_norm": 1.586462140083313, "learning_rate": 8.557335805084746e-06, "loss": 0.9617, "mean_token_accuracy": 0.763418510556221, "num_tokens": 8767771.0, "step": 10896 }, { "epoch": 2.886122881355932, "grad_norm": 1.8049153089523315, "learning_rate": 8.557070974576271e-06, "loss": 1.2342, "mean_token_accuracy": 0.7218340262770653, "num_tokens": 8769231.0, "step": 10898 }, { "epoch": 2.8866525423728815, "grad_norm": 1.3152450323104858, "learning_rate": 8.556806144067798e-06, "loss": 0.8352, "mean_token_accuracy": 0.7860424667596817, "num_tokens": 8770987.0, "step": 10900 }, { "epoch": 2.8871822033898304, "grad_norm": 1.7505552768707275, "learning_rate": 8.556541313559323e-06, "loss": 1.4793, "mean_token_accuracy": 0.6820105165243149, "num_tokens": 8772700.0, "step": 10902 }, { "epoch": 2.8877118644067794, "grad_norm": 1.7221018075942993, "learning_rate": 8.556276483050847e-06, "loss": 0.8369, "mean_token_accuracy": 0.7926026433706284, "num_tokens": 8774076.0, "step": 10904 }, { "epoch": 2.888241525423729, "grad_norm": 1.4162756204605103, "learning_rate": 8.556011652542372e-06, "loss": 1.3831, "mean_token_accuracy": 0.7126702293753624, "num_tokens": 8775519.0, "step": 10906 }, { "epoch": 2.888771186440678, "grad_norm": 1.4421511888504028, "learning_rate": 8.555746822033899e-06, "loss": 1.0379, "mean_token_accuracy": 0.7489799410104752, "num_tokens": 8777252.0, "step": 10908 }, { "epoch": 2.889300847457627, "grad_norm": 1.7709039449691772, "learning_rate": 8.555481991525424e-06, "loss": 1.3913, "mean_token_accuracy": 0.6823863349854946, "num_tokens": 8778725.0, "step": 10910 }, { "epoch": 2.889830508474576, "grad_norm": 1.6655949354171753, "learning_rate": 8.55521716101695e-06, "loss": 1.0745, "mean_token_accuracy": 0.7547178417444229, "num_tokens": 8780230.0, "step": 10912 }, { "epoch": 2.8903601694915255, "grad_norm": 1.7800748348236084, "learning_rate": 8.554952330508475e-06, "loss": 1.5236, "mean_token_accuracy": 0.651592068374157, "num_tokens": 8781960.0, "step": 10914 }, { "epoch": 2.8908898305084745, "grad_norm": 1.774926781654358, "learning_rate": 8.5546875e-06, "loss": 1.4069, "mean_token_accuracy": 0.6805282533168793, "num_tokens": 8783788.0, "step": 10916 }, { "epoch": 2.891419491525424, "grad_norm": 1.7874950170516968, "learning_rate": 8.554422669491527e-06, "loss": 1.5538, "mean_token_accuracy": 0.6765510961413383, "num_tokens": 8785219.0, "step": 10918 }, { "epoch": 2.891949152542373, "grad_norm": 1.6911755800247192, "learning_rate": 8.554157838983052e-06, "loss": 1.6839, "mean_token_accuracy": 0.6499990522861481, "num_tokens": 8786691.0, "step": 10920 }, { "epoch": 2.892478813559322, "grad_norm": 1.823125958442688, "learning_rate": 8.553893008474577e-06, "loss": 1.59, "mean_token_accuracy": 0.6497353687882423, "num_tokens": 8788473.0, "step": 10922 }, { "epoch": 2.893008474576271, "grad_norm": 1.784571886062622, "learning_rate": 8.553628177966102e-06, "loss": 1.3765, "mean_token_accuracy": 0.6865354925394058, "num_tokens": 8790000.0, "step": 10924 }, { "epoch": 2.8935381355932206, "grad_norm": 1.8513497114181519, "learning_rate": 8.553363347457628e-06, "loss": 1.2967, "mean_token_accuracy": 0.7127895429730415, "num_tokens": 8791625.0, "step": 10926 }, { "epoch": 2.8940677966101696, "grad_norm": 1.4463403224945068, "learning_rate": 8.553098516949153e-06, "loss": 0.9933, "mean_token_accuracy": 0.7511327043175697, "num_tokens": 8793270.0, "step": 10928 }, { "epoch": 2.8945974576271185, "grad_norm": 1.6802836656570435, "learning_rate": 8.552833686440678e-06, "loss": 1.1303, "mean_token_accuracy": 0.7564579546451569, "num_tokens": 8794665.0, "step": 10930 }, { "epoch": 2.895127118644068, "grad_norm": 1.533522605895996, "learning_rate": 8.552568855932203e-06, "loss": 1.6674, "mean_token_accuracy": 0.6640731021761894, "num_tokens": 8796339.0, "step": 10932 }, { "epoch": 2.895656779661017, "grad_norm": 1.7683411836624146, "learning_rate": 8.55230402542373e-06, "loss": 1.032, "mean_token_accuracy": 0.7327115312218666, "num_tokens": 8797726.0, "step": 10934 }, { "epoch": 2.8961864406779663, "grad_norm": 1.7489420175552368, "learning_rate": 8.552039194915255e-06, "loss": 1.2931, "mean_token_accuracy": 0.7262837290763855, "num_tokens": 8799270.0, "step": 10936 }, { "epoch": 2.8967161016949152, "grad_norm": 1.7902642488479614, "learning_rate": 8.551774364406781e-06, "loss": 1.1598, "mean_token_accuracy": 0.7162776291370392, "num_tokens": 8800856.0, "step": 10938 }, { "epoch": 2.897245762711864, "grad_norm": 1.88307785987854, "learning_rate": 8.551509533898306e-06, "loss": 1.1419, "mean_token_accuracy": 0.7418464720249176, "num_tokens": 8802313.0, "step": 10940 }, { "epoch": 2.8977754237288136, "grad_norm": 1.6705121994018555, "learning_rate": 8.551244703389831e-06, "loss": 1.162, "mean_token_accuracy": 0.7162549868226051, "num_tokens": 8803893.0, "step": 10942 }, { "epoch": 2.898305084745763, "grad_norm": 1.6829237937927246, "learning_rate": 8.550979872881356e-06, "loss": 1.2724, "mean_token_accuracy": 0.6991873010993004, "num_tokens": 8805381.0, "step": 10944 }, { "epoch": 2.898834745762712, "grad_norm": 1.5508410930633545, "learning_rate": 8.550715042372883e-06, "loss": 1.4773, "mean_token_accuracy": 0.6727242991328239, "num_tokens": 8807303.0, "step": 10946 }, { "epoch": 2.899364406779661, "grad_norm": 1.7093884944915771, "learning_rate": 8.550450211864408e-06, "loss": 1.3049, "mean_token_accuracy": 0.7200841084122658, "num_tokens": 8808689.0, "step": 10948 }, { "epoch": 2.8998940677966103, "grad_norm": 1.8717851638793945, "learning_rate": 8.550185381355933e-06, "loss": 1.5595, "mean_token_accuracy": 0.6801642030477524, "num_tokens": 8810236.0, "step": 10950 }, { "epoch": 2.9004237288135593, "grad_norm": 1.8912972211837769, "learning_rate": 8.549920550847458e-06, "loss": 0.9525, "mean_token_accuracy": 0.7630276530981064, "num_tokens": 8811514.0, "step": 10952 }, { "epoch": 2.9009533898305087, "grad_norm": 1.806347131729126, "learning_rate": 8.549655720338984e-06, "loss": 1.3505, "mean_token_accuracy": 0.707168847322464, "num_tokens": 8813127.0, "step": 10954 }, { "epoch": 2.9014830508474576, "grad_norm": 1.7687238454818726, "learning_rate": 8.54939088983051e-06, "loss": 1.6737, "mean_token_accuracy": 0.6267415285110474, "num_tokens": 8814812.0, "step": 10956 }, { "epoch": 2.9020127118644066, "grad_norm": 1.4907711744308472, "learning_rate": 8.549126059322034e-06, "loss": 1.3879, "mean_token_accuracy": 0.6782440021634102, "num_tokens": 8816426.0, "step": 10958 }, { "epoch": 2.902542372881356, "grad_norm": 1.5641473531723022, "learning_rate": 8.548861228813559e-06, "loss": 1.2845, "mean_token_accuracy": 0.6955735385417938, "num_tokens": 8817856.0, "step": 10960 }, { "epoch": 2.903072033898305, "grad_norm": 1.8849095106124878, "learning_rate": 8.548596398305086e-06, "loss": 1.6351, "mean_token_accuracy": 0.6570460870862007, "num_tokens": 8819294.0, "step": 10962 }, { "epoch": 2.9036016949152543, "grad_norm": 1.6632969379425049, "learning_rate": 8.54833156779661e-06, "loss": 1.4085, "mean_token_accuracy": 0.6921520531177521, "num_tokens": 8821013.0, "step": 10964 }, { "epoch": 2.9041313559322033, "grad_norm": 1.435954213142395, "learning_rate": 8.548066737288137e-06, "loss": 1.0316, "mean_token_accuracy": 0.7320942506194115, "num_tokens": 8822575.0, "step": 10966 }, { "epoch": 2.9046610169491527, "grad_norm": 1.5715763568878174, "learning_rate": 8.547801906779662e-06, "loss": 1.1226, "mean_token_accuracy": 0.7124052792787552, "num_tokens": 8824432.0, "step": 10968 }, { "epoch": 2.9051906779661016, "grad_norm": 1.7033144235610962, "learning_rate": 8.547537076271187e-06, "loss": 1.5847, "mean_token_accuracy": 0.6375257596373558, "num_tokens": 8826251.0, "step": 10970 }, { "epoch": 2.905720338983051, "grad_norm": 2.2441000938415527, "learning_rate": 8.547272245762712e-06, "loss": 1.2328, "mean_token_accuracy": 0.7266024947166443, "num_tokens": 8827762.0, "step": 10972 }, { "epoch": 2.90625, "grad_norm": 1.5010651350021362, "learning_rate": 8.547007415254239e-06, "loss": 1.3858, "mean_token_accuracy": 0.6867693364620209, "num_tokens": 8829469.0, "step": 10974 }, { "epoch": 2.906779661016949, "grad_norm": 1.5417934656143188, "learning_rate": 8.546742584745764e-06, "loss": 1.5186, "mean_token_accuracy": 0.6644007787108421, "num_tokens": 8831352.0, "step": 10976 }, { "epoch": 2.9073093220338984, "grad_norm": 1.6136842966079712, "learning_rate": 8.546477754237288e-06, "loss": 1.1741, "mean_token_accuracy": 0.7203476428985596, "num_tokens": 8833157.0, "step": 10978 }, { "epoch": 2.9078389830508473, "grad_norm": 1.68544602394104, "learning_rate": 8.546212923728813e-06, "loss": 1.1715, "mean_token_accuracy": 0.7164175286889076, "num_tokens": 8834539.0, "step": 10980 }, { "epoch": 2.9083686440677967, "grad_norm": 1.624870777130127, "learning_rate": 8.54594809322034e-06, "loss": 1.223, "mean_token_accuracy": 0.7196910157799721, "num_tokens": 8836170.0, "step": 10982 }, { "epoch": 2.9088983050847457, "grad_norm": 1.8539550304412842, "learning_rate": 8.545683262711865e-06, "loss": 1.7112, "mean_token_accuracy": 0.6294004693627357, "num_tokens": 8837785.0, "step": 10984 }, { "epoch": 2.909427966101695, "grad_norm": 1.600744605064392, "learning_rate": 8.54541843220339e-06, "loss": 1.1032, "mean_token_accuracy": 0.7331735268235207, "num_tokens": 8839200.0, "step": 10986 }, { "epoch": 2.909957627118644, "grad_norm": 1.6020917892456055, "learning_rate": 8.545153601694915e-06, "loss": 1.2689, "mean_token_accuracy": 0.7045271247625351, "num_tokens": 8840699.0, "step": 10988 }, { "epoch": 2.9104872881355934, "grad_norm": 1.6639537811279297, "learning_rate": 8.544888771186441e-06, "loss": 1.2609, "mean_token_accuracy": 0.7239251285791397, "num_tokens": 8842338.0, "step": 10990 }, { "epoch": 2.9110169491525424, "grad_norm": 1.2382620573043823, "learning_rate": 8.544623940677966e-06, "loss": 1.1666, "mean_token_accuracy": 0.7199603766202927, "num_tokens": 8844219.0, "step": 10992 }, { "epoch": 2.9115466101694913, "grad_norm": 1.4370694160461426, "learning_rate": 8.544359110169493e-06, "loss": 1.5815, "mean_token_accuracy": 0.6686442047357559, "num_tokens": 8845944.0, "step": 10994 }, { "epoch": 2.9120762711864407, "grad_norm": 1.6779168844223022, "learning_rate": 8.544094279661018e-06, "loss": 1.167, "mean_token_accuracy": 0.7439001202583313, "num_tokens": 8847577.0, "step": 10996 }, { "epoch": 2.9126059322033897, "grad_norm": 1.7031757831573486, "learning_rate": 8.543829449152543e-06, "loss": 1.5152, "mean_token_accuracy": 0.674115501344204, "num_tokens": 8849327.0, "step": 10998 }, { "epoch": 2.913135593220339, "grad_norm": 1.8174781799316406, "learning_rate": 8.543564618644068e-06, "loss": 1.4032, "step": 11000 }, { "epoch": 2.913135593220339, "eval_loss": 1.3081916570663452, "eval_mean_token_accuracy": 0.7014487353818757, "eval_num_tokens": 8850974.0, "eval_runtime": 48.1662, "eval_samples_per_second": 6.395, "eval_steps_per_second": 6.395, "step": 11000 }, { "epoch": 2.913665254237288, "grad_norm": 1.6783939599990845, "learning_rate": 8.543299788135594e-06, "loss": 1.635, "mean_token_accuracy": 0.6722212769091129, "num_tokens": 8852702.0, "step": 11002 }, { "epoch": 2.914194915254237, "grad_norm": 1.7485861778259277, "learning_rate": 8.54303495762712e-06, "loss": 1.2971, "mean_token_accuracy": 0.7131061106920242, "num_tokens": 8854366.0, "step": 11004 }, { "epoch": 2.9147245762711864, "grad_norm": 1.654500961303711, "learning_rate": 8.542770127118644e-06, "loss": 1.2888, "mean_token_accuracy": 0.697282686829567, "num_tokens": 8856095.0, "step": 11006 }, { "epoch": 2.915254237288136, "grad_norm": 1.504971981048584, "learning_rate": 8.542505296610171e-06, "loss": 1.0569, "mean_token_accuracy": 0.7269118428230286, "num_tokens": 8858670.0, "step": 11008 }, { "epoch": 2.9157838983050848, "grad_norm": 1.2872095108032227, "learning_rate": 8.542240466101696e-06, "loss": 1.5116, "mean_token_accuracy": 0.6430501490831375, "num_tokens": 8860644.0, "step": 11010 }, { "epoch": 2.9163135593220337, "grad_norm": 2.551292896270752, "learning_rate": 8.54197563559322e-06, "loss": 1.1964, "mean_token_accuracy": 0.7192088142037392, "num_tokens": 8861960.0, "step": 11012 }, { "epoch": 2.916843220338983, "grad_norm": 1.8087292909622192, "learning_rate": 8.541710805084746e-06, "loss": 1.1944, "mean_token_accuracy": 0.718289352953434, "num_tokens": 8863386.0, "step": 11014 }, { "epoch": 2.917372881355932, "grad_norm": 1.9355705976486206, "learning_rate": 8.541445974576272e-06, "loss": 1.3586, "mean_token_accuracy": 0.7059256881475449, "num_tokens": 8864809.0, "step": 11016 }, { "epoch": 2.9179025423728815, "grad_norm": 1.259620189666748, "learning_rate": 8.541181144067797e-06, "loss": 0.8081, "mean_token_accuracy": 0.7807729244232178, "num_tokens": 8866807.0, "step": 11018 }, { "epoch": 2.9184322033898304, "grad_norm": 1.3738123178482056, "learning_rate": 8.540916313559324e-06, "loss": 0.9347, "mean_token_accuracy": 0.7634151577949524, "num_tokens": 8868245.0, "step": 11020 }, { "epoch": 2.9189618644067794, "grad_norm": 1.4236247539520264, "learning_rate": 8.540651483050849e-06, "loss": 1.0962, "mean_token_accuracy": 0.7162895351648331, "num_tokens": 8869923.0, "step": 11022 }, { "epoch": 2.919491525423729, "grad_norm": 1.533440113067627, "learning_rate": 8.540386652542374e-06, "loss": 1.47, "mean_token_accuracy": 0.6867110729217529, "num_tokens": 8871883.0, "step": 11024 }, { "epoch": 2.920021186440678, "grad_norm": 1.803815245628357, "learning_rate": 8.540121822033899e-06, "loss": 1.4655, "mean_token_accuracy": 0.6668088883161545, "num_tokens": 8873425.0, "step": 11026 }, { "epoch": 2.920550847457627, "grad_norm": 1.6396466493606567, "learning_rate": 8.539856991525425e-06, "loss": 1.1982, "mean_token_accuracy": 0.7216385714709759, "num_tokens": 8875060.0, "step": 11028 }, { "epoch": 2.921080508474576, "grad_norm": 1.719956874847412, "learning_rate": 8.53959216101695e-06, "loss": 1.2518, "mean_token_accuracy": 0.7246228829026222, "num_tokens": 8876601.0, "step": 11030 }, { "epoch": 2.9216101694915255, "grad_norm": 1.7771985530853271, "learning_rate": 8.539327330508475e-06, "loss": 1.1509, "mean_token_accuracy": 0.7369973137974739, "num_tokens": 8878091.0, "step": 11032 }, { "epoch": 2.9221398305084745, "grad_norm": 1.536490797996521, "learning_rate": 8.5390625e-06, "loss": 1.0804, "mean_token_accuracy": 0.7350741103291512, "num_tokens": 8879752.0, "step": 11034 }, { "epoch": 2.922669491525424, "grad_norm": 1.8321025371551514, "learning_rate": 8.538797669491527e-06, "loss": 1.1573, "mean_token_accuracy": 0.7253996357321739, "num_tokens": 8881196.0, "step": 11036 }, { "epoch": 2.923199152542373, "grad_norm": 1.63969886302948, "learning_rate": 8.538532838983052e-06, "loss": 1.783, "mean_token_accuracy": 0.6027919687330723, "num_tokens": 8882839.0, "step": 11038 }, { "epoch": 2.923728813559322, "grad_norm": 1.8586313724517822, "learning_rate": 8.538268008474577e-06, "loss": 1.143, "mean_token_accuracy": 0.7271187603473663, "num_tokens": 8884207.0, "step": 11040 }, { "epoch": 2.924258474576271, "grad_norm": 1.8236216306686401, "learning_rate": 8.538003177966101e-06, "loss": 1.2916, "mean_token_accuracy": 0.7068982273340225, "num_tokens": 8885688.0, "step": 11042 }, { "epoch": 2.9247881355932206, "grad_norm": 1.6862872838974, "learning_rate": 8.537738347457628e-06, "loss": 1.4216, "mean_token_accuracy": 0.6901895180344582, "num_tokens": 8887182.0, "step": 11044 }, { "epoch": 2.9253177966101696, "grad_norm": 1.757314682006836, "learning_rate": 8.537473516949153e-06, "loss": 1.7754, "mean_token_accuracy": 0.6180834770202637, "num_tokens": 8888928.0, "step": 11046 }, { "epoch": 2.9258474576271185, "grad_norm": 1.8486248254776, "learning_rate": 8.53720868644068e-06, "loss": 0.9394, "mean_token_accuracy": 0.7733684405684471, "num_tokens": 8890411.0, "step": 11048 }, { "epoch": 2.926377118644068, "grad_norm": 1.655042290687561, "learning_rate": 8.536943855932205e-06, "loss": 1.4056, "mean_token_accuracy": 0.6845411509275436, "num_tokens": 8892122.0, "step": 11050 }, { "epoch": 2.926906779661017, "grad_norm": 2.127609968185425, "learning_rate": 8.53667902542373e-06, "loss": 1.2897, "mean_token_accuracy": 0.7058233246207237, "num_tokens": 8893518.0, "step": 11052 }, { "epoch": 2.9274364406779663, "grad_norm": 1.9719200134277344, "learning_rate": 8.536414194915254e-06, "loss": 1.4028, "mean_token_accuracy": 0.6883885189890862, "num_tokens": 8894858.0, "step": 11054 }, { "epoch": 2.9279661016949152, "grad_norm": 2.104463577270508, "learning_rate": 8.536149364406781e-06, "loss": 1.4339, "mean_token_accuracy": 0.6965419910848141, "num_tokens": 8896381.0, "step": 11056 }, { "epoch": 2.928495762711864, "grad_norm": 1.7647545337677002, "learning_rate": 8.535884533898306e-06, "loss": 1.6198, "mean_token_accuracy": 0.6680653616786003, "num_tokens": 8897738.0, "step": 11058 }, { "epoch": 2.9290254237288136, "grad_norm": 1.8199694156646729, "learning_rate": 8.535619703389831e-06, "loss": 1.6035, "mean_token_accuracy": 0.6435888856649399, "num_tokens": 8899453.0, "step": 11060 }, { "epoch": 2.929555084745763, "grad_norm": 1.8817559480667114, "learning_rate": 8.535354872881356e-06, "loss": 1.4064, "mean_token_accuracy": 0.6756678223609924, "num_tokens": 8900977.0, "step": 11062 }, { "epoch": 2.930084745762712, "grad_norm": 1.6438724994659424, "learning_rate": 8.535090042372882e-06, "loss": 1.4575, "mean_token_accuracy": 0.6851635500788689, "num_tokens": 8902656.0, "step": 11064 }, { "epoch": 2.930614406779661, "grad_norm": 1.6137735843658447, "learning_rate": 8.534825211864407e-06, "loss": 0.8426, "mean_token_accuracy": 0.7839569225907326, "num_tokens": 8904227.0, "step": 11066 }, { "epoch": 2.9311440677966103, "grad_norm": 1.8100214004516602, "learning_rate": 8.534560381355932e-06, "loss": 1.3661, "mean_token_accuracy": 0.6957317143678665, "num_tokens": 8905554.0, "step": 11068 }, { "epoch": 2.9316737288135593, "grad_norm": 1.6394355297088623, "learning_rate": 8.534295550847457e-06, "loss": 1.5143, "mean_token_accuracy": 0.6733928322792053, "num_tokens": 8907161.0, "step": 11070 }, { "epoch": 2.9322033898305087, "grad_norm": 1.2414462566375732, "learning_rate": 8.534030720338984e-06, "loss": 1.173, "mean_token_accuracy": 0.7261438518762589, "num_tokens": 8908871.0, "step": 11072 }, { "epoch": 2.9327330508474576, "grad_norm": 1.5372354984283447, "learning_rate": 8.533765889830509e-06, "loss": 1.2371, "mean_token_accuracy": 0.7092374786734581, "num_tokens": 8910762.0, "step": 11074 }, { "epoch": 2.9332627118644066, "grad_norm": 1.8715150356292725, "learning_rate": 8.533501059322035e-06, "loss": 1.5939, "mean_token_accuracy": 0.6461642682552338, "num_tokens": 8912334.0, "step": 11076 }, { "epoch": 2.933792372881356, "grad_norm": 1.8681901693344116, "learning_rate": 8.53323622881356e-06, "loss": 1.3626, "mean_token_accuracy": 0.6918838694691658, "num_tokens": 8913951.0, "step": 11078 }, { "epoch": 2.934322033898305, "grad_norm": 1.8061673641204834, "learning_rate": 8.532971398305085e-06, "loss": 1.2627, "mean_token_accuracy": 0.699641115963459, "num_tokens": 8915373.0, "step": 11080 }, { "epoch": 2.9348516949152543, "grad_norm": 1.6877919435501099, "learning_rate": 8.53270656779661e-06, "loss": 1.1981, "mean_token_accuracy": 0.729389064013958, "num_tokens": 8916964.0, "step": 11082 }, { "epoch": 2.9353813559322033, "grad_norm": 1.3680673837661743, "learning_rate": 8.532441737288137e-06, "loss": 1.3567, "mean_token_accuracy": 0.692945659160614, "num_tokens": 8918830.0, "step": 11084 }, { "epoch": 2.9359110169491527, "grad_norm": 2.245601177215576, "learning_rate": 8.532176906779662e-06, "loss": 1.887, "mean_token_accuracy": 0.6205647885799408, "num_tokens": 8920236.0, "step": 11086 }, { "epoch": 2.9364406779661016, "grad_norm": 1.3135435581207275, "learning_rate": 8.531912076271187e-06, "loss": 1.2778, "mean_token_accuracy": 0.7097836211323738, "num_tokens": 8921792.0, "step": 11088 }, { "epoch": 2.936970338983051, "grad_norm": 1.7892861366271973, "learning_rate": 8.531647245762713e-06, "loss": 1.2339, "mean_token_accuracy": 0.7329332232475281, "num_tokens": 8923319.0, "step": 11090 }, { "epoch": 2.9375, "grad_norm": 1.3121718168258667, "learning_rate": 8.531382415254238e-06, "loss": 1.0219, "mean_token_accuracy": 0.7455035969614983, "num_tokens": 8924881.0, "step": 11092 }, { "epoch": 2.938029661016949, "grad_norm": 1.4394984245300293, "learning_rate": 8.531117584745763e-06, "loss": 0.9071, "mean_token_accuracy": 0.7590076103806496, "num_tokens": 8926479.0, "step": 11094 }, { "epoch": 2.9385593220338984, "grad_norm": 1.3401919603347778, "learning_rate": 8.530852754237288e-06, "loss": 0.9502, "mean_token_accuracy": 0.7786399722099304, "num_tokens": 8927945.0, "step": 11096 }, { "epoch": 2.9390889830508473, "grad_norm": 1.6655267477035522, "learning_rate": 8.530587923728815e-06, "loss": 1.1518, "mean_token_accuracy": 0.7395713925361633, "num_tokens": 8929966.0, "step": 11098 }, { "epoch": 2.9396186440677967, "grad_norm": 2.141650676727295, "learning_rate": 8.53032309322034e-06, "loss": 1.5879, "mean_token_accuracy": 0.6540888994932175, "num_tokens": 8931520.0, "step": 11100 }, { "epoch": 2.9401483050847457, "grad_norm": 1.8347798585891724, "learning_rate": 8.530058262711866e-06, "loss": 1.3368, "mean_token_accuracy": 0.6996857970952988, "num_tokens": 8933082.0, "step": 11102 }, { "epoch": 2.940677966101695, "grad_norm": 1.7510428428649902, "learning_rate": 8.529793432203391e-06, "loss": 1.2713, "mean_token_accuracy": 0.6898874416947365, "num_tokens": 8934598.0, "step": 11104 }, { "epoch": 2.941207627118644, "grad_norm": 1.6510732173919678, "learning_rate": 8.529528601694916e-06, "loss": 1.5575, "mean_token_accuracy": 0.6457070335745811, "num_tokens": 8936206.0, "step": 11106 }, { "epoch": 2.9417372881355934, "grad_norm": 1.8436875343322754, "learning_rate": 8.529263771186441e-06, "loss": 1.4284, "mean_token_accuracy": 0.6696493253111839, "num_tokens": 8937808.0, "step": 11108 }, { "epoch": 2.9422669491525424, "grad_norm": 1.677042007446289, "learning_rate": 8.528998940677968e-06, "loss": 1.4922, "mean_token_accuracy": 0.6439766883850098, "num_tokens": 8939512.0, "step": 11110 }, { "epoch": 2.9427966101694913, "grad_norm": 1.428797721862793, "learning_rate": 8.528734110169493e-06, "loss": 1.4147, "mean_token_accuracy": 0.6759375929832458, "num_tokens": 8941234.0, "step": 11112 }, { "epoch": 2.9433262711864407, "grad_norm": 1.5308568477630615, "learning_rate": 8.528469279661018e-06, "loss": 1.0828, "mean_token_accuracy": 0.7502389997243881, "num_tokens": 8942967.0, "step": 11114 }, { "epoch": 2.9438559322033897, "grad_norm": 1.7065988779067993, "learning_rate": 8.528204449152542e-06, "loss": 1.3634, "mean_token_accuracy": 0.6976565346121788, "num_tokens": 8944649.0, "step": 11116 }, { "epoch": 2.944385593220339, "grad_norm": 1.664855718612671, "learning_rate": 8.527939618644069e-06, "loss": 1.1073, "mean_token_accuracy": 0.746721900999546, "num_tokens": 8946468.0, "step": 11118 }, { "epoch": 2.944915254237288, "grad_norm": 1.8595730066299438, "learning_rate": 8.527674788135594e-06, "loss": 1.5641, "mean_token_accuracy": 0.6659096926450729, "num_tokens": 8948228.0, "step": 11120 }, { "epoch": 2.945444915254237, "grad_norm": 1.7543261051177979, "learning_rate": 8.527409957627119e-06, "loss": 1.3419, "mean_token_accuracy": 0.7265576645731926, "num_tokens": 8949653.0, "step": 11122 }, { "epoch": 2.9459745762711864, "grad_norm": 1.4019149541854858, "learning_rate": 8.527145127118644e-06, "loss": 1.1017, "mean_token_accuracy": 0.7220607846975327, "num_tokens": 8951268.0, "step": 11124 }, { "epoch": 2.946504237288136, "grad_norm": 2.236332654953003, "learning_rate": 8.52688029661017e-06, "loss": 1.4996, "mean_token_accuracy": 0.6527846902608871, "num_tokens": 8952923.0, "step": 11126 }, { "epoch": 2.9470338983050848, "grad_norm": 1.845641851425171, "learning_rate": 8.526615466101695e-06, "loss": 1.1649, "mean_token_accuracy": 0.7306317836046219, "num_tokens": 8954294.0, "step": 11128 }, { "epoch": 2.9475635593220337, "grad_norm": 1.9617509841918945, "learning_rate": 8.526350635593222e-06, "loss": 1.4344, "mean_token_accuracy": 0.7066177949309349, "num_tokens": 8955976.0, "step": 11130 }, { "epoch": 2.948093220338983, "grad_norm": 1.634255290031433, "learning_rate": 8.526085805084747e-06, "loss": 1.1308, "mean_token_accuracy": 0.7365008816123009, "num_tokens": 8957496.0, "step": 11132 }, { "epoch": 2.948622881355932, "grad_norm": 1.895765781402588, "learning_rate": 8.525820974576272e-06, "loss": 1.4491, "mean_token_accuracy": 0.6810257211327553, "num_tokens": 8958952.0, "step": 11134 }, { "epoch": 2.9491525423728815, "grad_norm": 1.6173136234283447, "learning_rate": 8.525556144067797e-06, "loss": 1.4464, "mean_token_accuracy": 0.6861440688371658, "num_tokens": 8960646.0, "step": 11136 }, { "epoch": 2.9496822033898304, "grad_norm": 1.7429866790771484, "learning_rate": 8.525291313559323e-06, "loss": 1.1134, "mean_token_accuracy": 0.719950407743454, "num_tokens": 8962404.0, "step": 11138 }, { "epoch": 2.9502118644067794, "grad_norm": 1.5287712812423706, "learning_rate": 8.525026483050848e-06, "loss": 1.2447, "mean_token_accuracy": 0.6982776001095772, "num_tokens": 8964028.0, "step": 11140 }, { "epoch": 2.950741525423729, "grad_norm": 1.5544700622558594, "learning_rate": 8.524761652542373e-06, "loss": 1.0634, "mean_token_accuracy": 0.7353954091668129, "num_tokens": 8965841.0, "step": 11142 }, { "epoch": 2.951271186440678, "grad_norm": 1.5835347175598145, "learning_rate": 8.524496822033898e-06, "loss": 1.3016, "mean_token_accuracy": 0.6912978440523148, "num_tokens": 8967654.0, "step": 11144 }, { "epoch": 2.951800847457627, "grad_norm": 1.7418193817138672, "learning_rate": 8.524231991525425e-06, "loss": 1.4406, "mean_token_accuracy": 0.6761176213622093, "num_tokens": 8969296.0, "step": 11146 }, { "epoch": 2.952330508474576, "grad_norm": 1.686265468597412, "learning_rate": 8.52396716101695e-06, "loss": 1.0291, "mean_token_accuracy": 0.7680714204907417, "num_tokens": 8970942.0, "step": 11148 }, { "epoch": 2.9528601694915255, "grad_norm": 1.7014833688735962, "learning_rate": 8.523702330508475e-06, "loss": 1.1793, "mean_token_accuracy": 0.7327971383929253, "num_tokens": 8972286.0, "step": 11150 }, { "epoch": 2.9533898305084745, "grad_norm": 2.2859487533569336, "learning_rate": 8.5234375e-06, "loss": 1.8036, "mean_token_accuracy": 0.631330631673336, "num_tokens": 8974030.0, "step": 11152 }, { "epoch": 2.953919491525424, "grad_norm": 1.774292230606079, "learning_rate": 8.523172669491526e-06, "loss": 1.1403, "mean_token_accuracy": 0.7377105951309204, "num_tokens": 8975637.0, "step": 11154 }, { "epoch": 2.954449152542373, "grad_norm": 1.6441516876220703, "learning_rate": 8.522907838983051e-06, "loss": 1.2009, "mean_token_accuracy": 0.735498696565628, "num_tokens": 8976944.0, "step": 11156 }, { "epoch": 2.954978813559322, "grad_norm": 1.8686829805374146, "learning_rate": 8.522643008474578e-06, "loss": 1.2255, "mean_token_accuracy": 0.6979768574237823, "num_tokens": 8978525.0, "step": 11158 }, { "epoch": 2.955508474576271, "grad_norm": 1.913865327835083, "learning_rate": 8.522378177966101e-06, "loss": 1.3995, "mean_token_accuracy": 0.6820885427296162, "num_tokens": 8980261.0, "step": 11160 }, { "epoch": 2.9560381355932206, "grad_norm": 1.797768235206604, "learning_rate": 8.522113347457628e-06, "loss": 1.0957, "mean_token_accuracy": 0.7520253658294678, "num_tokens": 8981624.0, "step": 11162 }, { "epoch": 2.9565677966101696, "grad_norm": 1.7221397161483765, "learning_rate": 8.521848516949153e-06, "loss": 1.1118, "mean_token_accuracy": 0.7334692850708961, "num_tokens": 8983240.0, "step": 11164 }, { "epoch": 2.9570974576271185, "grad_norm": 1.618567705154419, "learning_rate": 8.52158368644068e-06, "loss": 0.9674, "mean_token_accuracy": 0.7745762541890144, "num_tokens": 8984858.0, "step": 11166 }, { "epoch": 2.957627118644068, "grad_norm": 1.5296909809112549, "learning_rate": 8.521318855932204e-06, "loss": 1.2765, "mean_token_accuracy": 0.6975392699241638, "num_tokens": 8986625.0, "step": 11168 }, { "epoch": 2.958156779661017, "grad_norm": 1.6301143169403076, "learning_rate": 8.521054025423729e-06, "loss": 0.8781, "mean_token_accuracy": 0.7789762020111084, "num_tokens": 8988222.0, "step": 11170 }, { "epoch": 2.9586864406779663, "grad_norm": 1.3516414165496826, "learning_rate": 8.520789194915256e-06, "loss": 1.1927, "mean_token_accuracy": 0.7441639676690102, "num_tokens": 8989646.0, "step": 11172 }, { "epoch": 2.9592161016949152, "grad_norm": 1.4308899641036987, "learning_rate": 8.52052436440678e-06, "loss": 1.5481, "mean_token_accuracy": 0.6533259376883507, "num_tokens": 8991434.0, "step": 11174 }, { "epoch": 2.959745762711864, "grad_norm": 1.836085557937622, "learning_rate": 8.520259533898306e-06, "loss": 1.0528, "mean_token_accuracy": 0.7361607551574707, "num_tokens": 8992868.0, "step": 11176 }, { "epoch": 2.9602754237288136, "grad_norm": 1.5416918992996216, "learning_rate": 8.51999470338983e-06, "loss": 1.0996, "mean_token_accuracy": 0.7236415669322014, "num_tokens": 8994492.0, "step": 11178 }, { "epoch": 2.960805084745763, "grad_norm": 1.760513424873352, "learning_rate": 8.519729872881357e-06, "loss": 1.2803, "mean_token_accuracy": 0.7064646333456039, "num_tokens": 8996152.0, "step": 11180 }, { "epoch": 2.961334745762712, "grad_norm": 1.592854619026184, "learning_rate": 8.519465042372882e-06, "loss": 1.6042, "mean_token_accuracy": 0.6600301638245583, "num_tokens": 8997641.0, "step": 11182 }, { "epoch": 2.961864406779661, "grad_norm": 1.9828006029129028, "learning_rate": 8.519200211864409e-06, "loss": 1.4638, "mean_token_accuracy": 0.6886136755347252, "num_tokens": 8999100.0, "step": 11184 }, { "epoch": 2.9623940677966103, "grad_norm": 1.5876901149749756, "learning_rate": 8.518935381355934e-06, "loss": 1.4836, "mean_token_accuracy": 0.68192083761096, "num_tokens": 9000858.0, "step": 11186 }, { "epoch": 2.9629237288135593, "grad_norm": 1.769168496131897, "learning_rate": 8.518670550847459e-06, "loss": 1.1484, "mean_token_accuracy": 0.7331712543964386, "num_tokens": 9002171.0, "step": 11188 }, { "epoch": 2.9634533898305087, "grad_norm": 1.940301537513733, "learning_rate": 8.518405720338983e-06, "loss": 1.2816, "mean_token_accuracy": 0.716360978782177, "num_tokens": 9003593.0, "step": 11190 }, { "epoch": 2.9639830508474576, "grad_norm": 1.717179536819458, "learning_rate": 8.51814088983051e-06, "loss": 1.2059, "mean_token_accuracy": 0.7000933587551117, "num_tokens": 9005130.0, "step": 11192 }, { "epoch": 2.9645127118644066, "grad_norm": 1.5685136318206787, "learning_rate": 8.517876059322035e-06, "loss": 1.3464, "mean_token_accuracy": 0.7474819347262383, "num_tokens": 9006846.0, "step": 11194 }, { "epoch": 2.965042372881356, "grad_norm": 1.7463394403457642, "learning_rate": 8.51761122881356e-06, "loss": 1.0812, "mean_token_accuracy": 0.7558057680726051, "num_tokens": 9008173.0, "step": 11196 }, { "epoch": 2.965572033898305, "grad_norm": 1.4085379838943481, "learning_rate": 8.517346398305085e-06, "loss": 1.0455, "mean_token_accuracy": 0.7481512725353241, "num_tokens": 9009900.0, "step": 11198 }, { "epoch": 2.9661016949152543, "grad_norm": 1.7515203952789307, "learning_rate": 8.517081567796612e-06, "loss": 1.4922, "mean_token_accuracy": 0.6651609316468239, "num_tokens": 9011596.0, "step": 11200 }, { "epoch": 2.9666313559322033, "grad_norm": 1.8416638374328613, "learning_rate": 8.516816737288136e-06, "loss": 1.2831, "mean_token_accuracy": 0.7136927619576454, "num_tokens": 9013145.0, "step": 11202 }, { "epoch": 2.9671610169491527, "grad_norm": 1.7043927907943726, "learning_rate": 8.516551906779661e-06, "loss": 1.2934, "mean_token_accuracy": 0.7103508934378624, "num_tokens": 9014515.0, "step": 11204 }, { "epoch": 2.9676906779661016, "grad_norm": 1.8403031826019287, "learning_rate": 8.516287076271186e-06, "loss": 1.2528, "mean_token_accuracy": 0.690239742398262, "num_tokens": 9016112.0, "step": 11206 }, { "epoch": 2.968220338983051, "grad_norm": 1.8399337530136108, "learning_rate": 8.516022245762713e-06, "loss": 1.307, "mean_token_accuracy": 0.6905442401766777, "num_tokens": 9017861.0, "step": 11208 }, { "epoch": 2.96875, "grad_norm": 1.608445405960083, "learning_rate": 8.515757415254238e-06, "loss": 1.3308, "mean_token_accuracy": 0.6915441602468491, "num_tokens": 9019595.0, "step": 11210 }, { "epoch": 2.969279661016949, "grad_norm": 1.6574716567993164, "learning_rate": 8.515492584745764e-06, "loss": 1.1879, "mean_token_accuracy": 0.723751999437809, "num_tokens": 9021161.0, "step": 11212 }, { "epoch": 2.9698093220338984, "grad_norm": 2.0457868576049805, "learning_rate": 8.515227754237288e-06, "loss": 1.2703, "mean_token_accuracy": 0.7084940448403358, "num_tokens": 9022649.0, "step": 11214 }, { "epoch": 2.9703389830508473, "grad_norm": 1.929384469985962, "learning_rate": 8.514962923728814e-06, "loss": 1.3844, "mean_token_accuracy": 0.6953727751970291, "num_tokens": 9024116.0, "step": 11216 }, { "epoch": 2.9708686440677967, "grad_norm": 1.6944160461425781, "learning_rate": 8.51469809322034e-06, "loss": 1.3666, "mean_token_accuracy": 0.7015372887253761, "num_tokens": 9025720.0, "step": 11218 }, { "epoch": 2.9713983050847457, "grad_norm": 1.7763084173202515, "learning_rate": 8.514433262711866e-06, "loss": 1.1271, "mean_token_accuracy": 0.7487253025174141, "num_tokens": 9027461.0, "step": 11220 }, { "epoch": 2.971927966101695, "grad_norm": 1.4653232097625732, "learning_rate": 8.51416843220339e-06, "loss": 1.1483, "mean_token_accuracy": 0.7063740938901901, "num_tokens": 9029000.0, "step": 11222 }, { "epoch": 2.972457627118644, "grad_norm": 1.8090176582336426, "learning_rate": 8.513903601694916e-06, "loss": 1.0917, "mean_token_accuracy": 0.728751502931118, "num_tokens": 9030558.0, "step": 11224 }, { "epoch": 2.9729872881355934, "grad_norm": 1.3857357501983643, "learning_rate": 8.51363877118644e-06, "loss": 1.3253, "mean_token_accuracy": 0.6936032772064209, "num_tokens": 9032158.0, "step": 11226 }, { "epoch": 2.9735169491525424, "grad_norm": 1.6484220027923584, "learning_rate": 8.513373940677967e-06, "loss": 1.0035, "mean_token_accuracy": 0.747444860637188, "num_tokens": 9033411.0, "step": 11228 }, { "epoch": 2.9740466101694913, "grad_norm": 1.9510917663574219, "learning_rate": 8.513109110169492e-06, "loss": 1.3705, "mean_token_accuracy": 0.7001603618264198, "num_tokens": 9034958.0, "step": 11230 }, { "epoch": 2.9745762711864407, "grad_norm": 1.4782319068908691, "learning_rate": 8.512844279661017e-06, "loss": 1.4825, "mean_token_accuracy": 0.6644520834088326, "num_tokens": 9036750.0, "step": 11232 }, { "epoch": 2.9751059322033897, "grad_norm": 1.6286864280700684, "learning_rate": 8.512579449152542e-06, "loss": 0.9962, "mean_token_accuracy": 0.757927730679512, "num_tokens": 9038261.0, "step": 11234 }, { "epoch": 2.975635593220339, "grad_norm": 1.5003275871276855, "learning_rate": 8.512314618644069e-06, "loss": 1.2535, "mean_token_accuracy": 0.6945874616503716, "num_tokens": 9039935.0, "step": 11236 }, { "epoch": 2.976165254237288, "grad_norm": 1.5938969850540161, "learning_rate": 8.512049788135594e-06, "loss": 1.1564, "mean_token_accuracy": 0.7288461402058601, "num_tokens": 9041467.0, "step": 11238 }, { "epoch": 2.976694915254237, "grad_norm": 1.8993078470230103, "learning_rate": 8.51178495762712e-06, "loss": 1.2662, "mean_token_accuracy": 0.7076663821935654, "num_tokens": 9042946.0, "step": 11240 }, { "epoch": 2.9772245762711864, "grad_norm": 1.667728304862976, "learning_rate": 8.511520127118643e-06, "loss": 1.3586, "mean_token_accuracy": 0.672066405415535, "num_tokens": 9044637.0, "step": 11242 }, { "epoch": 2.977754237288136, "grad_norm": 2.06954026222229, "learning_rate": 8.51125529661017e-06, "loss": 1.4576, "mean_token_accuracy": 0.6686391085386276, "num_tokens": 9046034.0, "step": 11244 }, { "epoch": 2.9782838983050848, "grad_norm": 1.9646955728530884, "learning_rate": 8.510990466101695e-06, "loss": 1.4987, "mean_token_accuracy": 0.6692437008023262, "num_tokens": 9047627.0, "step": 11246 }, { "epoch": 2.9788135593220337, "grad_norm": 1.6937801837921143, "learning_rate": 8.510725635593222e-06, "loss": 1.8009, "mean_token_accuracy": 0.6016033887863159, "num_tokens": 9049485.0, "step": 11248 }, { "epoch": 2.979343220338983, "grad_norm": 1.7536197900772095, "learning_rate": 8.510460805084747e-06, "loss": 1.2292, "step": 11250 }, { "epoch": 2.979343220338983, "eval_loss": 1.3084464073181152, "eval_mean_token_accuracy": 0.7013561462427115, "eval_num_tokens": 9051150.0, "eval_runtime": 48.4071, "eval_samples_per_second": 6.363, "eval_steps_per_second": 6.363, "step": 11250 }, { "epoch": 2.979872881355932, "grad_norm": 2.064781427383423, "learning_rate": 8.510195974576272e-06, "loss": 1.3848, "mean_token_accuracy": 0.6994193959981203, "num_tokens": 9052533.0, "step": 11252 }, { "epoch": 2.9804025423728815, "grad_norm": 1.3939980268478394, "learning_rate": 8.509931144067798e-06, "loss": 1.5558, "mean_token_accuracy": 0.6580117121338844, "num_tokens": 9054654.0, "step": 11254 }, { "epoch": 2.9809322033898304, "grad_norm": 2.0042731761932373, "learning_rate": 8.509666313559323e-06, "loss": 0.9708, "mean_token_accuracy": 0.7655400484800339, "num_tokens": 9056238.0, "step": 11256 }, { "epoch": 2.9814618644067794, "grad_norm": 1.5958036184310913, "learning_rate": 8.509401483050848e-06, "loss": 1.2891, "mean_token_accuracy": 0.6917987018823624, "num_tokens": 9057938.0, "step": 11258 }, { "epoch": 2.981991525423729, "grad_norm": 1.7608685493469238, "learning_rate": 8.509136652542373e-06, "loss": 1.2262, "mean_token_accuracy": 0.7025992721319199, "num_tokens": 9059377.0, "step": 11260 }, { "epoch": 2.982521186440678, "grad_norm": 1.5102685689926147, "learning_rate": 8.5088718220339e-06, "loss": 1.457, "mean_token_accuracy": 0.6690939143300056, "num_tokens": 9061451.0, "step": 11262 }, { "epoch": 2.983050847457627, "grad_norm": 1.6843308210372925, "learning_rate": 8.508606991525424e-06, "loss": 1.3218, "mean_token_accuracy": 0.7155274152755737, "num_tokens": 9063093.0, "step": 11264 }, { "epoch": 2.983580508474576, "grad_norm": 1.9495824575424194, "learning_rate": 8.508342161016951e-06, "loss": 1.0463, "mean_token_accuracy": 0.74002356082201, "num_tokens": 9064455.0, "step": 11266 }, { "epoch": 2.9841101694915255, "grad_norm": 1.7822253704071045, "learning_rate": 8.508077330508474e-06, "loss": 1.3654, "mean_token_accuracy": 0.686830498278141, "num_tokens": 9065881.0, "step": 11268 }, { "epoch": 2.9846398305084745, "grad_norm": 1.6618874073028564, "learning_rate": 8.507812500000001e-06, "loss": 1.0218, "mean_token_accuracy": 0.7547531649470329, "num_tokens": 9067376.0, "step": 11270 }, { "epoch": 2.985169491525424, "grad_norm": 1.6236686706542969, "learning_rate": 8.507547669491526e-06, "loss": 1.3588, "mean_token_accuracy": 0.6966373845934868, "num_tokens": 9068925.0, "step": 11272 }, { "epoch": 2.985699152542373, "grad_norm": 1.7236438989639282, "learning_rate": 8.507282838983053e-06, "loss": 1.3663, "mean_token_accuracy": 0.6848077178001404, "num_tokens": 9070665.0, "step": 11274 }, { "epoch": 2.986228813559322, "grad_norm": 1.925901174545288, "learning_rate": 8.507018008474577e-06, "loss": 1.3646, "mean_token_accuracy": 0.713696263730526, "num_tokens": 9072335.0, "step": 11276 }, { "epoch": 2.986758474576271, "grad_norm": 1.4688283205032349, "learning_rate": 8.506753177966102e-06, "loss": 1.0988, "mean_token_accuracy": 0.7319312021136284, "num_tokens": 9073897.0, "step": 11278 }, { "epoch": 2.9872881355932206, "grad_norm": 2.247804880142212, "learning_rate": 8.506488347457627e-06, "loss": 1.6207, "mean_token_accuracy": 0.6413134858012199, "num_tokens": 9075512.0, "step": 11280 }, { "epoch": 2.9878177966101696, "grad_norm": 0.8986403942108154, "learning_rate": 8.506223516949154e-06, "loss": 1.1177, "mean_token_accuracy": 0.7314603477716446, "num_tokens": 9078026.0, "step": 11282 }, { "epoch": 2.9883474576271185, "grad_norm": 2.0902185440063477, "learning_rate": 8.505958686440679e-06, "loss": 1.298, "mean_token_accuracy": 0.7137986719608307, "num_tokens": 9079327.0, "step": 11284 }, { "epoch": 2.988877118644068, "grad_norm": 2.2833361625671387, "learning_rate": 8.505693855932204e-06, "loss": 1.1462, "mean_token_accuracy": 0.728437140583992, "num_tokens": 9080738.0, "step": 11286 }, { "epoch": 2.989406779661017, "grad_norm": 1.6722429990768433, "learning_rate": 8.505429025423729e-06, "loss": 1.0941, "mean_token_accuracy": 0.711202897131443, "num_tokens": 9082240.0, "step": 11288 }, { "epoch": 2.9899364406779663, "grad_norm": 1.9595065116882324, "learning_rate": 8.505164194915255e-06, "loss": 1.3571, "mean_token_accuracy": 0.6837651878595352, "num_tokens": 9083942.0, "step": 11290 }, { "epoch": 2.9904661016949152, "grad_norm": 1.6607434749603271, "learning_rate": 8.50489936440678e-06, "loss": 1.495, "mean_token_accuracy": 0.6646536141633987, "num_tokens": 9085312.0, "step": 11292 }, { "epoch": 2.990995762711864, "grad_norm": 1.801951289176941, "learning_rate": 8.504634533898307e-06, "loss": 1.2873, "mean_token_accuracy": 0.7117081135511398, "num_tokens": 9086879.0, "step": 11294 }, { "epoch": 2.9915254237288136, "grad_norm": 1.5747100114822388, "learning_rate": 8.50436970338983e-06, "loss": 1.513, "mean_token_accuracy": 0.6738689877092838, "num_tokens": 9088345.0, "step": 11296 }, { "epoch": 2.992055084745763, "grad_norm": 1.8052257299423218, "learning_rate": 8.504104872881357e-06, "loss": 1.1155, "mean_token_accuracy": 0.716717079281807, "num_tokens": 9090043.0, "step": 11298 }, { "epoch": 2.992584745762712, "grad_norm": 2.360172986984253, "learning_rate": 8.503840042372882e-06, "loss": 1.4312, "mean_token_accuracy": 0.6732030212879181, "num_tokens": 9091948.0, "step": 11300 }, { "epoch": 2.993114406779661, "grad_norm": 1.2565053701400757, "learning_rate": 8.503575211864408e-06, "loss": 1.2479, "mean_token_accuracy": 0.6882373169064522, "num_tokens": 9093810.0, "step": 11302 }, { "epoch": 2.9936440677966103, "grad_norm": 1.6545878648757935, "learning_rate": 8.503310381355933e-06, "loss": 1.1705, "mean_token_accuracy": 0.70808394998312, "num_tokens": 9095505.0, "step": 11304 }, { "epoch": 2.9941737288135593, "grad_norm": 1.382370948791504, "learning_rate": 8.503045550847458e-06, "loss": 0.8612, "mean_token_accuracy": 0.7904419749975204, "num_tokens": 9097316.0, "step": 11306 }, { "epoch": 2.9947033898305087, "grad_norm": 1.596487045288086, "learning_rate": 8.502780720338983e-06, "loss": 1.2001, "mean_token_accuracy": 0.7182565331459045, "num_tokens": 9098953.0, "step": 11308 }, { "epoch": 2.9952330508474576, "grad_norm": 1.5579509735107422, "learning_rate": 8.50251588983051e-06, "loss": 0.7907, "mean_token_accuracy": 0.7920894548296928, "num_tokens": 9100620.0, "step": 11310 }, { "epoch": 2.9957627118644066, "grad_norm": 2.0052828788757324, "learning_rate": 8.502251059322035e-06, "loss": 1.6838, "mean_token_accuracy": 0.622249498963356, "num_tokens": 9102317.0, "step": 11312 }, { "epoch": 2.996292372881356, "grad_norm": 1.6875478029251099, "learning_rate": 8.50198622881356e-06, "loss": 1.3497, "mean_token_accuracy": 0.663997620344162, "num_tokens": 9103747.0, "step": 11314 }, { "epoch": 2.996822033898305, "grad_norm": 1.1130105257034302, "learning_rate": 8.501721398305085e-06, "loss": 0.961, "mean_token_accuracy": 0.7599804624915123, "num_tokens": 9105267.0, "step": 11316 }, { "epoch": 2.9973516949152543, "grad_norm": 1.5812489986419678, "learning_rate": 8.501456567796611e-06, "loss": 0.8325, "mean_token_accuracy": 0.7904578223824501, "num_tokens": 9106738.0, "step": 11318 }, { "epoch": 2.9978813559322033, "grad_norm": 1.5267447233200073, "learning_rate": 8.501191737288136e-06, "loss": 1.5117, "mean_token_accuracy": 0.6782233268022537, "num_tokens": 9108499.0, "step": 11320 }, { "epoch": 2.9984110169491527, "grad_norm": 1.6965962648391724, "learning_rate": 8.500926906779661e-06, "loss": 0.947, "mean_token_accuracy": 0.7589130476117134, "num_tokens": 9110036.0, "step": 11322 }, { "epoch": 2.9989406779661016, "grad_norm": 1.7927906513214111, "learning_rate": 8.500662076271186e-06, "loss": 1.5974, "mean_token_accuracy": 0.6375877782702446, "num_tokens": 9111579.0, "step": 11324 }, { "epoch": 2.999470338983051, "grad_norm": 1.7628389596939087, "learning_rate": 8.500397245762713e-06, "loss": 1.2347, "mean_token_accuracy": 0.7264343574643135, "num_tokens": 9113126.0, "step": 11326 }, { "epoch": 3.0, "grad_norm": 1.4531856775283813, "learning_rate": 8.500132415254237e-06, "loss": 1.0896, "mean_token_accuracy": 0.7310714274644852, "num_tokens": 9114804.0, "step": 11328 }, { "epoch": 3.000529661016949, "grad_norm": 1.4711238145828247, "learning_rate": 8.499867584745764e-06, "loss": 1.3378, "mean_token_accuracy": 0.7150200456380844, "num_tokens": 9116484.0, "step": 11330 }, { "epoch": 3.0010593220338984, "grad_norm": 1.511664867401123, "learning_rate": 8.499602754237289e-06, "loss": 1.1587, "mean_token_accuracy": 0.7484483197331429, "num_tokens": 9118164.0, "step": 11332 }, { "epoch": 3.0015889830508473, "grad_norm": 1.6764092445373535, "learning_rate": 8.499337923728814e-06, "loss": 1.4264, "mean_token_accuracy": 0.654480092227459, "num_tokens": 9119906.0, "step": 11334 }, { "epoch": 3.0021186440677967, "grad_norm": 1.244033932685852, "learning_rate": 8.499073093220339e-06, "loss": 1.0869, "mean_token_accuracy": 0.7511123791337013, "num_tokens": 9121630.0, "step": 11336 }, { "epoch": 3.0026483050847457, "grad_norm": 1.8800053596496582, "learning_rate": 8.498808262711866e-06, "loss": 1.6387, "mean_token_accuracy": 0.6349415704607964, "num_tokens": 9123293.0, "step": 11338 }, { "epoch": 3.003177966101695, "grad_norm": 1.8109829425811768, "learning_rate": 8.49854343220339e-06, "loss": 1.5354, "mean_token_accuracy": 0.6461709067225456, "num_tokens": 9124825.0, "step": 11340 }, { "epoch": 3.003707627118644, "grad_norm": 1.7218997478485107, "learning_rate": 8.498278601694915e-06, "loss": 1.1357, "mean_token_accuracy": 0.719704233109951, "num_tokens": 9126220.0, "step": 11342 }, { "epoch": 3.0042372881355934, "grad_norm": 2.4272258281707764, "learning_rate": 8.498013771186442e-06, "loss": 1.9266, "mean_token_accuracy": 0.5959770679473877, "num_tokens": 9127598.0, "step": 11344 }, { "epoch": 3.0047669491525424, "grad_norm": 1.964996099472046, "learning_rate": 8.497748940677967e-06, "loss": 1.2998, "mean_token_accuracy": 0.7071061879396439, "num_tokens": 9129120.0, "step": 11346 }, { "epoch": 3.0052966101694913, "grad_norm": 1.6891602277755737, "learning_rate": 8.497484110169494e-06, "loss": 1.2407, "mean_token_accuracy": 0.6913774311542511, "num_tokens": 9130705.0, "step": 11348 }, { "epoch": 3.0058262711864407, "grad_norm": 1.1398084163665771, "learning_rate": 8.497219279661017e-06, "loss": 1.2696, "mean_token_accuracy": 0.7075635530054569, "num_tokens": 9132391.0, "step": 11350 }, { "epoch": 3.0063559322033897, "grad_norm": 1.6058403253555298, "learning_rate": 8.496954449152543e-06, "loss": 1.3792, "mean_token_accuracy": 0.689236968755722, "num_tokens": 9133787.0, "step": 11352 }, { "epoch": 3.006885593220339, "grad_norm": 1.6051357984542847, "learning_rate": 8.496689618644068e-06, "loss": 1.3694, "mean_token_accuracy": 0.7083153501152992, "num_tokens": 9135517.0, "step": 11354 }, { "epoch": 3.007415254237288, "grad_norm": 1.7414692640304565, "learning_rate": 8.496424788135595e-06, "loss": 1.2502, "mean_token_accuracy": 0.7374618574976921, "num_tokens": 9137150.0, "step": 11356 }, { "epoch": 3.0079449152542375, "grad_norm": 1.6491496562957764, "learning_rate": 8.49615995762712e-06, "loss": 0.8958, "mean_token_accuracy": 0.7824690267443657, "num_tokens": 9138670.0, "step": 11358 }, { "epoch": 3.0084745762711864, "grad_norm": 1.6110708713531494, "learning_rate": 8.495895127118645e-06, "loss": 1.4354, "mean_token_accuracy": 0.6604946032166481, "num_tokens": 9140418.0, "step": 11360 }, { "epoch": 3.0090042372881354, "grad_norm": 1.8322601318359375, "learning_rate": 8.49563029661017e-06, "loss": 1.0849, "mean_token_accuracy": 0.7414926066994667, "num_tokens": 9141761.0, "step": 11362 }, { "epoch": 3.0095338983050848, "grad_norm": 1.3757352828979492, "learning_rate": 8.495365466101696e-06, "loss": 0.8257, "mean_token_accuracy": 0.7713750973343849, "num_tokens": 9143317.0, "step": 11364 }, { "epoch": 3.0100635593220337, "grad_norm": 1.5836284160614014, "learning_rate": 8.495100635593221e-06, "loss": 1.245, "mean_token_accuracy": 0.7294767275452614, "num_tokens": 9144914.0, "step": 11366 }, { "epoch": 3.010593220338983, "grad_norm": 1.5845890045166016, "learning_rate": 8.494835805084746e-06, "loss": 1.1781, "mean_token_accuracy": 0.7074326947331429, "num_tokens": 9146513.0, "step": 11368 }, { "epoch": 3.011122881355932, "grad_norm": 1.5936599969863892, "learning_rate": 8.494570974576271e-06, "loss": 1.1803, "mean_token_accuracy": 0.7197463139891624, "num_tokens": 9148112.0, "step": 11370 }, { "epoch": 3.0116525423728815, "grad_norm": 1.4232734441757202, "learning_rate": 8.494306144067798e-06, "loss": 0.8176, "mean_token_accuracy": 0.7828144729137421, "num_tokens": 9149895.0, "step": 11372 }, { "epoch": 3.0121822033898304, "grad_norm": 1.6061831712722778, "learning_rate": 8.494041313559323e-06, "loss": 1.4991, "mean_token_accuracy": 0.6879515796899796, "num_tokens": 9151588.0, "step": 11374 }, { "epoch": 3.01271186440678, "grad_norm": 1.702134132385254, "learning_rate": 8.493776483050848e-06, "loss": 1.6262, "mean_token_accuracy": 0.619891133159399, "num_tokens": 9153103.0, "step": 11376 }, { "epoch": 3.013241525423729, "grad_norm": 1.6048238277435303, "learning_rate": 8.493511652542373e-06, "loss": 1.2177, "mean_token_accuracy": 0.7317167483270168, "num_tokens": 9154461.0, "step": 11378 }, { "epoch": 3.0137711864406778, "grad_norm": 1.7072113752365112, "learning_rate": 8.4932468220339e-06, "loss": 1.3531, "mean_token_accuracy": 0.7166106402873993, "num_tokens": 9156162.0, "step": 11380 }, { "epoch": 3.014300847457627, "grad_norm": 2.4278910160064697, "learning_rate": 8.492981991525424e-06, "loss": 1.1479, "mean_token_accuracy": 0.7178939133882523, "num_tokens": 9157474.0, "step": 11382 }, { "epoch": 3.014830508474576, "grad_norm": 1.671883463859558, "learning_rate": 8.49271716101695e-06, "loss": 1.235, "mean_token_accuracy": 0.7208574637770653, "num_tokens": 9158778.0, "step": 11384 }, { "epoch": 3.0153601694915255, "grad_norm": 1.2997798919677734, "learning_rate": 8.492452330508476e-06, "loss": 0.8918, "mean_token_accuracy": 0.7913181260228157, "num_tokens": 9160329.0, "step": 11386 }, { "epoch": 3.0158898305084745, "grad_norm": 1.7671438455581665, "learning_rate": 8.4921875e-06, "loss": 1.4715, "mean_token_accuracy": 0.6688920930027962, "num_tokens": 9161903.0, "step": 11388 }, { "epoch": 3.016419491525424, "grad_norm": 1.867132544517517, "learning_rate": 8.491922669491526e-06, "loss": 1.0075, "mean_token_accuracy": 0.7626645490527153, "num_tokens": 9163334.0, "step": 11390 }, { "epoch": 3.016949152542373, "grad_norm": 1.7796217203140259, "learning_rate": 8.491657838983052e-06, "loss": 1.3918, "mean_token_accuracy": 0.6988836154341698, "num_tokens": 9164828.0, "step": 11392 }, { "epoch": 3.0174788135593222, "grad_norm": 1.852003574371338, "learning_rate": 8.491393008474577e-06, "loss": 1.5559, "mean_token_accuracy": 0.6853305175900459, "num_tokens": 9166368.0, "step": 11394 }, { "epoch": 3.018008474576271, "grad_norm": 1.3732564449310303, "learning_rate": 8.491128177966102e-06, "loss": 0.9279, "mean_token_accuracy": 0.7449296340346336, "num_tokens": 9168098.0, "step": 11396 }, { "epoch": 3.01853813559322, "grad_norm": 1.7240426540374756, "learning_rate": 8.490863347457627e-06, "loss": 1.3867, "mean_token_accuracy": 0.6943604201078415, "num_tokens": 9169968.0, "step": 11398 }, { "epoch": 3.0190677966101696, "grad_norm": 1.7162832021713257, "learning_rate": 8.490598516949154e-06, "loss": 1.3439, "mean_token_accuracy": 0.6870796605944633, "num_tokens": 9171540.0, "step": 11400 }, { "epoch": 3.0195974576271185, "grad_norm": 1.7779271602630615, "learning_rate": 8.490333686440678e-06, "loss": 1.2734, "mean_token_accuracy": 0.7488262057304382, "num_tokens": 9173026.0, "step": 11402 }, { "epoch": 3.020127118644068, "grad_norm": 1.8393210172653198, "learning_rate": 8.490068855932203e-06, "loss": 1.4227, "mean_token_accuracy": 0.6887306869029999, "num_tokens": 9174470.0, "step": 11404 }, { "epoch": 3.020656779661017, "grad_norm": 1.5247554779052734, "learning_rate": 8.489804025423728e-06, "loss": 1.3551, "mean_token_accuracy": 0.6973572596907616, "num_tokens": 9176131.0, "step": 11406 }, { "epoch": 3.0211864406779663, "grad_norm": 2.0069313049316406, "learning_rate": 8.489539194915255e-06, "loss": 1.3601, "mean_token_accuracy": 0.6959833800792694, "num_tokens": 9177642.0, "step": 11408 }, { "epoch": 3.0217161016949152, "grad_norm": 1.559714436531067, "learning_rate": 8.48927436440678e-06, "loss": 0.8653, "mean_token_accuracy": 0.7919088304042816, "num_tokens": 9179210.0, "step": 11410 }, { "epoch": 3.0222457627118646, "grad_norm": 2.0592093467712402, "learning_rate": 8.489009533898307e-06, "loss": 1.3017, "mean_token_accuracy": 0.7098939195275307, "num_tokens": 9180456.0, "step": 11412 }, { "epoch": 3.0227754237288136, "grad_norm": 1.5172362327575684, "learning_rate": 8.488744703389831e-06, "loss": 1.202, "mean_token_accuracy": 0.7133582532405853, "num_tokens": 9182332.0, "step": 11414 }, { "epoch": 3.0233050847457625, "grad_norm": 1.7932981252670288, "learning_rate": 8.488479872881356e-06, "loss": 1.2816, "mean_token_accuracy": 0.6872383058071136, "num_tokens": 9183684.0, "step": 11416 }, { "epoch": 3.023834745762712, "grad_norm": 1.70606529712677, "learning_rate": 8.488215042372881e-06, "loss": 1.4372, "mean_token_accuracy": 0.6653970740735531, "num_tokens": 9185232.0, "step": 11418 }, { "epoch": 3.024364406779661, "grad_norm": 1.7041637897491455, "learning_rate": 8.487950211864408e-06, "loss": 1.2578, "mean_token_accuracy": 0.7129021361470222, "num_tokens": 9186935.0, "step": 11420 }, { "epoch": 3.0248940677966103, "grad_norm": 1.5227177143096924, "learning_rate": 8.487685381355933e-06, "loss": 1.1827, "mean_token_accuracy": 0.7240949794650078, "num_tokens": 9188881.0, "step": 11422 }, { "epoch": 3.0254237288135593, "grad_norm": 1.8505367040634155, "learning_rate": 8.487420550847458e-06, "loss": 1.5572, "mean_token_accuracy": 0.641967173665762, "num_tokens": 9190673.0, "step": 11424 }, { "epoch": 3.0259533898305087, "grad_norm": 1.804431676864624, "learning_rate": 8.487155720338984e-06, "loss": 1.5496, "mean_token_accuracy": 0.6658710911870003, "num_tokens": 9192266.0, "step": 11426 }, { "epoch": 3.0264830508474576, "grad_norm": 1.4957419633865356, "learning_rate": 8.48689088983051e-06, "loss": 1.0842, "mean_token_accuracy": 0.7627894356846809, "num_tokens": 9193885.0, "step": 11428 }, { "epoch": 3.0270127118644066, "grad_norm": 1.6342679262161255, "learning_rate": 8.486626059322034e-06, "loss": 1.039, "mean_token_accuracy": 0.7466977760195732, "num_tokens": 9195471.0, "step": 11430 }, { "epoch": 3.027542372881356, "grad_norm": 1.9099112749099731, "learning_rate": 8.48636122881356e-06, "loss": 1.2337, "mean_token_accuracy": 0.7297928184270859, "num_tokens": 9196969.0, "step": 11432 }, { "epoch": 3.028072033898305, "grad_norm": 2.1350560188293457, "learning_rate": 8.486096398305086e-06, "loss": 1.2685, "mean_token_accuracy": 0.710248164832592, "num_tokens": 9198494.0, "step": 11434 }, { "epoch": 3.0286016949152543, "grad_norm": 1.8729147911071777, "learning_rate": 8.48583156779661e-06, "loss": 1.163, "mean_token_accuracy": 0.7188422530889511, "num_tokens": 9199953.0, "step": 11436 }, { "epoch": 3.0291313559322033, "grad_norm": 1.6487449407577515, "learning_rate": 8.485566737288137e-06, "loss": 1.3392, "mean_token_accuracy": 0.6843923777341843, "num_tokens": 9201657.0, "step": 11438 }, { "epoch": 3.0296610169491527, "grad_norm": 1.8783056735992432, "learning_rate": 8.485301906779662e-06, "loss": 1.3964, "mean_token_accuracy": 0.6942959129810333, "num_tokens": 9203149.0, "step": 11440 }, { "epoch": 3.0301906779661016, "grad_norm": 2.006896734237671, "learning_rate": 8.485037076271187e-06, "loss": 1.5922, "mean_token_accuracy": 0.6571350730955601, "num_tokens": 9204588.0, "step": 11442 }, { "epoch": 3.030720338983051, "grad_norm": 2.0838205814361572, "learning_rate": 8.484772245762712e-06, "loss": 1.0716, "mean_token_accuracy": 0.7655046284198761, "num_tokens": 9205891.0, "step": 11444 }, { "epoch": 3.03125, "grad_norm": 2.1322991847991943, "learning_rate": 8.484507415254239e-06, "loss": 1.7603, "mean_token_accuracy": 0.6116836816072464, "num_tokens": 9207517.0, "step": 11446 }, { "epoch": 3.031779661016949, "grad_norm": 1.4607411623001099, "learning_rate": 8.484242584745764e-06, "loss": 1.3766, "mean_token_accuracy": 0.6900991052389145, "num_tokens": 9209049.0, "step": 11448 }, { "epoch": 3.0323093220338984, "grad_norm": 1.7918802499771118, "learning_rate": 8.483977754237289e-06, "loss": 1.3548, "mean_token_accuracy": 0.6687556281685829, "num_tokens": 9210806.0, "step": 11450 }, { "epoch": 3.0328389830508473, "grad_norm": 1.7825323343276978, "learning_rate": 8.483712923728814e-06, "loss": 1.6648, "mean_token_accuracy": 0.6571430414915085, "num_tokens": 9212435.0, "step": 11452 }, { "epoch": 3.0333686440677967, "grad_norm": 1.5703370571136475, "learning_rate": 8.48344809322034e-06, "loss": 1.4328, "mean_token_accuracy": 0.6652732081711292, "num_tokens": 9214384.0, "step": 11454 }, { "epoch": 3.0338983050847457, "grad_norm": 2.4325499534606934, "learning_rate": 8.483183262711865e-06, "loss": 1.7468, "mean_token_accuracy": 0.636189341545105, "num_tokens": 9216028.0, "step": 11456 }, { "epoch": 3.034427966101695, "grad_norm": 1.6202092170715332, "learning_rate": 8.48291843220339e-06, "loss": 1.4055, "mean_token_accuracy": 0.6825168058276176, "num_tokens": 9217415.0, "step": 11458 }, { "epoch": 3.034957627118644, "grad_norm": 1.7967045307159424, "learning_rate": 8.482653601694915e-06, "loss": 1.1314, "mean_token_accuracy": 0.7373289987444878, "num_tokens": 9219068.0, "step": 11460 }, { "epoch": 3.0354872881355934, "grad_norm": 1.5601414442062378, "learning_rate": 8.482388771186442e-06, "loss": 1.1343, "mean_token_accuracy": 0.7244481295347214, "num_tokens": 9220787.0, "step": 11462 }, { "epoch": 3.0360169491525424, "grad_norm": 1.9267964363098145, "learning_rate": 8.482123940677967e-06, "loss": 0.9752, "mean_token_accuracy": 0.7816235572099686, "num_tokens": 9222278.0, "step": 11464 }, { "epoch": 3.0365466101694913, "grad_norm": 1.738480806350708, "learning_rate": 8.481859110169493e-06, "loss": 1.5811, "mean_token_accuracy": 0.6534691751003265, "num_tokens": 9223857.0, "step": 11466 }, { "epoch": 3.0370762711864407, "grad_norm": 2.28829026222229, "learning_rate": 8.481594279661018e-06, "loss": 1.0164, "mean_token_accuracy": 0.75377107411623, "num_tokens": 9225198.0, "step": 11468 }, { "epoch": 3.0376059322033897, "grad_norm": 2.058084726333618, "learning_rate": 8.481329449152543e-06, "loss": 1.5472, "mean_token_accuracy": 0.6548259258270264, "num_tokens": 9226631.0, "step": 11470 }, { "epoch": 3.038135593220339, "grad_norm": 1.8962680101394653, "learning_rate": 8.481064618644068e-06, "loss": 1.5851, "mean_token_accuracy": 0.645319789648056, "num_tokens": 9228410.0, "step": 11472 }, { "epoch": 3.038665254237288, "grad_norm": 1.6091976165771484, "learning_rate": 8.480799788135595e-06, "loss": 1.274, "mean_token_accuracy": 0.7057022638618946, "num_tokens": 9230074.0, "step": 11474 }, { "epoch": 3.0391949152542375, "grad_norm": 1.497454047203064, "learning_rate": 8.48053495762712e-06, "loss": 1.4164, "mean_token_accuracy": 0.676651768386364, "num_tokens": 9231837.0, "step": 11476 }, { "epoch": 3.0397245762711864, "grad_norm": 1.73826003074646, "learning_rate": 8.480270127118644e-06, "loss": 1.2844, "mean_token_accuracy": 0.6965580061078072, "num_tokens": 9233628.0, "step": 11478 }, { "epoch": 3.0402542372881354, "grad_norm": 1.960597038269043, "learning_rate": 8.48000529661017e-06, "loss": 1.6844, "mean_token_accuracy": 0.6133296564221382, "num_tokens": 9235427.0, "step": 11480 }, { "epoch": 3.0407838983050848, "grad_norm": 1.716464638710022, "learning_rate": 8.479740466101696e-06, "loss": 1.457, "mean_token_accuracy": 0.6757576242089272, "num_tokens": 9237147.0, "step": 11482 }, { "epoch": 3.0413135593220337, "grad_norm": 1.4479196071624756, "learning_rate": 8.479475635593221e-06, "loss": 0.7933, "mean_token_accuracy": 0.7927194461226463, "num_tokens": 9238899.0, "step": 11484 }, { "epoch": 3.041843220338983, "grad_norm": 1.7579319477081299, "learning_rate": 8.479210805084746e-06, "loss": 1.2282, "mean_token_accuracy": 0.7197293043136597, "num_tokens": 9240354.0, "step": 11486 }, { "epoch": 3.042372881355932, "grad_norm": 1.8492707014083862, "learning_rate": 8.47894597457627e-06, "loss": 1.0844, "mean_token_accuracy": 0.7338114604353905, "num_tokens": 9241838.0, "step": 11488 }, { "epoch": 3.0429025423728815, "grad_norm": 1.598514437675476, "learning_rate": 8.478681144067797e-06, "loss": 1.4944, "mean_token_accuracy": 0.6694672927260399, "num_tokens": 9243498.0, "step": 11490 }, { "epoch": 3.0434322033898304, "grad_norm": 1.6689823865890503, "learning_rate": 8.478416313559322e-06, "loss": 1.3563, "mean_token_accuracy": 0.6953393071889877, "num_tokens": 9245375.0, "step": 11492 }, { "epoch": 3.04396186440678, "grad_norm": 1.4522442817687988, "learning_rate": 8.478151483050849e-06, "loss": 1.0757, "mean_token_accuracy": 0.7307165339589119, "num_tokens": 9247322.0, "step": 11494 }, { "epoch": 3.044491525423729, "grad_norm": 1.7427115440368652, "learning_rate": 8.477886652542374e-06, "loss": 1.1884, "mean_token_accuracy": 0.7477292940020561, "num_tokens": 9248961.0, "step": 11496 }, { "epoch": 3.0450211864406778, "grad_norm": 1.7794824838638306, "learning_rate": 8.477621822033899e-06, "loss": 0.964, "mean_token_accuracy": 0.7355424538254738, "num_tokens": 9250503.0, "step": 11498 }, { "epoch": 3.045550847457627, "grad_norm": 1.6818774938583374, "learning_rate": 8.477356991525424e-06, "loss": 0.884, "step": 11500 }, { "epoch": 3.045550847457627, "eval_loss": 1.3122713565826416, "eval_mean_token_accuracy": 0.7008760973037064, "eval_num_tokens": 9252123.0, "eval_runtime": 48.576, "eval_samples_per_second": 6.341, "eval_steps_per_second": 6.341, "step": 11500 }, { "epoch": 3.046080508474576, "grad_norm": 1.7714365720748901, "learning_rate": 8.47709216101695e-06, "loss": 1.2476, "mean_token_accuracy": 0.7435172572731972, "num_tokens": 9253928.0, "step": 11502 }, { "epoch": 3.0466101694915255, "grad_norm": 1.752470850944519, "learning_rate": 8.476827330508475e-06, "loss": 1.3219, "mean_token_accuracy": 0.6851206049323082, "num_tokens": 9255396.0, "step": 11504 }, { "epoch": 3.0471398305084745, "grad_norm": 1.8043795824050903, "learning_rate": 8.4765625e-06, "loss": 1.2843, "mean_token_accuracy": 0.7230702936649323, "num_tokens": 9257001.0, "step": 11506 }, { "epoch": 3.047669491525424, "grad_norm": 1.1388211250305176, "learning_rate": 8.476297669491527e-06, "loss": 1.2607, "mean_token_accuracy": 0.6973396018147469, "num_tokens": 9258807.0, "step": 11508 }, { "epoch": 3.048199152542373, "grad_norm": 1.7484012842178345, "learning_rate": 8.476032838983052e-06, "loss": 0.992, "mean_token_accuracy": 0.7660632133483887, "num_tokens": 9260229.0, "step": 11510 }, { "epoch": 3.0487288135593222, "grad_norm": 1.6767230033874512, "learning_rate": 8.475768008474577e-06, "loss": 1.4338, "mean_token_accuracy": 0.6717596873641014, "num_tokens": 9261993.0, "step": 11512 }, { "epoch": 3.049258474576271, "grad_norm": 1.8145896196365356, "learning_rate": 8.475503177966102e-06, "loss": 1.2689, "mean_token_accuracy": 0.7079008743166924, "num_tokens": 9263382.0, "step": 11514 }, { "epoch": 3.04978813559322, "grad_norm": 2.2347121238708496, "learning_rate": 8.475238347457628e-06, "loss": 1.5325, "mean_token_accuracy": 0.6595740988850594, "num_tokens": 9264960.0, "step": 11516 }, { "epoch": 3.0503177966101696, "grad_norm": 1.7415056228637695, "learning_rate": 8.474973516949153e-06, "loss": 1.2703, "mean_token_accuracy": 0.7076404914259911, "num_tokens": 9266496.0, "step": 11518 }, { "epoch": 3.0508474576271185, "grad_norm": 1.553858995437622, "learning_rate": 8.47470868644068e-06, "loss": 1.2964, "mean_token_accuracy": 0.6805407330393791, "num_tokens": 9268110.0, "step": 11520 }, { "epoch": 3.051377118644068, "grad_norm": 1.8487576246261597, "learning_rate": 8.474443855932205e-06, "loss": 1.5817, "mean_token_accuracy": 0.6652432754635811, "num_tokens": 9269755.0, "step": 11522 }, { "epoch": 3.051906779661017, "grad_norm": 1.6639751195907593, "learning_rate": 8.47417902542373e-06, "loss": 1.3943, "mean_token_accuracy": 0.677523173391819, "num_tokens": 9271385.0, "step": 11524 }, { "epoch": 3.0524364406779663, "grad_norm": 1.6564419269561768, "learning_rate": 8.473914194915255e-06, "loss": 1.0887, "mean_token_accuracy": 0.7481508031487465, "num_tokens": 9273195.0, "step": 11526 }, { "epoch": 3.0529661016949152, "grad_norm": 1.6821202039718628, "learning_rate": 8.473649364406781e-06, "loss": 1.2745, "mean_token_accuracy": 0.7005890756845474, "num_tokens": 9275125.0, "step": 11528 }, { "epoch": 3.0534957627118646, "grad_norm": 1.6697617769241333, "learning_rate": 8.473384533898306e-06, "loss": 1.531, "mean_token_accuracy": 0.6720606908202171, "num_tokens": 9276935.0, "step": 11530 }, { "epoch": 3.0540254237288136, "grad_norm": 1.9282073974609375, "learning_rate": 8.473119703389831e-06, "loss": 1.3576, "mean_token_accuracy": 0.6947024911642075, "num_tokens": 9278515.0, "step": 11532 }, { "epoch": 3.0545550847457625, "grad_norm": 1.2782344818115234, "learning_rate": 8.472854872881356e-06, "loss": 1.2188, "mean_token_accuracy": 0.7309399172663689, "num_tokens": 9280872.0, "step": 11534 }, { "epoch": 3.055084745762712, "grad_norm": 1.9740674495697021, "learning_rate": 8.472590042372883e-06, "loss": 1.1728, "mean_token_accuracy": 0.7253432869911194, "num_tokens": 9282448.0, "step": 11536 }, { "epoch": 3.055614406779661, "grad_norm": 1.5048563480377197, "learning_rate": 8.472325211864408e-06, "loss": 1.4968, "mean_token_accuracy": 0.6460772529244423, "num_tokens": 9284207.0, "step": 11538 }, { "epoch": 3.0561440677966103, "grad_norm": 1.6082888841629028, "learning_rate": 8.472060381355932e-06, "loss": 1.6904, "mean_token_accuracy": 0.6372493654489517, "num_tokens": 9285884.0, "step": 11540 }, { "epoch": 3.0566737288135593, "grad_norm": 1.8243895769119263, "learning_rate": 8.471795550847457e-06, "loss": 1.2427, "mean_token_accuracy": 0.6854500249028206, "num_tokens": 9288018.0, "step": 11542 }, { "epoch": 3.0572033898305087, "grad_norm": 2.0395236015319824, "learning_rate": 8.471530720338984e-06, "loss": 1.4685, "mean_token_accuracy": 0.6768578588962555, "num_tokens": 9289614.0, "step": 11544 }, { "epoch": 3.0577330508474576, "grad_norm": 1.6853554248809814, "learning_rate": 8.471265889830509e-06, "loss": 1.3156, "mean_token_accuracy": 0.6979684829711914, "num_tokens": 9291092.0, "step": 11546 }, { "epoch": 3.0582627118644066, "grad_norm": 1.6510242223739624, "learning_rate": 8.471001059322036e-06, "loss": 1.4971, "mean_token_accuracy": 0.6536035090684891, "num_tokens": 9292911.0, "step": 11548 }, { "epoch": 3.058792372881356, "grad_norm": 1.68772292137146, "learning_rate": 8.47073622881356e-06, "loss": 0.9275, "mean_token_accuracy": 0.7917407229542732, "num_tokens": 9294421.0, "step": 11550 }, { "epoch": 3.059322033898305, "grad_norm": 1.7142598628997803, "learning_rate": 8.470471398305085e-06, "loss": 1.3628, "mean_token_accuracy": 0.7001496106386185, "num_tokens": 9296049.0, "step": 11552 }, { "epoch": 3.0598516949152543, "grad_norm": 1.7775099277496338, "learning_rate": 8.47020656779661e-06, "loss": 0.9779, "mean_token_accuracy": 0.7673260793089867, "num_tokens": 9297763.0, "step": 11554 }, { "epoch": 3.0603813559322033, "grad_norm": 1.7362114191055298, "learning_rate": 8.469941737288137e-06, "loss": 1.353, "mean_token_accuracy": 0.708812914788723, "num_tokens": 9299114.0, "step": 11556 }, { "epoch": 3.0609110169491527, "grad_norm": 1.955718755722046, "learning_rate": 8.469676906779662e-06, "loss": 1.4781, "mean_token_accuracy": 0.6888471320271492, "num_tokens": 9300935.0, "step": 11558 }, { "epoch": 3.0614406779661016, "grad_norm": 1.489408016204834, "learning_rate": 8.469412076271187e-06, "loss": 1.0988, "mean_token_accuracy": 0.7447991967201233, "num_tokens": 9302667.0, "step": 11560 }, { "epoch": 3.061970338983051, "grad_norm": 1.911625862121582, "learning_rate": 8.469147245762712e-06, "loss": 1.3061, "mean_token_accuracy": 0.6959310360252857, "num_tokens": 9304184.0, "step": 11562 }, { "epoch": 3.0625, "grad_norm": 1.5283942222595215, "learning_rate": 8.468882415254238e-06, "loss": 1.3869, "mean_token_accuracy": 0.6773444041609764, "num_tokens": 9306196.0, "step": 11564 }, { "epoch": 3.063029661016949, "grad_norm": 1.6219855546951294, "learning_rate": 8.468617584745763e-06, "loss": 1.2795, "mean_token_accuracy": 0.7096344009041786, "num_tokens": 9307819.0, "step": 11566 }, { "epoch": 3.0635593220338984, "grad_norm": 1.6856399774551392, "learning_rate": 8.468352754237288e-06, "loss": 1.5225, "mean_token_accuracy": 0.6788892596960068, "num_tokens": 9309421.0, "step": 11568 }, { "epoch": 3.0640889830508473, "grad_norm": 1.802313208580017, "learning_rate": 8.468087923728813e-06, "loss": 1.1739, "mean_token_accuracy": 0.7153978571295738, "num_tokens": 9311054.0, "step": 11570 }, { "epoch": 3.0646186440677967, "grad_norm": 1.752158284187317, "learning_rate": 8.46782309322034e-06, "loss": 1.3269, "mean_token_accuracy": 0.6919068917632103, "num_tokens": 9312590.0, "step": 11572 }, { "epoch": 3.0651483050847457, "grad_norm": 2.0570590496063232, "learning_rate": 8.467558262711865e-06, "loss": 1.4881, "mean_token_accuracy": 0.6731261014938354, "num_tokens": 9314019.0, "step": 11574 }, { "epoch": 3.065677966101695, "grad_norm": 1.158551573753357, "learning_rate": 8.467293432203391e-06, "loss": 0.7252, "mean_token_accuracy": 0.8187379315495491, "num_tokens": 9315662.0, "step": 11576 }, { "epoch": 3.066207627118644, "grad_norm": 2.1937031745910645, "learning_rate": 8.467028601694916e-06, "loss": 1.7022, "mean_token_accuracy": 0.6215777173638344, "num_tokens": 9317115.0, "step": 11578 }, { "epoch": 3.0667372881355934, "grad_norm": 1.88534414768219, "learning_rate": 8.466763771186441e-06, "loss": 1.5618, "mean_token_accuracy": 0.6397956684231758, "num_tokens": 9318597.0, "step": 11580 }, { "epoch": 3.0672669491525424, "grad_norm": 1.5852477550506592, "learning_rate": 8.466498940677966e-06, "loss": 0.9795, "mean_token_accuracy": 0.752977542579174, "num_tokens": 9320306.0, "step": 11582 }, { "epoch": 3.0677966101694913, "grad_norm": 2.012538433074951, "learning_rate": 8.466234110169493e-06, "loss": 1.2839, "mean_token_accuracy": 0.7019910849630833, "num_tokens": 9321763.0, "step": 11584 }, { "epoch": 3.0683262711864407, "grad_norm": 1.5380799770355225, "learning_rate": 8.465969279661018e-06, "loss": 1.0212, "mean_token_accuracy": 0.7410731911659241, "num_tokens": 9323642.0, "step": 11586 }, { "epoch": 3.0688559322033897, "grad_norm": 2.242525815963745, "learning_rate": 8.465704449152543e-06, "loss": 1.6965, "mean_token_accuracy": 0.6296417787671089, "num_tokens": 9325050.0, "step": 11588 }, { "epoch": 3.069385593220339, "grad_norm": 1.8273767232894897, "learning_rate": 8.465439618644068e-06, "loss": 1.6018, "mean_token_accuracy": 0.6597204133868217, "num_tokens": 9326564.0, "step": 11590 }, { "epoch": 3.069915254237288, "grad_norm": 1.5524823665618896, "learning_rate": 8.465174788135594e-06, "loss": 1.2257, "mean_token_accuracy": 0.6998121961951256, "num_tokens": 9328211.0, "step": 11592 }, { "epoch": 3.0704449152542375, "grad_norm": 1.6445871591567993, "learning_rate": 8.464909957627119e-06, "loss": 1.5778, "mean_token_accuracy": 0.6471014507114887, "num_tokens": 9329920.0, "step": 11594 }, { "epoch": 3.0709745762711864, "grad_norm": 1.9617817401885986, "learning_rate": 8.464645127118644e-06, "loss": 0.9043, "mean_token_accuracy": 0.7728669941425323, "num_tokens": 9331292.0, "step": 11596 }, { "epoch": 3.071504237288136, "grad_norm": 1.9677326679229736, "learning_rate": 8.46438029661017e-06, "loss": 1.5224, "mean_token_accuracy": 0.6761055663228035, "num_tokens": 9332921.0, "step": 11598 }, { "epoch": 3.0720338983050848, "grad_norm": 1.499873399734497, "learning_rate": 8.464115466101696e-06, "loss": 1.2299, "mean_token_accuracy": 0.7126211076974869, "num_tokens": 9334573.0, "step": 11600 }, { "epoch": 3.0725635593220337, "grad_norm": 1.758392333984375, "learning_rate": 8.463850635593222e-06, "loss": 1.2149, "mean_token_accuracy": 0.7084273248910904, "num_tokens": 9336118.0, "step": 11602 }, { "epoch": 3.073093220338983, "grad_norm": 1.893637776374817, "learning_rate": 8.463585805084747e-06, "loss": 1.6984, "mean_token_accuracy": 0.6375429406762123, "num_tokens": 9337948.0, "step": 11604 }, { "epoch": 3.073622881355932, "grad_norm": 1.6836323738098145, "learning_rate": 8.463320974576272e-06, "loss": 1.3181, "mean_token_accuracy": 0.6959356963634491, "num_tokens": 9339581.0, "step": 11606 }, { "epoch": 3.0741525423728815, "grad_norm": 1.8214471340179443, "learning_rate": 8.463056144067797e-06, "loss": 1.2982, "mean_token_accuracy": 0.709648035466671, "num_tokens": 9341119.0, "step": 11608 }, { "epoch": 3.0746822033898304, "grad_norm": 2.436410665512085, "learning_rate": 8.462791313559324e-06, "loss": 1.1359, "mean_token_accuracy": 0.7439079657196999, "num_tokens": 9342787.0, "step": 11610 }, { "epoch": 3.07521186440678, "grad_norm": 1.5323081016540527, "learning_rate": 8.462526483050849e-06, "loss": 1.1271, "mean_token_accuracy": 0.732190802693367, "num_tokens": 9344229.0, "step": 11612 }, { "epoch": 3.075741525423729, "grad_norm": 1.975176453590393, "learning_rate": 8.462261652542373e-06, "loss": 1.3394, "mean_token_accuracy": 0.6998809948563576, "num_tokens": 9346087.0, "step": 11614 }, { "epoch": 3.0762711864406778, "grad_norm": 1.6744264364242554, "learning_rate": 8.461996822033898e-06, "loss": 1.2604, "mean_token_accuracy": 0.702665776014328, "num_tokens": 9348004.0, "step": 11616 }, { "epoch": 3.076800847457627, "grad_norm": 1.8470150232315063, "learning_rate": 8.461731991525425e-06, "loss": 1.4359, "mean_token_accuracy": 0.6760299950838089, "num_tokens": 9349926.0, "step": 11618 }, { "epoch": 3.077330508474576, "grad_norm": 1.9874833822250366, "learning_rate": 8.46146716101695e-06, "loss": 1.8373, "mean_token_accuracy": 0.6430018395185471, "num_tokens": 9351419.0, "step": 11620 }, { "epoch": 3.0778601694915255, "grad_norm": 1.5818397998809814, "learning_rate": 8.461202330508475e-06, "loss": 1.0204, "mean_token_accuracy": 0.7426329478621483, "num_tokens": 9353234.0, "step": 11622 }, { "epoch": 3.0783898305084745, "grad_norm": 2.048114061355591, "learning_rate": 8.4609375e-06, "loss": 1.63, "mean_token_accuracy": 0.6367826238274574, "num_tokens": 9354747.0, "step": 11624 }, { "epoch": 3.078919491525424, "grad_norm": 1.5066035985946655, "learning_rate": 8.460672669491526e-06, "loss": 1.0535, "mean_token_accuracy": 0.743798203766346, "num_tokens": 9356354.0, "step": 11626 }, { "epoch": 3.079449152542373, "grad_norm": 1.9743213653564453, "learning_rate": 8.460407838983051e-06, "loss": 1.3674, "mean_token_accuracy": 0.688864104449749, "num_tokens": 9358038.0, "step": 11628 }, { "epoch": 3.0799788135593222, "grad_norm": 1.6629834175109863, "learning_rate": 8.460143008474578e-06, "loss": 1.4425, "mean_token_accuracy": 0.6591451466083527, "num_tokens": 9359916.0, "step": 11630 }, { "epoch": 3.080508474576271, "grad_norm": 1.7120472192764282, "learning_rate": 8.459878177966103e-06, "loss": 1.1001, "mean_token_accuracy": 0.7421559169888496, "num_tokens": 9361662.0, "step": 11632 }, { "epoch": 3.08103813559322, "grad_norm": 1.6746649742126465, "learning_rate": 8.459613347457628e-06, "loss": 1.5891, "mean_token_accuracy": 0.649322472512722, "num_tokens": 9363443.0, "step": 11634 }, { "epoch": 3.0815677966101696, "grad_norm": 1.8869298696517944, "learning_rate": 8.459348516949153e-06, "loss": 1.3666, "mean_token_accuracy": 0.6989549398422241, "num_tokens": 9364900.0, "step": 11636 }, { "epoch": 3.0820974576271185, "grad_norm": 1.774533987045288, "learning_rate": 8.45908368644068e-06, "loss": 1.1765, "mean_token_accuracy": 0.7291575223207474, "num_tokens": 9366263.0, "step": 11638 }, { "epoch": 3.082627118644068, "grad_norm": 2.640221118927002, "learning_rate": 8.458818855932204e-06, "loss": 1.3427, "mean_token_accuracy": 0.7084162905812263, "num_tokens": 9367607.0, "step": 11640 }, { "epoch": 3.083156779661017, "grad_norm": 2.058924913406372, "learning_rate": 8.45855402542373e-06, "loss": 1.2571, "mean_token_accuracy": 0.7310393005609512, "num_tokens": 9369236.0, "step": 11642 }, { "epoch": 3.0836864406779663, "grad_norm": 2.1145706176757812, "learning_rate": 8.458289194915254e-06, "loss": 0.9523, "mean_token_accuracy": 0.7803576588630676, "num_tokens": 9370431.0, "step": 11644 }, { "epoch": 3.0842161016949152, "grad_norm": 1.9324069023132324, "learning_rate": 8.45802436440678e-06, "loss": 1.2277, "mean_token_accuracy": 0.7228641510009766, "num_tokens": 9372117.0, "step": 11646 }, { "epoch": 3.084745762711864, "grad_norm": 1.595565915107727, "learning_rate": 8.457759533898306e-06, "loss": 1.4523, "mean_token_accuracy": 0.6686695665121078, "num_tokens": 9373985.0, "step": 11648 }, { "epoch": 3.0852754237288136, "grad_norm": 1.744041085243225, "learning_rate": 8.45749470338983e-06, "loss": 1.391, "mean_token_accuracy": 0.7100015580654144, "num_tokens": 9375630.0, "step": 11650 }, { "epoch": 3.0858050847457625, "grad_norm": 1.5992263555526733, "learning_rate": 8.457229872881356e-06, "loss": 1.2087, "mean_token_accuracy": 0.7119836285710335, "num_tokens": 9377305.0, "step": 11652 }, { "epoch": 3.086334745762712, "grad_norm": 2.0360026359558105, "learning_rate": 8.456965042372882e-06, "loss": 1.3489, "mean_token_accuracy": 0.6907888203859329, "num_tokens": 9378596.0, "step": 11654 }, { "epoch": 3.086864406779661, "grad_norm": 1.874087929725647, "learning_rate": 8.456700211864407e-06, "loss": 1.4076, "mean_token_accuracy": 0.673649787902832, "num_tokens": 9380207.0, "step": 11656 }, { "epoch": 3.0873940677966103, "grad_norm": 1.6504391431808472, "learning_rate": 8.456435381355934e-06, "loss": 1.1661, "mean_token_accuracy": 0.7513282001018524, "num_tokens": 9381859.0, "step": 11658 }, { "epoch": 3.0879237288135593, "grad_norm": 1.6845946311950684, "learning_rate": 8.456170550847457e-06, "loss": 1.2283, "mean_token_accuracy": 0.7335245087742805, "num_tokens": 9383251.0, "step": 11660 }, { "epoch": 3.0884533898305087, "grad_norm": 1.6926689147949219, "learning_rate": 8.455905720338984e-06, "loss": 1.1518, "mean_token_accuracy": 0.7340750619769096, "num_tokens": 9384846.0, "step": 11662 }, { "epoch": 3.0889830508474576, "grad_norm": 1.8253111839294434, "learning_rate": 8.455640889830509e-06, "loss": 0.9244, "mean_token_accuracy": 0.7790823355317116, "num_tokens": 9386315.0, "step": 11664 }, { "epoch": 3.0895127118644066, "grad_norm": 1.9140760898590088, "learning_rate": 8.455376059322035e-06, "loss": 1.3473, "mean_token_accuracy": 0.6646919623017311, "num_tokens": 9387732.0, "step": 11666 }, { "epoch": 3.090042372881356, "grad_norm": 1.8717796802520752, "learning_rate": 8.45511122881356e-06, "loss": 1.1806, "mean_token_accuracy": 0.7269835025072098, "num_tokens": 9389219.0, "step": 11668 }, { "epoch": 3.090572033898305, "grad_norm": 1.5272271633148193, "learning_rate": 8.454846398305085e-06, "loss": 1.0895, "mean_token_accuracy": 0.7319668680429459, "num_tokens": 9391050.0, "step": 11670 }, { "epoch": 3.0911016949152543, "grad_norm": 1.5244019031524658, "learning_rate": 8.45458156779661e-06, "loss": 1.1347, "mean_token_accuracy": 0.7352767586708069, "num_tokens": 9392767.0, "step": 11672 }, { "epoch": 3.0916313559322033, "grad_norm": 1.5460432767868042, "learning_rate": 8.454316737288137e-06, "loss": 0.7712, "mean_token_accuracy": 0.7844155579805374, "num_tokens": 9394750.0, "step": 11674 }, { "epoch": 3.0921610169491527, "grad_norm": 1.5683858394622803, "learning_rate": 8.454051906779662e-06, "loss": 0.9843, "mean_token_accuracy": 0.7717676907777786, "num_tokens": 9396292.0, "step": 11676 }, { "epoch": 3.0926906779661016, "grad_norm": 1.883030891418457, "learning_rate": 8.453787076271186e-06, "loss": 1.5903, "mean_token_accuracy": 0.6634291484951973, "num_tokens": 9397795.0, "step": 11678 }, { "epoch": 3.093220338983051, "grad_norm": 1.7506132125854492, "learning_rate": 8.453522245762713e-06, "loss": 1.1267, "mean_token_accuracy": 0.7392234057188034, "num_tokens": 9399397.0, "step": 11680 }, { "epoch": 3.09375, "grad_norm": 2.002699136734009, "learning_rate": 8.453257415254238e-06, "loss": 1.1418, "mean_token_accuracy": 0.7539604753255844, "num_tokens": 9400724.0, "step": 11682 }, { "epoch": 3.094279661016949, "grad_norm": 1.8718212842941284, "learning_rate": 8.452992584745765e-06, "loss": 1.503, "mean_token_accuracy": 0.6587576121091843, "num_tokens": 9402091.0, "step": 11684 }, { "epoch": 3.0948093220338984, "grad_norm": 1.8590502738952637, "learning_rate": 8.45272775423729e-06, "loss": 1.2945, "mean_token_accuracy": 0.6920560076832771, "num_tokens": 9403934.0, "step": 11686 }, { "epoch": 3.0953389830508473, "grad_norm": 1.6896165609359741, "learning_rate": 8.452462923728815e-06, "loss": 1.0927, "mean_token_accuracy": 0.711351603269577, "num_tokens": 9405519.0, "step": 11688 }, { "epoch": 3.0958686440677967, "grad_norm": 1.8057670593261719, "learning_rate": 8.45219809322034e-06, "loss": 1.4255, "mean_token_accuracy": 0.6844894886016846, "num_tokens": 9407050.0, "step": 11690 }, { "epoch": 3.0963983050847457, "grad_norm": 1.6626927852630615, "learning_rate": 8.451933262711866e-06, "loss": 1.514, "mean_token_accuracy": 0.6542497053742409, "num_tokens": 9409122.0, "step": 11692 }, { "epoch": 3.096927966101695, "grad_norm": 1.660093069076538, "learning_rate": 8.451668432203391e-06, "loss": 1.1156, "mean_token_accuracy": 0.7318192422389984, "num_tokens": 9410479.0, "step": 11694 }, { "epoch": 3.097457627118644, "grad_norm": 1.7724651098251343, "learning_rate": 8.451403601694916e-06, "loss": 1.1882, "mean_token_accuracy": 0.7356176152825356, "num_tokens": 9412014.0, "step": 11696 }, { "epoch": 3.0979872881355934, "grad_norm": 1.747941493988037, "learning_rate": 8.45113877118644e-06, "loss": 1.0891, "mean_token_accuracy": 0.7375666350126266, "num_tokens": 9413706.0, "step": 11698 }, { "epoch": 3.0985169491525424, "grad_norm": 1.8945412635803223, "learning_rate": 8.450873940677967e-06, "loss": 1.4119, "mean_token_accuracy": 0.6862372010946274, "num_tokens": 9415200.0, "step": 11700 }, { "epoch": 3.0990466101694913, "grad_norm": 1.8749457597732544, "learning_rate": 8.450609110169492e-06, "loss": 1.111, "mean_token_accuracy": 0.7577391788363457, "num_tokens": 9416541.0, "step": 11702 }, { "epoch": 3.0995762711864407, "grad_norm": 1.810222864151001, "learning_rate": 8.450344279661017e-06, "loss": 1.5547, "mean_token_accuracy": 0.6398008912801743, "num_tokens": 9418346.0, "step": 11704 }, { "epoch": 3.1001059322033897, "grad_norm": 1.6298884153366089, "learning_rate": 8.450079449152542e-06, "loss": 1.2627, "mean_token_accuracy": 0.7058043107390404, "num_tokens": 9419930.0, "step": 11706 }, { "epoch": 3.100635593220339, "grad_norm": 1.3092206716537476, "learning_rate": 8.449814618644069e-06, "loss": 0.729, "mean_token_accuracy": 0.8120924755930901, "num_tokens": 9422194.0, "step": 11708 }, { "epoch": 3.101165254237288, "grad_norm": 1.685875654220581, "learning_rate": 8.449549788135594e-06, "loss": 1.1091, "mean_token_accuracy": 0.7518602162599564, "num_tokens": 9423882.0, "step": 11710 }, { "epoch": 3.1016949152542375, "grad_norm": 1.717512607574463, "learning_rate": 8.44928495762712e-06, "loss": 1.4198, "mean_token_accuracy": 0.6646414250135422, "num_tokens": 9425951.0, "step": 11712 }, { "epoch": 3.1022245762711864, "grad_norm": 2.0244784355163574, "learning_rate": 8.449020127118644e-06, "loss": 1.6175, "mean_token_accuracy": 0.6319579631090164, "num_tokens": 9427362.0, "step": 11714 }, { "epoch": 3.102754237288136, "grad_norm": 2.009871006011963, "learning_rate": 8.44875529661017e-06, "loss": 1.0066, "mean_token_accuracy": 0.759753130376339, "num_tokens": 9428590.0, "step": 11716 }, { "epoch": 3.1032838983050848, "grad_norm": 1.5856882333755493, "learning_rate": 8.448490466101695e-06, "loss": 1.2903, "mean_token_accuracy": 0.6825641766190529, "num_tokens": 9430388.0, "step": 11718 }, { "epoch": 3.1038135593220337, "grad_norm": 1.7917333841323853, "learning_rate": 8.448225635593222e-06, "loss": 1.5619, "mean_token_accuracy": 0.6637952253222466, "num_tokens": 9432288.0, "step": 11720 }, { "epoch": 3.104343220338983, "grad_norm": 1.6712177991867065, "learning_rate": 8.447960805084747e-06, "loss": 1.4845, "mean_token_accuracy": 0.6606718078255653, "num_tokens": 9434158.0, "step": 11722 }, { "epoch": 3.104872881355932, "grad_norm": 2.0878403186798096, "learning_rate": 8.447695974576272e-06, "loss": 1.0482, "mean_token_accuracy": 0.7369219213724136, "num_tokens": 9435577.0, "step": 11724 }, { "epoch": 3.1054025423728815, "grad_norm": 1.9912655353546143, "learning_rate": 8.447431144067797e-06, "loss": 1.4195, "mean_token_accuracy": 0.6806275621056557, "num_tokens": 9437351.0, "step": 11726 }, { "epoch": 3.1059322033898304, "grad_norm": 1.7781137228012085, "learning_rate": 8.447166313559323e-06, "loss": 1.0272, "mean_token_accuracy": 0.7537553310394287, "num_tokens": 9438610.0, "step": 11728 }, { "epoch": 3.10646186440678, "grad_norm": 1.8963133096694946, "learning_rate": 8.446901483050848e-06, "loss": 0.9767, "mean_token_accuracy": 0.752462737262249, "num_tokens": 9440008.0, "step": 11730 }, { "epoch": 3.106991525423729, "grad_norm": 1.6476255655288696, "learning_rate": 8.446636652542373e-06, "loss": 1.2064, "mean_token_accuracy": 0.7066522464156151, "num_tokens": 9442005.0, "step": 11732 }, { "epoch": 3.1075211864406778, "grad_norm": 2.4554240703582764, "learning_rate": 8.446371822033898e-06, "loss": 1.4024, "mean_token_accuracy": 0.6971220783889294, "num_tokens": 9443469.0, "step": 11734 }, { "epoch": 3.108050847457627, "grad_norm": 1.58473539352417, "learning_rate": 8.446106991525425e-06, "loss": 0.8787, "mean_token_accuracy": 0.8058570399880409, "num_tokens": 9444811.0, "step": 11736 }, { "epoch": 3.108580508474576, "grad_norm": 1.662360429763794, "learning_rate": 8.44584216101695e-06, "loss": 1.2605, "mean_token_accuracy": 0.7115848213434219, "num_tokens": 9446396.0, "step": 11738 }, { "epoch": 3.1091101694915255, "grad_norm": 1.6236172914505005, "learning_rate": 8.445577330508476e-06, "loss": 0.8017, "mean_token_accuracy": 0.7978779822587967, "num_tokens": 9447806.0, "step": 11740 }, { "epoch": 3.1096398305084745, "grad_norm": 1.690811038017273, "learning_rate": 8.4453125e-06, "loss": 1.0151, "mean_token_accuracy": 0.7586173787713051, "num_tokens": 9449034.0, "step": 11742 }, { "epoch": 3.110169491525424, "grad_norm": 1.5417917966842651, "learning_rate": 8.445047669491526e-06, "loss": 1.1281, "mean_token_accuracy": 0.7143570631742477, "num_tokens": 9450673.0, "step": 11744 }, { "epoch": 3.110699152542373, "grad_norm": 1.8608700037002563, "learning_rate": 8.444782838983051e-06, "loss": 1.9445, "mean_token_accuracy": 0.5853975638747215, "num_tokens": 9452329.0, "step": 11746 }, { "epoch": 3.1112288135593222, "grad_norm": 2.326016902923584, "learning_rate": 8.444518008474578e-06, "loss": 1.5553, "mean_token_accuracy": 0.6434625908732414, "num_tokens": 9454355.0, "step": 11748 }, { "epoch": 3.111758474576271, "grad_norm": 2.090195894241333, "learning_rate": 8.444253177966103e-06, "loss": 1.0324, "step": 11750 }, { "epoch": 3.111758474576271, "eval_loss": 1.3124076128005981, "eval_mean_token_accuracy": 0.7012273116738765, "eval_num_tokens": 9455693.0, "eval_runtime": 48.4283, "eval_samples_per_second": 6.36, "eval_steps_per_second": 6.36, "step": 11750 }, { "epoch": 3.11228813559322, "grad_norm": 2.0447397232055664, "learning_rate": 8.443988347457627e-06, "loss": 1.2347, "mean_token_accuracy": 0.7237461768090725, "num_tokens": 9457367.0, "step": 11752 }, { "epoch": 3.1128177966101696, "grad_norm": 1.617278814315796, "learning_rate": 8.443723516949152e-06, "loss": 1.3072, "mean_token_accuracy": 0.692838616669178, "num_tokens": 9458856.0, "step": 11754 }, { "epoch": 3.1133474576271185, "grad_norm": 1.6859666109085083, "learning_rate": 8.443458686440679e-06, "loss": 1.3028, "mean_token_accuracy": 0.6807587370276451, "num_tokens": 9460451.0, "step": 11756 }, { "epoch": 3.113877118644068, "grad_norm": 1.5749893188476562, "learning_rate": 8.443193855932204e-06, "loss": 1.4615, "mean_token_accuracy": 0.666002131998539, "num_tokens": 9462349.0, "step": 11758 }, { "epoch": 3.114406779661017, "grad_norm": 1.3702139854431152, "learning_rate": 8.442929025423729e-06, "loss": 1.1407, "mean_token_accuracy": 0.7349101528525352, "num_tokens": 9464118.0, "step": 11760 }, { "epoch": 3.1149364406779663, "grad_norm": 1.6153836250305176, "learning_rate": 8.442664194915256e-06, "loss": 1.4389, "mean_token_accuracy": 0.6551721021533012, "num_tokens": 9465723.0, "step": 11762 }, { "epoch": 3.1154661016949152, "grad_norm": 1.8313190937042236, "learning_rate": 8.44239936440678e-06, "loss": 1.3152, "mean_token_accuracy": 0.7063721567392349, "num_tokens": 9467107.0, "step": 11764 }, { "epoch": 3.115995762711864, "grad_norm": 1.975706696510315, "learning_rate": 8.442134533898307e-06, "loss": 1.5311, "mean_token_accuracy": 0.6661925539374352, "num_tokens": 9468693.0, "step": 11766 }, { "epoch": 3.1165254237288136, "grad_norm": 1.5566740036010742, "learning_rate": 8.44186970338983e-06, "loss": 1.2896, "mean_token_accuracy": 0.7119782567024231, "num_tokens": 9470262.0, "step": 11768 }, { "epoch": 3.1170550847457625, "grad_norm": 1.5437767505645752, "learning_rate": 8.441604872881357e-06, "loss": 0.6979, "mean_token_accuracy": 0.8260496631264687, "num_tokens": 9471881.0, "step": 11770 }, { "epoch": 3.117584745762712, "grad_norm": 1.6471234560012817, "learning_rate": 8.441340042372882e-06, "loss": 1.396, "mean_token_accuracy": 0.6761399731040001, "num_tokens": 9473590.0, "step": 11772 }, { "epoch": 3.118114406779661, "grad_norm": 1.2824492454528809, "learning_rate": 8.441075211864408e-06, "loss": 0.7785, "mean_token_accuracy": 0.7947663888335228, "num_tokens": 9475349.0, "step": 11774 }, { "epoch": 3.1186440677966103, "grad_norm": 1.9233698844909668, "learning_rate": 8.440810381355933e-06, "loss": 1.7569, "mean_token_accuracy": 0.6226881705224514, "num_tokens": 9476751.0, "step": 11776 }, { "epoch": 3.1191737288135593, "grad_norm": 1.8401243686676025, "learning_rate": 8.440545550847458e-06, "loss": 1.2122, "mean_token_accuracy": 0.729132242500782, "num_tokens": 9478276.0, "step": 11778 }, { "epoch": 3.1197033898305087, "grad_norm": 1.2251633405685425, "learning_rate": 8.440280720338983e-06, "loss": 0.643, "mean_token_accuracy": 0.8204642683267593, "num_tokens": 9479757.0, "step": 11780 }, { "epoch": 3.1202330508474576, "grad_norm": 1.7645334005355835, "learning_rate": 8.44001588983051e-06, "loss": 1.5315, "mean_token_accuracy": 0.6459161415696144, "num_tokens": 9481447.0, "step": 11782 }, { "epoch": 3.1207627118644066, "grad_norm": 1.8255292177200317, "learning_rate": 8.439751059322035e-06, "loss": 1.0683, "mean_token_accuracy": 0.770551398396492, "num_tokens": 9483264.0, "step": 11784 }, { "epoch": 3.121292372881356, "grad_norm": 1.668271541595459, "learning_rate": 8.43948622881356e-06, "loss": 1.0398, "mean_token_accuracy": 0.7438390105962753, "num_tokens": 9484745.0, "step": 11786 }, { "epoch": 3.121822033898305, "grad_norm": 1.3515101671218872, "learning_rate": 8.439221398305085e-06, "loss": 0.7849, "mean_token_accuracy": 0.7493544295430183, "num_tokens": 9487313.0, "step": 11788 }, { "epoch": 3.1223516949152543, "grad_norm": 1.9049080610275269, "learning_rate": 8.438956567796611e-06, "loss": 1.4334, "mean_token_accuracy": 0.6793962568044662, "num_tokens": 9489110.0, "step": 11790 }, { "epoch": 3.1228813559322033, "grad_norm": 1.758172631263733, "learning_rate": 8.438691737288136e-06, "loss": 1.082, "mean_token_accuracy": 0.7342774868011475, "num_tokens": 9490873.0, "step": 11792 }, { "epoch": 3.1234110169491527, "grad_norm": 1.7174152135849, "learning_rate": 8.438426906779663e-06, "loss": 1.3657, "mean_token_accuracy": 0.6654440686106682, "num_tokens": 9492594.0, "step": 11794 }, { "epoch": 3.1239406779661016, "grad_norm": 1.7514861822128296, "learning_rate": 8.438162076271186e-06, "loss": 1.3502, "mean_token_accuracy": 0.7150283306837082, "num_tokens": 9494153.0, "step": 11796 }, { "epoch": 3.124470338983051, "grad_norm": 2.1862223148345947, "learning_rate": 8.437897245762713e-06, "loss": 1.212, "mean_token_accuracy": 0.707667239010334, "num_tokens": 9495773.0, "step": 11798 }, { "epoch": 3.125, "grad_norm": 1.774545431137085, "learning_rate": 8.437632415254238e-06, "loss": 1.4845, "mean_token_accuracy": 0.6897734105587006, "num_tokens": 9497378.0, "step": 11800 }, { "epoch": 3.125529661016949, "grad_norm": 1.6169673204421997, "learning_rate": 8.437367584745764e-06, "loss": 1.1936, "mean_token_accuracy": 0.7092271372675896, "num_tokens": 9498836.0, "step": 11802 }, { "epoch": 3.1260593220338984, "grad_norm": 1.8369488716125488, "learning_rate": 8.43710275423729e-06, "loss": 1.4371, "mean_token_accuracy": 0.6712436527013779, "num_tokens": 9500513.0, "step": 11804 }, { "epoch": 3.1265889830508473, "grad_norm": 1.873849868774414, "learning_rate": 8.436837923728814e-06, "loss": 1.271, "mean_token_accuracy": 0.6893176808953285, "num_tokens": 9501959.0, "step": 11806 }, { "epoch": 3.1271186440677967, "grad_norm": 2.4357614517211914, "learning_rate": 8.436573093220339e-06, "loss": 1.8503, "mean_token_accuracy": 0.587828129529953, "num_tokens": 9503602.0, "step": 11808 }, { "epoch": 3.1276483050847457, "grad_norm": 1.9509475231170654, "learning_rate": 8.436308262711866e-06, "loss": 0.9051, "mean_token_accuracy": 0.7649279609322548, "num_tokens": 9505200.0, "step": 11810 }, { "epoch": 3.128177966101695, "grad_norm": 1.887943148612976, "learning_rate": 8.43604343220339e-06, "loss": 1.1581, "mean_token_accuracy": 0.7284858971834183, "num_tokens": 9506569.0, "step": 11812 }, { "epoch": 3.128707627118644, "grad_norm": 2.0953917503356934, "learning_rate": 8.435778601694916e-06, "loss": 1.1182, "mean_token_accuracy": 0.7374480441212654, "num_tokens": 9508069.0, "step": 11814 }, { "epoch": 3.1292372881355934, "grad_norm": 1.6696112155914307, "learning_rate": 8.43551377118644e-06, "loss": 0.9951, "mean_token_accuracy": 0.7578147053718567, "num_tokens": 9509494.0, "step": 11816 }, { "epoch": 3.1297669491525424, "grad_norm": 1.6177586317062378, "learning_rate": 8.435248940677967e-06, "loss": 1.0271, "mean_token_accuracy": 0.7531822174787521, "num_tokens": 9511167.0, "step": 11818 }, { "epoch": 3.1302966101694913, "grad_norm": 1.8123222589492798, "learning_rate": 8.434984110169492e-06, "loss": 1.0046, "mean_token_accuracy": 0.7576834484934807, "num_tokens": 9513050.0, "step": 11820 }, { "epoch": 3.1308262711864407, "grad_norm": 1.844171166419983, "learning_rate": 8.434719279661017e-06, "loss": 1.3655, "mean_token_accuracy": 0.6940274462103844, "num_tokens": 9514730.0, "step": 11822 }, { "epoch": 3.1313559322033897, "grad_norm": 1.8570159673690796, "learning_rate": 8.434454449152542e-06, "loss": 1.4918, "mean_token_accuracy": 0.6791543141007423, "num_tokens": 9516475.0, "step": 11824 }, { "epoch": 3.131885593220339, "grad_norm": 1.7940260171890259, "learning_rate": 8.434189618644068e-06, "loss": 1.2904, "mean_token_accuracy": 0.7175885736942291, "num_tokens": 9518169.0, "step": 11826 }, { "epoch": 3.132415254237288, "grad_norm": 2.308607816696167, "learning_rate": 8.433924788135593e-06, "loss": 1.3174, "mean_token_accuracy": 0.70697370916605, "num_tokens": 9519486.0, "step": 11828 }, { "epoch": 3.1329449152542375, "grad_norm": 1.6490131616592407, "learning_rate": 8.43365995762712e-06, "loss": 1.7446, "mean_token_accuracy": 0.6181913129985332, "num_tokens": 9521322.0, "step": 11830 }, { "epoch": 3.1334745762711864, "grad_norm": 1.3410159349441528, "learning_rate": 8.433395127118645e-06, "loss": 0.9401, "mean_token_accuracy": 0.7696905434131622, "num_tokens": 9522702.0, "step": 11832 }, { "epoch": 3.134004237288136, "grad_norm": 1.5033634901046753, "learning_rate": 8.43313029661017e-06, "loss": 0.8515, "mean_token_accuracy": 0.7945396229624748, "num_tokens": 9524171.0, "step": 11834 }, { "epoch": 3.1345338983050848, "grad_norm": 2.111921548843384, "learning_rate": 8.432865466101695e-06, "loss": 1.4587, "mean_token_accuracy": 0.696864627301693, "num_tokens": 9525808.0, "step": 11836 }, { "epoch": 3.1350635593220337, "grad_norm": 1.9983789920806885, "learning_rate": 8.432600635593221e-06, "loss": 1.3886, "mean_token_accuracy": 0.6956151686608791, "num_tokens": 9527403.0, "step": 11838 }, { "epoch": 3.135593220338983, "grad_norm": 2.146369457244873, "learning_rate": 8.432335805084746e-06, "loss": 1.3549, "mean_token_accuracy": 0.6904269456863403, "num_tokens": 9528972.0, "step": 11840 }, { "epoch": 3.136122881355932, "grad_norm": 1.8129304647445679, "learning_rate": 8.432070974576271e-06, "loss": 1.38, "mean_token_accuracy": 0.6905135810375214, "num_tokens": 9530488.0, "step": 11842 }, { "epoch": 3.1366525423728815, "grad_norm": 1.881778597831726, "learning_rate": 8.431806144067798e-06, "loss": 1.3612, "mean_token_accuracy": 0.6609711870551109, "num_tokens": 9532008.0, "step": 11844 }, { "epoch": 3.1371822033898304, "grad_norm": 1.5720528364181519, "learning_rate": 8.431541313559323e-06, "loss": 1.3476, "mean_token_accuracy": 0.6901626661419868, "num_tokens": 9533520.0, "step": 11846 }, { "epoch": 3.13771186440678, "grad_norm": 1.7682331800460815, "learning_rate": 8.43127648305085e-06, "loss": 1.2157, "mean_token_accuracy": 0.7187749221920967, "num_tokens": 9534909.0, "step": 11848 }, { "epoch": 3.138241525423729, "grad_norm": 1.6794573068618774, "learning_rate": 8.431011652542373e-06, "loss": 1.1902, "mean_token_accuracy": 0.7321327701210976, "num_tokens": 9536590.0, "step": 11850 }, { "epoch": 3.1387711864406778, "grad_norm": 2.2142884731292725, "learning_rate": 8.4307468220339e-06, "loss": 1.1594, "mean_token_accuracy": 0.7352520450949669, "num_tokens": 9537809.0, "step": 11852 }, { "epoch": 3.139300847457627, "grad_norm": 1.6886359453201294, "learning_rate": 8.430481991525424e-06, "loss": 1.1348, "mean_token_accuracy": 0.7306363433599472, "num_tokens": 9539638.0, "step": 11854 }, { "epoch": 3.139830508474576, "grad_norm": 1.541693091392517, "learning_rate": 8.430217161016951e-06, "loss": 1.3859, "mean_token_accuracy": 0.6978809982538223, "num_tokens": 9541302.0, "step": 11856 }, { "epoch": 3.1403601694915255, "grad_norm": 1.9400838613510132, "learning_rate": 8.429952330508476e-06, "loss": 1.6291, "mean_token_accuracy": 0.6466951817274094, "num_tokens": 9543073.0, "step": 11858 }, { "epoch": 3.1408898305084745, "grad_norm": 1.7257905006408691, "learning_rate": 8.4296875e-06, "loss": 1.3951, "mean_token_accuracy": 0.6787741184234619, "num_tokens": 9544823.0, "step": 11860 }, { "epoch": 3.141419491525424, "grad_norm": 2.4062044620513916, "learning_rate": 8.429422669491526e-06, "loss": 1.5365, "mean_token_accuracy": 0.6594928577542305, "num_tokens": 9546226.0, "step": 11862 }, { "epoch": 3.141949152542373, "grad_norm": 2.1499195098876953, "learning_rate": 8.429157838983052e-06, "loss": 1.3196, "mean_token_accuracy": 0.7140181064605713, "num_tokens": 9547625.0, "step": 11864 }, { "epoch": 3.1424788135593222, "grad_norm": 1.3812445402145386, "learning_rate": 8.428893008474577e-06, "loss": 0.8218, "mean_token_accuracy": 0.7844278961420059, "num_tokens": 9549219.0, "step": 11866 }, { "epoch": 3.143008474576271, "grad_norm": 2.136221408843994, "learning_rate": 8.428628177966102e-06, "loss": 1.0927, "mean_token_accuracy": 0.731702022254467, "num_tokens": 9550574.0, "step": 11868 }, { "epoch": 3.14353813559322, "grad_norm": 1.4651708602905273, "learning_rate": 8.428363347457627e-06, "loss": 1.0019, "mean_token_accuracy": 0.7427553981542587, "num_tokens": 9552308.0, "step": 11870 }, { "epoch": 3.1440677966101696, "grad_norm": 2.2917962074279785, "learning_rate": 8.428098516949154e-06, "loss": 1.5451, "mean_token_accuracy": 0.6519871763885021, "num_tokens": 9553573.0, "step": 11872 }, { "epoch": 3.1445974576271185, "grad_norm": 1.9096078872680664, "learning_rate": 8.427833686440679e-06, "loss": 1.2796, "mean_token_accuracy": 0.6932927034795284, "num_tokens": 9555256.0, "step": 11874 }, { "epoch": 3.145127118644068, "grad_norm": 1.8420608043670654, "learning_rate": 8.427568855932204e-06, "loss": 1.1062, "mean_token_accuracy": 0.7359240725636482, "num_tokens": 9556758.0, "step": 11876 }, { "epoch": 3.145656779661017, "grad_norm": 1.8885929584503174, "learning_rate": 8.427304025423729e-06, "loss": 1.3554, "mean_token_accuracy": 0.6855134293437004, "num_tokens": 9558397.0, "step": 11878 }, { "epoch": 3.1461864406779663, "grad_norm": 1.8777649402618408, "learning_rate": 8.427039194915255e-06, "loss": 1.4544, "mean_token_accuracy": 0.6596017852425575, "num_tokens": 9559988.0, "step": 11880 }, { "epoch": 3.1467161016949152, "grad_norm": 2.026355504989624, "learning_rate": 8.42677436440678e-06, "loss": 1.1051, "mean_token_accuracy": 0.7325623109936714, "num_tokens": 9561429.0, "step": 11882 }, { "epoch": 3.147245762711864, "grad_norm": 2.0045154094696045, "learning_rate": 8.426509533898307e-06, "loss": 1.2223, "mean_token_accuracy": 0.7452194839715958, "num_tokens": 9562849.0, "step": 11884 }, { "epoch": 3.1477754237288136, "grad_norm": 2.6622376441955566, "learning_rate": 8.426244703389832e-06, "loss": 1.4448, "mean_token_accuracy": 0.6955912932753563, "num_tokens": 9564135.0, "step": 11886 }, { "epoch": 3.1483050847457625, "grad_norm": 1.9148404598236084, "learning_rate": 8.425979872881357e-06, "loss": 1.6362, "mean_token_accuracy": 0.6523889973759651, "num_tokens": 9565700.0, "step": 11888 }, { "epoch": 3.148834745762712, "grad_norm": 1.5297024250030518, "learning_rate": 8.425715042372881e-06, "loss": 1.0736, "mean_token_accuracy": 0.7418124675750732, "num_tokens": 9567160.0, "step": 11890 }, { "epoch": 3.149364406779661, "grad_norm": 1.9441581964492798, "learning_rate": 8.425450211864408e-06, "loss": 1.5249, "mean_token_accuracy": 0.6625266000628471, "num_tokens": 9568916.0, "step": 11892 }, { "epoch": 3.1498940677966103, "grad_norm": 2.3557932376861572, "learning_rate": 8.425185381355933e-06, "loss": 1.1124, "mean_token_accuracy": 0.7331303805112839, "num_tokens": 9570327.0, "step": 11894 }, { "epoch": 3.1504237288135593, "grad_norm": 1.7428609132766724, "learning_rate": 8.424920550847458e-06, "loss": 0.9131, "mean_token_accuracy": 0.7699535936117172, "num_tokens": 9571854.0, "step": 11896 }, { "epoch": 3.1509533898305087, "grad_norm": 2.028897285461426, "learning_rate": 8.424655720338983e-06, "loss": 1.2069, "mean_token_accuracy": 0.7282239533960819, "num_tokens": 9573321.0, "step": 11898 }, { "epoch": 3.1514830508474576, "grad_norm": 1.9460026025772095, "learning_rate": 8.42439088983051e-06, "loss": 1.1941, "mean_token_accuracy": 0.7054263725876808, "num_tokens": 9574949.0, "step": 11900 }, { "epoch": 3.1520127118644066, "grad_norm": 1.8071757555007935, "learning_rate": 8.424126059322034e-06, "loss": 1.4187, "mean_token_accuracy": 0.7016135305166245, "num_tokens": 9576782.0, "step": 11902 }, { "epoch": 3.152542372881356, "grad_norm": 1.930417537689209, "learning_rate": 8.42386122881356e-06, "loss": 1.3073, "mean_token_accuracy": 0.704391710460186, "num_tokens": 9578463.0, "step": 11904 }, { "epoch": 3.153072033898305, "grad_norm": 1.7722864151000977, "learning_rate": 8.423596398305084e-06, "loss": 1.1385, "mean_token_accuracy": 0.7103897109627724, "num_tokens": 9579912.0, "step": 11906 }, { "epoch": 3.1536016949152543, "grad_norm": 1.4663925170898438, "learning_rate": 8.423331567796611e-06, "loss": 0.936, "mean_token_accuracy": 0.7600322142243385, "num_tokens": 9581554.0, "step": 11908 }, { "epoch": 3.1541313559322033, "grad_norm": 1.2835524082183838, "learning_rate": 8.423066737288136e-06, "loss": 0.8854, "mean_token_accuracy": 0.7853992059826851, "num_tokens": 9583459.0, "step": 11910 }, { "epoch": 3.1546610169491527, "grad_norm": 1.3469785451889038, "learning_rate": 8.422801906779662e-06, "loss": 1.0573, "mean_token_accuracy": 0.7248022183775902, "num_tokens": 9585118.0, "step": 11912 }, { "epoch": 3.1551906779661016, "grad_norm": 1.863709807395935, "learning_rate": 8.422537076271187e-06, "loss": 1.6778, "mean_token_accuracy": 0.6283418834209442, "num_tokens": 9586737.0, "step": 11914 }, { "epoch": 3.155720338983051, "grad_norm": 1.571260929107666, "learning_rate": 8.422272245762712e-06, "loss": 1.2203, "mean_token_accuracy": 0.7096971422433853, "num_tokens": 9588518.0, "step": 11916 }, { "epoch": 3.15625, "grad_norm": 1.6795706748962402, "learning_rate": 8.422007415254237e-06, "loss": 0.9019, "mean_token_accuracy": 0.7638954892754555, "num_tokens": 9590389.0, "step": 11918 }, { "epoch": 3.156779661016949, "grad_norm": 1.9381179809570312, "learning_rate": 8.421742584745764e-06, "loss": 1.5559, "mean_token_accuracy": 0.6681340709328651, "num_tokens": 9592016.0, "step": 11920 }, { "epoch": 3.1573093220338984, "grad_norm": 1.5526459217071533, "learning_rate": 8.421477754237289e-06, "loss": 1.1821, "mean_token_accuracy": 0.7268192172050476, "num_tokens": 9593543.0, "step": 11922 }, { "epoch": 3.1578389830508473, "grad_norm": 1.743451714515686, "learning_rate": 8.421212923728814e-06, "loss": 1.4248, "mean_token_accuracy": 0.6896917447447777, "num_tokens": 9595423.0, "step": 11924 }, { "epoch": 3.1583686440677967, "grad_norm": 1.8813419342041016, "learning_rate": 8.420948093220339e-06, "loss": 0.9928, "mean_token_accuracy": 0.761163167655468, "num_tokens": 9597003.0, "step": 11926 }, { "epoch": 3.1588983050847457, "grad_norm": 2.05883526802063, "learning_rate": 8.420683262711865e-06, "loss": 1.123, "mean_token_accuracy": 0.7530649900436401, "num_tokens": 9598599.0, "step": 11928 }, { "epoch": 3.159427966101695, "grad_norm": 2.1064414978027344, "learning_rate": 8.42041843220339e-06, "loss": 1.2491, "mean_token_accuracy": 0.7161619365215302, "num_tokens": 9600243.0, "step": 11930 }, { "epoch": 3.159957627118644, "grad_norm": 1.7896294593811035, "learning_rate": 8.420153601694915e-06, "loss": 1.5286, "mean_token_accuracy": 0.6672626957297325, "num_tokens": 9601854.0, "step": 11932 }, { "epoch": 3.1604872881355934, "grad_norm": 2.004777193069458, "learning_rate": 8.419888771186442e-06, "loss": 1.2229, "mean_token_accuracy": 0.7178065963089466, "num_tokens": 9603383.0, "step": 11934 }, { "epoch": 3.1610169491525424, "grad_norm": 1.7310372591018677, "learning_rate": 8.419623940677967e-06, "loss": 1.5139, "mean_token_accuracy": 0.6658402942121029, "num_tokens": 9604926.0, "step": 11936 }, { "epoch": 3.1615466101694913, "grad_norm": 2.2305727005004883, "learning_rate": 8.419359110169493e-06, "loss": 1.162, "mean_token_accuracy": 0.7480733394622803, "num_tokens": 9606525.0, "step": 11938 }, { "epoch": 3.1620762711864407, "grad_norm": 2.2310168743133545, "learning_rate": 8.419094279661018e-06, "loss": 1.442, "mean_token_accuracy": 0.6741909235715866, "num_tokens": 9608068.0, "step": 11940 }, { "epoch": 3.1626059322033897, "grad_norm": 1.6675283908843994, "learning_rate": 8.418829449152543e-06, "loss": 1.1107, "mean_token_accuracy": 0.7383161261677742, "num_tokens": 9609722.0, "step": 11942 }, { "epoch": 3.163135593220339, "grad_norm": 2.4972591400146484, "learning_rate": 8.418564618644068e-06, "loss": 1.7182, "mean_token_accuracy": 0.6648251563310623, "num_tokens": 9611088.0, "step": 11944 }, { "epoch": 3.163665254237288, "grad_norm": 1.5503363609313965, "learning_rate": 8.418299788135595e-06, "loss": 1.5898, "mean_token_accuracy": 0.6305075585842133, "num_tokens": 9612779.0, "step": 11946 }, { "epoch": 3.1641949152542375, "grad_norm": 1.7606103420257568, "learning_rate": 8.41803495762712e-06, "loss": 1.3645, "mean_token_accuracy": 0.6969230435788631, "num_tokens": 9614241.0, "step": 11948 }, { "epoch": 3.1647245762711864, "grad_norm": 1.579704999923706, "learning_rate": 8.417770127118645e-06, "loss": 0.7199, "mean_token_accuracy": 0.8044523671269417, "num_tokens": 9615682.0, "step": 11950 }, { "epoch": 3.165254237288136, "grad_norm": 1.6099393367767334, "learning_rate": 8.41750529661017e-06, "loss": 1.3762, "mean_token_accuracy": 0.6987434402108192, "num_tokens": 9617273.0, "step": 11952 }, { "epoch": 3.1657838983050848, "grad_norm": 1.9203624725341797, "learning_rate": 8.417240466101696e-06, "loss": 1.7887, "mean_token_accuracy": 0.6413977965712547, "num_tokens": 9618776.0, "step": 11954 }, { "epoch": 3.1663135593220337, "grad_norm": 2.2187600135803223, "learning_rate": 8.416975635593221e-06, "loss": 1.4113, "mean_token_accuracy": 0.6880883201956749, "num_tokens": 9620310.0, "step": 11956 }, { "epoch": 3.166843220338983, "grad_norm": 1.7852710485458374, "learning_rate": 8.416710805084746e-06, "loss": 1.0805, "mean_token_accuracy": 0.7396652549505234, "num_tokens": 9621993.0, "step": 11958 }, { "epoch": 3.167372881355932, "grad_norm": 1.4424327611923218, "learning_rate": 8.416445974576271e-06, "loss": 1.081, "mean_token_accuracy": 0.7468257546424866, "num_tokens": 9623797.0, "step": 11960 }, { "epoch": 3.1679025423728815, "grad_norm": 1.703450322151184, "learning_rate": 8.416181144067798e-06, "loss": 1.068, "mean_token_accuracy": 0.7641232088208199, "num_tokens": 9625242.0, "step": 11962 }, { "epoch": 3.1684322033898304, "grad_norm": 1.8792849779129028, "learning_rate": 8.415916313559322e-06, "loss": 0.9792, "mean_token_accuracy": 0.7656214833259583, "num_tokens": 9626655.0, "step": 11964 }, { "epoch": 3.16896186440678, "grad_norm": 1.9880939722061157, "learning_rate": 8.415651483050849e-06, "loss": 1.1702, "mean_token_accuracy": 0.7199072986841202, "num_tokens": 9628074.0, "step": 11966 }, { "epoch": 3.169491525423729, "grad_norm": 1.6965718269348145, "learning_rate": 8.415386652542374e-06, "loss": 1.3029, "mean_token_accuracy": 0.7102290615439415, "num_tokens": 9629584.0, "step": 11968 }, { "epoch": 3.1700211864406778, "grad_norm": 2.1528537273406982, "learning_rate": 8.415121822033899e-06, "loss": 1.1851, "mean_token_accuracy": 0.7544515877962112, "num_tokens": 9630913.0, "step": 11970 }, { "epoch": 3.170550847457627, "grad_norm": 2.2140161991119385, "learning_rate": 8.414856991525424e-06, "loss": 1.693, "mean_token_accuracy": 0.6243956983089447, "num_tokens": 9632422.0, "step": 11972 }, { "epoch": 3.171080508474576, "grad_norm": 1.528671145439148, "learning_rate": 8.41459216101695e-06, "loss": 1.4299, "mean_token_accuracy": 0.6693015247583389, "num_tokens": 9634405.0, "step": 11974 }, { "epoch": 3.1716101694915255, "grad_norm": 2.3674697875976562, "learning_rate": 8.414327330508475e-06, "loss": 1.4198, "mean_token_accuracy": 0.6795874908566475, "num_tokens": 9635763.0, "step": 11976 }, { "epoch": 3.1721398305084745, "grad_norm": 1.9476165771484375, "learning_rate": 8.4140625e-06, "loss": 1.5892, "mean_token_accuracy": 0.6313742473721504, "num_tokens": 9637580.0, "step": 11978 }, { "epoch": 3.172669491525424, "grad_norm": 1.9884090423583984, "learning_rate": 8.413797669491525e-06, "loss": 1.4075, "mean_token_accuracy": 0.6796011328697205, "num_tokens": 9639374.0, "step": 11980 }, { "epoch": 3.173199152542373, "grad_norm": 1.8005828857421875, "learning_rate": 8.413532838983052e-06, "loss": 0.8351, "mean_token_accuracy": 0.786840632557869, "num_tokens": 9640808.0, "step": 11982 }, { "epoch": 3.1737288135593222, "grad_norm": 1.7340600490570068, "learning_rate": 8.413268008474577e-06, "loss": 1.0267, "mean_token_accuracy": 0.7644933611154556, "num_tokens": 9642421.0, "step": 11984 }, { "epoch": 3.174258474576271, "grad_norm": 1.8797335624694824, "learning_rate": 8.413003177966102e-06, "loss": 1.2546, "mean_token_accuracy": 0.7331685051321983, "num_tokens": 9644127.0, "step": 11986 }, { "epoch": 3.17478813559322, "grad_norm": 1.7302653789520264, "learning_rate": 8.412738347457627e-06, "loss": 1.4346, "mean_token_accuracy": 0.6766052767634392, "num_tokens": 9645849.0, "step": 11988 }, { "epoch": 3.1753177966101696, "grad_norm": 2.0206658840179443, "learning_rate": 8.412473516949153e-06, "loss": 1.5173, "mean_token_accuracy": 0.684919998049736, "num_tokens": 9647487.0, "step": 11990 }, { "epoch": 3.1758474576271185, "grad_norm": 1.5497992038726807, "learning_rate": 8.412208686440678e-06, "loss": 1.099, "mean_token_accuracy": 0.7528790384531021, "num_tokens": 9649046.0, "step": 11992 }, { "epoch": 3.176377118644068, "grad_norm": 1.6731876134872437, "learning_rate": 8.411943855932205e-06, "loss": 1.5101, "mean_token_accuracy": 0.6659092530608177, "num_tokens": 9650846.0, "step": 11994 }, { "epoch": 3.176906779661017, "grad_norm": 1.8025169372558594, "learning_rate": 8.41167902542373e-06, "loss": 1.3005, "mean_token_accuracy": 0.7025796249508858, "num_tokens": 9652597.0, "step": 11996 }, { "epoch": 3.1774364406779663, "grad_norm": 1.569124460220337, "learning_rate": 8.411414194915255e-06, "loss": 1.1566, "mean_token_accuracy": 0.7317559346556664, "num_tokens": 9654139.0, "step": 11998 }, { "epoch": 3.1779661016949152, "grad_norm": 1.5798135995864868, "learning_rate": 8.41114936440678e-06, "loss": 1.0879, "step": 12000 }, { "epoch": 3.1779661016949152, "eval_loss": 1.311347484588623, "eval_mean_token_accuracy": 0.7012617525148701, "eval_num_tokens": 9655798.0, "eval_runtime": 48.1542, "eval_samples_per_second": 6.396, "eval_steps_per_second": 6.396, "step": 12000 }, { "epoch": 3.178495762711864, "grad_norm": 1.8513199090957642, "learning_rate": 8.410884533898306e-06, "loss": 1.4128, "mean_token_accuracy": 0.7116485014557838, "num_tokens": 9657727.0, "step": 12002 }, { "epoch": 3.1790254237288136, "grad_norm": 1.7132868766784668, "learning_rate": 8.410619703389831e-06, "loss": 1.1726, "mean_token_accuracy": 0.7256844192743301, "num_tokens": 9659284.0, "step": 12004 }, { "epoch": 3.1795550847457625, "grad_norm": 2.093834638595581, "learning_rate": 8.410354872881356e-06, "loss": 1.6651, "mean_token_accuracy": 0.6445322483778, "num_tokens": 9660662.0, "step": 12006 }, { "epoch": 3.180084745762712, "grad_norm": 1.957356333732605, "learning_rate": 8.410090042372881e-06, "loss": 1.2359, "mean_token_accuracy": 0.6890356093645096, "num_tokens": 9662184.0, "step": 12008 }, { "epoch": 3.180614406779661, "grad_norm": 1.424027681350708, "learning_rate": 8.409825211864408e-06, "loss": 1.0519, "mean_token_accuracy": 0.7621508091688156, "num_tokens": 9663950.0, "step": 12010 }, { "epoch": 3.1811440677966103, "grad_norm": 1.8194148540496826, "learning_rate": 8.409560381355933e-06, "loss": 1.4087, "mean_token_accuracy": 0.6849463805556297, "num_tokens": 9665669.0, "step": 12012 }, { "epoch": 3.1816737288135593, "grad_norm": 1.9260917901992798, "learning_rate": 8.409295550847458e-06, "loss": 1.1772, "mean_token_accuracy": 0.7138912007212639, "num_tokens": 9667382.0, "step": 12014 }, { "epoch": 3.1822033898305087, "grad_norm": 1.8777132034301758, "learning_rate": 8.409030720338984e-06, "loss": 1.0024, "mean_token_accuracy": 0.7780003175139427, "num_tokens": 9668753.0, "step": 12016 }, { "epoch": 3.1827330508474576, "grad_norm": 2.0654489994049072, "learning_rate": 8.408765889830509e-06, "loss": 0.9354, "mean_token_accuracy": 0.7551018074154854, "num_tokens": 9670259.0, "step": 12018 }, { "epoch": 3.1832627118644066, "grad_norm": 1.739467978477478, "learning_rate": 8.408501059322036e-06, "loss": 1.4205, "mean_token_accuracy": 0.6804093718528748, "num_tokens": 9671889.0, "step": 12020 }, { "epoch": 3.183792372881356, "grad_norm": 2.2708559036254883, "learning_rate": 8.40823622881356e-06, "loss": 1.3106, "mean_token_accuracy": 0.7002740800380707, "num_tokens": 9673488.0, "step": 12022 }, { "epoch": 3.184322033898305, "grad_norm": 1.7604851722717285, "learning_rate": 8.407971398305086e-06, "loss": 1.1256, "mean_token_accuracy": 0.7376478053629398, "num_tokens": 9675552.0, "step": 12024 }, { "epoch": 3.1848516949152543, "grad_norm": 1.8058456182479858, "learning_rate": 8.40770656779661e-06, "loss": 1.2744, "mean_token_accuracy": 0.6618527472019196, "num_tokens": 9677786.0, "step": 12026 }, { "epoch": 3.1853813559322033, "grad_norm": 1.7387062311172485, "learning_rate": 8.407441737288137e-06, "loss": 1.4087, "mean_token_accuracy": 0.7141376659274101, "num_tokens": 9679458.0, "step": 12028 }, { "epoch": 3.1859110169491527, "grad_norm": 1.9305951595306396, "learning_rate": 8.407176906779662e-06, "loss": 1.2536, "mean_token_accuracy": 0.7343810647726059, "num_tokens": 9680936.0, "step": 12030 }, { "epoch": 3.1864406779661016, "grad_norm": 1.885955810546875, "learning_rate": 8.406912076271187e-06, "loss": 1.1404, "mean_token_accuracy": 0.7363968044519424, "num_tokens": 9682600.0, "step": 12032 }, { "epoch": 3.186970338983051, "grad_norm": 2.062680244445801, "learning_rate": 8.406647245762712e-06, "loss": 1.2525, "mean_token_accuracy": 0.7385869733989239, "num_tokens": 9683906.0, "step": 12034 }, { "epoch": 3.1875, "grad_norm": 1.7049038410186768, "learning_rate": 8.406382415254239e-06, "loss": 1.0705, "mean_token_accuracy": 0.7606050372123718, "num_tokens": 9685475.0, "step": 12036 }, { "epoch": 3.188029661016949, "grad_norm": 1.5886088609695435, "learning_rate": 8.406117584745764e-06, "loss": 1.6902, "mean_token_accuracy": 0.6432695277035236, "num_tokens": 9687315.0, "step": 12038 }, { "epoch": 3.1885593220338984, "grad_norm": 1.6539483070373535, "learning_rate": 8.405852754237288e-06, "loss": 0.9661, "mean_token_accuracy": 0.7552332654595375, "num_tokens": 9688978.0, "step": 12040 }, { "epoch": 3.1890889830508473, "grad_norm": 1.5692956447601318, "learning_rate": 8.405587923728813e-06, "loss": 0.9507, "mean_token_accuracy": 0.767366424202919, "num_tokens": 9690380.0, "step": 12042 }, { "epoch": 3.1896186440677967, "grad_norm": 2.0568690299987793, "learning_rate": 8.40532309322034e-06, "loss": 1.8986, "mean_token_accuracy": 0.5945472419261932, "num_tokens": 9691876.0, "step": 12044 }, { "epoch": 3.1901483050847457, "grad_norm": 2.1925201416015625, "learning_rate": 8.405058262711865e-06, "loss": 1.5178, "mean_token_accuracy": 0.6600621864199638, "num_tokens": 9693246.0, "step": 12046 }, { "epoch": 3.190677966101695, "grad_norm": 1.6760307550430298, "learning_rate": 8.404793432203392e-06, "loss": 1.4548, "mean_token_accuracy": 0.6663644686341286, "num_tokens": 9695161.0, "step": 12048 }, { "epoch": 3.191207627118644, "grad_norm": 1.8161433935165405, "learning_rate": 8.404528601694916e-06, "loss": 1.6165, "mean_token_accuracy": 0.6561683677136898, "num_tokens": 9696741.0, "step": 12050 }, { "epoch": 3.1917372881355934, "grad_norm": 1.8034309148788452, "learning_rate": 8.404263771186441e-06, "loss": 1.1225, "mean_token_accuracy": 0.7503894940018654, "num_tokens": 9698760.0, "step": 12052 }, { "epoch": 3.1922669491525424, "grad_norm": 1.90074622631073, "learning_rate": 8.403998940677966e-06, "loss": 1.1964, "mean_token_accuracy": 0.7237981036305428, "num_tokens": 9700468.0, "step": 12054 }, { "epoch": 3.1927966101694913, "grad_norm": 1.7351397275924683, "learning_rate": 8.403734110169493e-06, "loss": 1.2169, "mean_token_accuracy": 0.7068517431616783, "num_tokens": 9702102.0, "step": 12056 }, { "epoch": 3.1933262711864407, "grad_norm": 1.5996392965316772, "learning_rate": 8.403469279661018e-06, "loss": 1.3078, "mean_token_accuracy": 0.6923288479447365, "num_tokens": 9703999.0, "step": 12058 }, { "epoch": 3.1938559322033897, "grad_norm": 1.439870834350586, "learning_rate": 8.403204449152543e-06, "loss": 1.1397, "mean_token_accuracy": 0.6970910876989365, "num_tokens": 9705905.0, "step": 12060 }, { "epoch": 3.194385593220339, "grad_norm": 1.6108102798461914, "learning_rate": 8.402939618644068e-06, "loss": 1.0034, "mean_token_accuracy": 0.7641346380114555, "num_tokens": 9707402.0, "step": 12062 }, { "epoch": 3.194915254237288, "grad_norm": 1.5830506086349487, "learning_rate": 8.402674788135594e-06, "loss": 1.2306, "mean_token_accuracy": 0.7176127657294273, "num_tokens": 9709277.0, "step": 12064 }, { "epoch": 3.1954449152542375, "grad_norm": 1.9092305898666382, "learning_rate": 8.40240995762712e-06, "loss": 1.5573, "mean_token_accuracy": 0.645639069378376, "num_tokens": 9710816.0, "step": 12066 }, { "epoch": 3.1959745762711864, "grad_norm": 1.8768483400344849, "learning_rate": 8.402145127118644e-06, "loss": 0.7443, "mean_token_accuracy": 0.7772476226091385, "num_tokens": 9712712.0, "step": 12068 }, { "epoch": 3.196504237288136, "grad_norm": 1.8032572269439697, "learning_rate": 8.401880296610169e-06, "loss": 0.9684, "mean_token_accuracy": 0.7566670253872871, "num_tokens": 9714203.0, "step": 12070 }, { "epoch": 3.1970338983050848, "grad_norm": 2.190551280975342, "learning_rate": 8.401615466101696e-06, "loss": 1.2512, "mean_token_accuracy": 0.7094099000096321, "num_tokens": 9715436.0, "step": 12072 }, { "epoch": 3.1975635593220337, "grad_norm": 1.833975076675415, "learning_rate": 8.40135063559322e-06, "loss": 1.3156, "mean_token_accuracy": 0.6878500580787659, "num_tokens": 9717117.0, "step": 12074 }, { "epoch": 3.198093220338983, "grad_norm": 1.6939464807510376, "learning_rate": 8.401085805084747e-06, "loss": 1.4853, "mean_token_accuracy": 0.6447208225727081, "num_tokens": 9718714.0, "step": 12076 }, { "epoch": 3.198622881355932, "grad_norm": 1.3869799375534058, "learning_rate": 8.400820974576272e-06, "loss": 0.8517, "mean_token_accuracy": 0.7991493865847588, "num_tokens": 9720544.0, "step": 12078 }, { "epoch": 3.1991525423728815, "grad_norm": 1.8713934421539307, "learning_rate": 8.400556144067797e-06, "loss": 1.1523, "mean_token_accuracy": 0.7103001102805138, "num_tokens": 9722034.0, "step": 12080 }, { "epoch": 3.1996822033898304, "grad_norm": 1.7796170711517334, "learning_rate": 8.400291313559322e-06, "loss": 1.3584, "mean_token_accuracy": 0.7002456299960613, "num_tokens": 9723809.0, "step": 12082 }, { "epoch": 3.20021186440678, "grad_norm": 1.3383619785308838, "learning_rate": 8.400026483050849e-06, "loss": 1.1258, "mean_token_accuracy": 0.7302303463220596, "num_tokens": 9725584.0, "step": 12084 }, { "epoch": 3.200741525423729, "grad_norm": 1.535333275794983, "learning_rate": 8.399761652542374e-06, "loss": 1.1851, "mean_token_accuracy": 0.7076070010662079, "num_tokens": 9727843.0, "step": 12086 }, { "epoch": 3.2012711864406778, "grad_norm": 1.6573207378387451, "learning_rate": 8.399496822033899e-06, "loss": 1.3589, "mean_token_accuracy": 0.7103884816169739, "num_tokens": 9729567.0, "step": 12088 }, { "epoch": 3.201800847457627, "grad_norm": 1.6357978582382202, "learning_rate": 8.399231991525424e-06, "loss": 1.0901, "mean_token_accuracy": 0.7086568847298622, "num_tokens": 9731381.0, "step": 12090 }, { "epoch": 3.202330508474576, "grad_norm": 1.729324460029602, "learning_rate": 8.39896716101695e-06, "loss": 1.3038, "mean_token_accuracy": 0.7120492085814476, "num_tokens": 9732831.0, "step": 12092 }, { "epoch": 3.2028601694915255, "grad_norm": 1.8130618333816528, "learning_rate": 8.398702330508475e-06, "loss": 1.606, "mean_token_accuracy": 0.6437245309352875, "num_tokens": 9734579.0, "step": 12094 }, { "epoch": 3.2033898305084745, "grad_norm": 1.7970664501190186, "learning_rate": 8.3984375e-06, "loss": 1.4341, "mean_token_accuracy": 0.7141041085124016, "num_tokens": 9736120.0, "step": 12096 }, { "epoch": 3.203919491525424, "grad_norm": 1.446068525314331, "learning_rate": 8.398172669491527e-06, "loss": 1.1693, "mean_token_accuracy": 0.7222931906580925, "num_tokens": 9737808.0, "step": 12098 }, { "epoch": 3.204449152542373, "grad_norm": 1.9390631914138794, "learning_rate": 8.397907838983052e-06, "loss": 1.391, "mean_token_accuracy": 0.7126876562833786, "num_tokens": 9739169.0, "step": 12100 }, { "epoch": 3.2049788135593222, "grad_norm": 1.797269344329834, "learning_rate": 8.397643008474578e-06, "loss": 1.5589, "mean_token_accuracy": 0.6615351401269436, "num_tokens": 9740508.0, "step": 12102 }, { "epoch": 3.205508474576271, "grad_norm": 1.9562054872512817, "learning_rate": 8.397378177966103e-06, "loss": 1.3187, "mean_token_accuracy": 0.695691391825676, "num_tokens": 9742105.0, "step": 12104 }, { "epoch": 3.20603813559322, "grad_norm": 1.8606849908828735, "learning_rate": 8.397113347457628e-06, "loss": 1.2748, "mean_token_accuracy": 0.7021867334842682, "num_tokens": 9743637.0, "step": 12106 }, { "epoch": 3.2065677966101696, "grad_norm": 1.9721672534942627, "learning_rate": 8.396848516949153e-06, "loss": 1.2459, "mean_token_accuracy": 0.7168158516287804, "num_tokens": 9745080.0, "step": 12108 }, { "epoch": 3.2070974576271185, "grad_norm": 1.6779102087020874, "learning_rate": 8.39658368644068e-06, "loss": 1.1481, "mean_token_accuracy": 0.7332725450396538, "num_tokens": 9746480.0, "step": 12110 }, { "epoch": 3.207627118644068, "grad_norm": 1.8326308727264404, "learning_rate": 8.396318855932205e-06, "loss": 0.9914, "mean_token_accuracy": 0.7661545723676682, "num_tokens": 9748807.0, "step": 12112 }, { "epoch": 3.208156779661017, "grad_norm": 2.388143539428711, "learning_rate": 8.39605402542373e-06, "loss": 1.0626, "mean_token_accuracy": 0.7463592886924744, "num_tokens": 9750563.0, "step": 12114 }, { "epoch": 3.2086864406779663, "grad_norm": 1.8848445415496826, "learning_rate": 8.395789194915254e-06, "loss": 1.4392, "mean_token_accuracy": 0.6939421221613884, "num_tokens": 9752313.0, "step": 12116 }, { "epoch": 3.2092161016949152, "grad_norm": 2.0687625408172607, "learning_rate": 8.395524364406781e-06, "loss": 1.3412, "mean_token_accuracy": 0.7038019374012947, "num_tokens": 9753823.0, "step": 12118 }, { "epoch": 3.209745762711864, "grad_norm": 1.688342571258545, "learning_rate": 8.395259533898306e-06, "loss": 1.3506, "mean_token_accuracy": 0.6836194321513176, "num_tokens": 9755887.0, "step": 12120 }, { "epoch": 3.2102754237288136, "grad_norm": 1.715567708015442, "learning_rate": 8.394994703389831e-06, "loss": 1.476, "mean_token_accuracy": 0.6689862906932831, "num_tokens": 9757538.0, "step": 12122 }, { "epoch": 3.2108050847457625, "grad_norm": 1.7882299423217773, "learning_rate": 8.394729872881356e-06, "loss": 1.3935, "mean_token_accuracy": 0.6880566850304604, "num_tokens": 9759346.0, "step": 12124 }, { "epoch": 3.211334745762712, "grad_norm": 1.9177489280700684, "learning_rate": 8.394465042372882e-06, "loss": 1.2907, "mean_token_accuracy": 0.7228298187255859, "num_tokens": 9760788.0, "step": 12126 }, { "epoch": 3.211864406779661, "grad_norm": 1.7701694965362549, "learning_rate": 8.394200211864407e-06, "loss": 0.9919, "mean_token_accuracy": 0.7489245980978012, "num_tokens": 9762791.0, "step": 12128 }, { "epoch": 3.2123940677966103, "grad_norm": 1.8482006788253784, "learning_rate": 8.393935381355934e-06, "loss": 1.5939, "mean_token_accuracy": 0.6637760326266289, "num_tokens": 9764615.0, "step": 12130 }, { "epoch": 3.2129237288135593, "grad_norm": 1.7220991849899292, "learning_rate": 8.393670550847459e-06, "loss": 1.5253, "mean_token_accuracy": 0.6633167713880539, "num_tokens": 9766407.0, "step": 12132 }, { "epoch": 3.2134533898305087, "grad_norm": 1.767930269241333, "learning_rate": 8.393405720338984e-06, "loss": 1.1366, "mean_token_accuracy": 0.7306570112705231, "num_tokens": 9768079.0, "step": 12134 }, { "epoch": 3.2139830508474576, "grad_norm": 1.9119596481323242, "learning_rate": 8.393140889830509e-06, "loss": 1.2575, "mean_token_accuracy": 0.7363569363951683, "num_tokens": 9769446.0, "step": 12136 }, { "epoch": 3.2145127118644066, "grad_norm": 1.7393642663955688, "learning_rate": 8.392876059322035e-06, "loss": 1.106, "mean_token_accuracy": 0.7437090128660202, "num_tokens": 9771213.0, "step": 12138 }, { "epoch": 3.215042372881356, "grad_norm": 1.850541114807129, "learning_rate": 8.39261122881356e-06, "loss": 1.3523, "mean_token_accuracy": 0.6646497845649719, "num_tokens": 9772771.0, "step": 12140 }, { "epoch": 3.215572033898305, "grad_norm": 1.6890956163406372, "learning_rate": 8.392346398305085e-06, "loss": 1.0093, "mean_token_accuracy": 0.7387188598513603, "num_tokens": 9774701.0, "step": 12142 }, { "epoch": 3.2161016949152543, "grad_norm": 1.9692038297653198, "learning_rate": 8.39208156779661e-06, "loss": 0.9664, "mean_token_accuracy": 0.7802737206220627, "num_tokens": 9776361.0, "step": 12144 }, { "epoch": 3.2166313559322033, "grad_norm": 1.5236455202102661, "learning_rate": 8.391816737288137e-06, "loss": 1.1791, "mean_token_accuracy": 0.6967218443751335, "num_tokens": 9777997.0, "step": 12146 }, { "epoch": 3.2171610169491527, "grad_norm": 1.7247841358184814, "learning_rate": 8.391551906779662e-06, "loss": 1.2744, "mean_token_accuracy": 0.6967649385333061, "num_tokens": 9779454.0, "step": 12148 }, { "epoch": 3.2176906779661016, "grad_norm": 1.7695475816726685, "learning_rate": 8.391287076271187e-06, "loss": 1.2068, "mean_token_accuracy": 0.7089357450604439, "num_tokens": 9781255.0, "step": 12150 }, { "epoch": 3.218220338983051, "grad_norm": 1.6278005838394165, "learning_rate": 8.391022245762712e-06, "loss": 1.4187, "mean_token_accuracy": 0.7061341851949692, "num_tokens": 9782909.0, "step": 12152 }, { "epoch": 3.21875, "grad_norm": 1.8824949264526367, "learning_rate": 8.390757415254238e-06, "loss": 1.5531, "mean_token_accuracy": 0.6600169464945793, "num_tokens": 9784558.0, "step": 12154 }, { "epoch": 3.219279661016949, "grad_norm": 1.5577049255371094, "learning_rate": 8.390492584745763e-06, "loss": 1.2908, "mean_token_accuracy": 0.7083750441670418, "num_tokens": 9786314.0, "step": 12156 }, { "epoch": 3.2198093220338984, "grad_norm": 2.1311287879943848, "learning_rate": 8.39022775423729e-06, "loss": 1.435, "mean_token_accuracy": 0.6778651624917984, "num_tokens": 9787710.0, "step": 12158 }, { "epoch": 3.2203389830508473, "grad_norm": 1.6846588850021362, "learning_rate": 8.389962923728813e-06, "loss": 1.7199, "mean_token_accuracy": 0.6417191922664642, "num_tokens": 9789446.0, "step": 12160 }, { "epoch": 3.2208686440677967, "grad_norm": 1.7498689889907837, "learning_rate": 8.38969809322034e-06, "loss": 1.4006, "mean_token_accuracy": 0.6897032931447029, "num_tokens": 9791264.0, "step": 12162 }, { "epoch": 3.2213983050847457, "grad_norm": 1.7535582780838013, "learning_rate": 8.389433262711865e-06, "loss": 1.2221, "mean_token_accuracy": 0.7072106376290321, "num_tokens": 9793064.0, "step": 12164 }, { "epoch": 3.221927966101695, "grad_norm": 1.9280112981796265, "learning_rate": 8.389168432203391e-06, "loss": 1.3599, "mean_token_accuracy": 0.6964558213949203, "num_tokens": 9794491.0, "step": 12166 }, { "epoch": 3.222457627118644, "grad_norm": 1.5476566553115845, "learning_rate": 8.388903601694916e-06, "loss": 0.9726, "mean_token_accuracy": 0.7727104015648365, "num_tokens": 9796076.0, "step": 12168 }, { "epoch": 3.2229872881355934, "grad_norm": 1.9550116062164307, "learning_rate": 8.388638771186441e-06, "loss": 1.3864, "mean_token_accuracy": 0.682470478117466, "num_tokens": 9797620.0, "step": 12170 }, { "epoch": 3.2235169491525424, "grad_norm": 1.7838462591171265, "learning_rate": 8.388373940677966e-06, "loss": 1.3208, "mean_token_accuracy": 0.7082992345094681, "num_tokens": 9799275.0, "step": 12172 }, { "epoch": 3.2240466101694913, "grad_norm": 1.637123703956604, "learning_rate": 8.388109110169493e-06, "loss": 0.759, "mean_token_accuracy": 0.8111316487193108, "num_tokens": 9800500.0, "step": 12174 }, { "epoch": 3.2245762711864407, "grad_norm": 1.9257572889328003, "learning_rate": 8.387844279661018e-06, "loss": 1.2372, "mean_token_accuracy": 0.6953321471810341, "num_tokens": 9802167.0, "step": 12176 }, { "epoch": 3.2251059322033897, "grad_norm": 1.6982769966125488, "learning_rate": 8.387579449152542e-06, "loss": 0.8236, "mean_token_accuracy": 0.8005561530590057, "num_tokens": 9803696.0, "step": 12178 }, { "epoch": 3.225635593220339, "grad_norm": 2.2269978523254395, "learning_rate": 8.387314618644067e-06, "loss": 1.0988, "mean_token_accuracy": 0.7605433613061905, "num_tokens": 9805090.0, "step": 12180 }, { "epoch": 3.226165254237288, "grad_norm": 2.2573275566101074, "learning_rate": 8.387049788135594e-06, "loss": 1.2367, "mean_token_accuracy": 0.6956893056631088, "num_tokens": 9806536.0, "step": 12182 }, { "epoch": 3.2266949152542375, "grad_norm": 1.4332616329193115, "learning_rate": 8.38678495762712e-06, "loss": 0.8873, "mean_token_accuracy": 0.7904403582215309, "num_tokens": 9808055.0, "step": 12184 }, { "epoch": 3.2272245762711864, "grad_norm": 1.7866220474243164, "learning_rate": 8.386520127118646e-06, "loss": 1.2408, "mean_token_accuracy": 0.7196314930915833, "num_tokens": 9809627.0, "step": 12186 }, { "epoch": 3.227754237288136, "grad_norm": 1.981000304222107, "learning_rate": 8.38625529661017e-06, "loss": 1.1116, "mean_token_accuracy": 0.7581666633486748, "num_tokens": 9810901.0, "step": 12188 }, { "epoch": 3.2282838983050848, "grad_norm": 1.99625563621521, "learning_rate": 8.385990466101695e-06, "loss": 1.1632, "mean_token_accuracy": 0.7053788006305695, "num_tokens": 9812330.0, "step": 12190 }, { "epoch": 3.2288135593220337, "grad_norm": 1.8183937072753906, "learning_rate": 8.385725635593222e-06, "loss": 0.9435, "mean_token_accuracy": 0.7749375253915787, "num_tokens": 9813829.0, "step": 12192 }, { "epoch": 3.229343220338983, "grad_norm": 2.0046441555023193, "learning_rate": 8.385460805084747e-06, "loss": 1.1983, "mean_token_accuracy": 0.7149482443928719, "num_tokens": 9815422.0, "step": 12194 }, { "epoch": 3.229872881355932, "grad_norm": 1.7017805576324463, "learning_rate": 8.385195974576272e-06, "loss": 1.3725, "mean_token_accuracy": 0.690045278519392, "num_tokens": 9817200.0, "step": 12196 }, { "epoch": 3.2304025423728815, "grad_norm": 1.723878026008606, "learning_rate": 8.384931144067797e-06, "loss": 1.2472, "mean_token_accuracy": 0.695370264351368, "num_tokens": 9818747.0, "step": 12198 }, { "epoch": 3.2309322033898304, "grad_norm": 1.7407722473144531, "learning_rate": 8.384666313559323e-06, "loss": 1.2735, "mean_token_accuracy": 0.7110033705830574, "num_tokens": 9820556.0, "step": 12200 }, { "epoch": 3.23146186440678, "grad_norm": 2.344076633453369, "learning_rate": 8.384401483050848e-06, "loss": 1.6079, "mean_token_accuracy": 0.6421325951814651, "num_tokens": 9821982.0, "step": 12202 }, { "epoch": 3.231991525423729, "grad_norm": 1.7119885683059692, "learning_rate": 8.384136652542373e-06, "loss": 0.8817, "mean_token_accuracy": 0.7996792569756508, "num_tokens": 9823627.0, "step": 12204 }, { "epoch": 3.2325211864406778, "grad_norm": 1.7181490659713745, "learning_rate": 8.383871822033898e-06, "loss": 0.9828, "mean_token_accuracy": 0.7545005828142166, "num_tokens": 9825192.0, "step": 12206 }, { "epoch": 3.233050847457627, "grad_norm": 1.9160016775131226, "learning_rate": 8.383606991525425e-06, "loss": 1.5386, "mean_token_accuracy": 0.6679069697856903, "num_tokens": 9826708.0, "step": 12208 }, { "epoch": 3.233580508474576, "grad_norm": 2.14675235748291, "learning_rate": 8.38334216101695e-06, "loss": 1.2962, "mean_token_accuracy": 0.7048143818974495, "num_tokens": 9828118.0, "step": 12210 }, { "epoch": 3.2341101694915255, "grad_norm": 1.841045618057251, "learning_rate": 8.383077330508476e-06, "loss": 1.1007, "mean_token_accuracy": 0.7508963122963905, "num_tokens": 9829572.0, "step": 12212 }, { "epoch": 3.2346398305084745, "grad_norm": 2.0253517627716064, "learning_rate": 8.3828125e-06, "loss": 1.2472, "mean_token_accuracy": 0.7323519662022591, "num_tokens": 9831273.0, "step": 12214 }, { "epoch": 3.235169491525424, "grad_norm": 1.955562710762024, "learning_rate": 8.382547669491526e-06, "loss": 1.1072, "mean_token_accuracy": 0.7264446392655373, "num_tokens": 9832622.0, "step": 12216 }, { "epoch": 3.235699152542373, "grad_norm": 2.0140023231506348, "learning_rate": 8.382282838983051e-06, "loss": 1.2545, "mean_token_accuracy": 0.7514926046133041, "num_tokens": 9833924.0, "step": 12218 }, { "epoch": 3.2362288135593222, "grad_norm": 2.0305869579315186, "learning_rate": 8.382018008474578e-06, "loss": 1.821, "mean_token_accuracy": 0.6149560660123825, "num_tokens": 9835462.0, "step": 12220 }, { "epoch": 3.236758474576271, "grad_norm": 1.579177975654602, "learning_rate": 8.381753177966103e-06, "loss": 1.374, "mean_token_accuracy": 0.6816728636622429, "num_tokens": 9837262.0, "step": 12222 }, { "epoch": 3.23728813559322, "grad_norm": 1.7263574600219727, "learning_rate": 8.381488347457628e-06, "loss": 1.3706, "mean_token_accuracy": 0.6829374879598618, "num_tokens": 9838774.0, "step": 12224 }, { "epoch": 3.2378177966101696, "grad_norm": 1.6576993465423584, "learning_rate": 8.381223516949153e-06, "loss": 1.4265, "mean_token_accuracy": 0.6754671260714531, "num_tokens": 9840518.0, "step": 12226 }, { "epoch": 3.2383474576271185, "grad_norm": 1.9918019771575928, "learning_rate": 8.38095868644068e-06, "loss": 1.338, "mean_token_accuracy": 0.6680897325277328, "num_tokens": 9842211.0, "step": 12228 }, { "epoch": 3.238877118644068, "grad_norm": 1.8961611986160278, "learning_rate": 8.380693855932204e-06, "loss": 1.1103, "mean_token_accuracy": 0.714498907327652, "num_tokens": 9843870.0, "step": 12230 }, { "epoch": 3.239406779661017, "grad_norm": 1.6938177347183228, "learning_rate": 8.380429025423729e-06, "loss": 1.4919, "mean_token_accuracy": 0.6618136540055275, "num_tokens": 9845998.0, "step": 12232 }, { "epoch": 3.2399364406779663, "grad_norm": 2.236607313156128, "learning_rate": 8.380164194915254e-06, "loss": 1.5336, "mean_token_accuracy": 0.6578626856207848, "num_tokens": 9847518.0, "step": 12234 }, { "epoch": 3.2404661016949152, "grad_norm": 2.1043152809143066, "learning_rate": 8.37989936440678e-06, "loss": 1.1907, "mean_token_accuracy": 0.7092284858226776, "num_tokens": 9848998.0, "step": 12236 }, { "epoch": 3.240995762711864, "grad_norm": 1.4894404411315918, "learning_rate": 8.379634533898306e-06, "loss": 1.0688, "mean_token_accuracy": 0.7441887930035591, "num_tokens": 9850436.0, "step": 12238 }, { "epoch": 3.2415254237288136, "grad_norm": 1.729672908782959, "learning_rate": 8.379369703389832e-06, "loss": 1.4034, "mean_token_accuracy": 0.6837622597813606, "num_tokens": 9852025.0, "step": 12240 }, { "epoch": 3.2420550847457625, "grad_norm": 1.6816586256027222, "learning_rate": 8.379104872881355e-06, "loss": 1.2363, "mean_token_accuracy": 0.7207745239138603, "num_tokens": 9853538.0, "step": 12242 }, { "epoch": 3.242584745762712, "grad_norm": 1.7908194065093994, "learning_rate": 8.378840042372882e-06, "loss": 1.0935, "mean_token_accuracy": 0.7521214857697487, "num_tokens": 9855090.0, "step": 12244 }, { "epoch": 3.243114406779661, "grad_norm": 1.5078539848327637, "learning_rate": 8.378575211864407e-06, "loss": 1.1762, "mean_token_accuracy": 0.7046333327889442, "num_tokens": 9857273.0, "step": 12246 }, { "epoch": 3.2436440677966103, "grad_norm": 1.6893281936645508, "learning_rate": 8.378310381355934e-06, "loss": 0.9324, "mean_token_accuracy": 0.7498041540384293, "num_tokens": 9858871.0, "step": 12248 }, { "epoch": 3.2441737288135593, "grad_norm": 1.7829408645629883, "learning_rate": 8.378045550847459e-06, "loss": 1.1634, "step": 12250 }, { "epoch": 3.2441737288135593, "eval_loss": 1.3122984170913696, "eval_mean_token_accuracy": 0.701297954208665, "eval_num_tokens": 9860527.0, "eval_runtime": 48.1287, "eval_samples_per_second": 6.4, "eval_steps_per_second": 6.4, "step": 12250 }, { "epoch": 3.2447033898305087, "grad_norm": 1.6093487739562988, "learning_rate": 8.377780720338983e-06, "loss": 1.1464, "mean_token_accuracy": 0.72445273026824, "num_tokens": 9862187.0, "step": 12252 }, { "epoch": 3.2452330508474576, "grad_norm": 1.6088629961013794, "learning_rate": 8.377515889830508e-06, "loss": 1.2599, "mean_token_accuracy": 0.7298242300748825, "num_tokens": 9863848.0, "step": 12254 }, { "epoch": 3.2457627118644066, "grad_norm": 1.4553574323654175, "learning_rate": 8.377251059322035e-06, "loss": 1.4171, "mean_token_accuracy": 0.6936668679118156, "num_tokens": 9865738.0, "step": 12256 }, { "epoch": 3.246292372881356, "grad_norm": 1.9426933526992798, "learning_rate": 8.37698622881356e-06, "loss": 1.3621, "mean_token_accuracy": 0.7026521489024162, "num_tokens": 9867192.0, "step": 12258 }, { "epoch": 3.246822033898305, "grad_norm": 1.5666577816009521, "learning_rate": 8.376721398305085e-06, "loss": 1.1884, "mean_token_accuracy": 0.7454761490225792, "num_tokens": 9868512.0, "step": 12260 }, { "epoch": 3.2473516949152543, "grad_norm": 1.7463635206222534, "learning_rate": 8.37645656779661e-06, "loss": 1.3516, "mean_token_accuracy": 0.6815746054053307, "num_tokens": 9870197.0, "step": 12262 }, { "epoch": 3.2478813559322033, "grad_norm": 1.5709093809127808, "learning_rate": 8.376191737288136e-06, "loss": 1.353, "mean_token_accuracy": 0.688153013586998, "num_tokens": 9871800.0, "step": 12264 }, { "epoch": 3.2484110169491527, "grad_norm": 1.8828121423721313, "learning_rate": 8.375926906779663e-06, "loss": 1.4218, "mean_token_accuracy": 0.6852269321680069, "num_tokens": 9873438.0, "step": 12266 }, { "epoch": 3.2489406779661016, "grad_norm": 1.6508029699325562, "learning_rate": 8.375662076271186e-06, "loss": 1.1146, "mean_token_accuracy": 0.7525154650211334, "num_tokens": 9875247.0, "step": 12268 }, { "epoch": 3.249470338983051, "grad_norm": 1.443415641784668, "learning_rate": 8.375397245762713e-06, "loss": 1.0571, "mean_token_accuracy": 0.7291793450713158, "num_tokens": 9876920.0, "step": 12270 }, { "epoch": 3.25, "grad_norm": 1.39326810836792, "learning_rate": 8.375132415254238e-06, "loss": 0.9625, "mean_token_accuracy": 0.7591830641031265, "num_tokens": 9878698.0, "step": 12272 }, { "epoch": 3.250529661016949, "grad_norm": 1.4330627918243408, "learning_rate": 8.374867584745764e-06, "loss": 0.9371, "mean_token_accuracy": 0.7947765961289406, "num_tokens": 9880168.0, "step": 12274 }, { "epoch": 3.2510593220338984, "grad_norm": 1.5151383876800537, "learning_rate": 8.37460275423729e-06, "loss": 1.2104, "mean_token_accuracy": 0.713864654302597, "num_tokens": 9881769.0, "step": 12276 }, { "epoch": 3.2515889830508473, "grad_norm": 1.7456153631210327, "learning_rate": 8.374337923728814e-06, "loss": 1.3737, "mean_token_accuracy": 0.6762525886297226, "num_tokens": 9883255.0, "step": 12278 }, { "epoch": 3.2521186440677967, "grad_norm": 1.9084694385528564, "learning_rate": 8.37407309322034e-06, "loss": 1.2701, "mean_token_accuracy": 0.6847068518400192, "num_tokens": 9884885.0, "step": 12280 }, { "epoch": 3.2526483050847457, "grad_norm": 2.2083683013916016, "learning_rate": 8.373808262711866e-06, "loss": 1.2627, "mean_token_accuracy": 0.6990456432104111, "num_tokens": 9886598.0, "step": 12282 }, { "epoch": 3.253177966101695, "grad_norm": 2.0379300117492676, "learning_rate": 8.37354343220339e-06, "loss": 1.3623, "mean_token_accuracy": 0.6833757273852825, "num_tokens": 9888067.0, "step": 12284 }, { "epoch": 3.253707627118644, "grad_norm": 1.6688082218170166, "learning_rate": 8.373278601694916e-06, "loss": 1.5507, "mean_token_accuracy": 0.6671664118766785, "num_tokens": 9889641.0, "step": 12286 }, { "epoch": 3.2542372881355934, "grad_norm": 2.087089776992798, "learning_rate": 8.37301377118644e-06, "loss": 1.5896, "mean_token_accuracy": 0.6407320573925972, "num_tokens": 9891115.0, "step": 12288 }, { "epoch": 3.2547669491525424, "grad_norm": 1.8196876049041748, "learning_rate": 8.372748940677967e-06, "loss": 1.0121, "mean_token_accuracy": 0.7641117349267006, "num_tokens": 9892612.0, "step": 12290 }, { "epoch": 3.2552966101694913, "grad_norm": 1.7623025178909302, "learning_rate": 8.372484110169492e-06, "loss": 1.1782, "mean_token_accuracy": 0.731389194726944, "num_tokens": 9894175.0, "step": 12292 }, { "epoch": 3.2558262711864407, "grad_norm": 1.783905029296875, "learning_rate": 8.372219279661019e-06, "loss": 1.4684, "mean_token_accuracy": 0.6945001259446144, "num_tokens": 9895738.0, "step": 12294 }, { "epoch": 3.2563559322033897, "grad_norm": 1.6655563116073608, "learning_rate": 8.371954449152542e-06, "loss": 1.2409, "mean_token_accuracy": 0.7141508162021637, "num_tokens": 9897290.0, "step": 12296 }, { "epoch": 3.256885593220339, "grad_norm": 1.6784234046936035, "learning_rate": 8.371689618644069e-06, "loss": 1.0471, "mean_token_accuracy": 0.7360911071300507, "num_tokens": 9898829.0, "step": 12298 }, { "epoch": 3.257415254237288, "grad_norm": 2.206040859222412, "learning_rate": 8.371424788135594e-06, "loss": 1.7018, "mean_token_accuracy": 0.6236980780959129, "num_tokens": 9900485.0, "step": 12300 }, { "epoch": 3.2579449152542375, "grad_norm": 2.29152774810791, "learning_rate": 8.37115995762712e-06, "loss": 1.4657, "mean_token_accuracy": 0.6673487797379494, "num_tokens": 9901962.0, "step": 12302 }, { "epoch": 3.2584745762711864, "grad_norm": 1.432302474975586, "learning_rate": 8.370895127118645e-06, "loss": 1.131, "mean_token_accuracy": 0.7146067917346954, "num_tokens": 9903622.0, "step": 12304 }, { "epoch": 3.259004237288136, "grad_norm": 1.8391224145889282, "learning_rate": 8.37063029661017e-06, "loss": 1.2143, "mean_token_accuracy": 0.7109751105308533, "num_tokens": 9905078.0, "step": 12306 }, { "epoch": 3.2595338983050848, "grad_norm": 1.7037028074264526, "learning_rate": 8.370365466101695e-06, "loss": 1.1045, "mean_token_accuracy": 0.7432030066847801, "num_tokens": 9906551.0, "step": 12308 }, { "epoch": 3.2600635593220337, "grad_norm": 1.8562023639678955, "learning_rate": 8.370100635593222e-06, "loss": 1.392, "mean_token_accuracy": 0.685482382774353, "num_tokens": 9908247.0, "step": 12310 }, { "epoch": 3.260593220338983, "grad_norm": 2.0557515621185303, "learning_rate": 8.369835805084747e-06, "loss": 1.1514, "mean_token_accuracy": 0.7318916544318199, "num_tokens": 9910389.0, "step": 12312 }, { "epoch": 3.261122881355932, "grad_norm": 1.6877110004425049, "learning_rate": 8.369570974576271e-06, "loss": 1.2277, "mean_token_accuracy": 0.7180253714323044, "num_tokens": 9912020.0, "step": 12314 }, { "epoch": 3.2616525423728815, "grad_norm": 1.7424001693725586, "learning_rate": 8.369306144067796e-06, "loss": 1.2866, "mean_token_accuracy": 0.708955280482769, "num_tokens": 9913558.0, "step": 12316 }, { "epoch": 3.2621822033898304, "grad_norm": 1.5943130254745483, "learning_rate": 8.369041313559323e-06, "loss": 1.3474, "mean_token_accuracy": 0.6739615052938461, "num_tokens": 9915464.0, "step": 12318 }, { "epoch": 3.26271186440678, "grad_norm": 2.036191463470459, "learning_rate": 8.368776483050848e-06, "loss": 1.5156, "mean_token_accuracy": 0.6721709221601486, "num_tokens": 9916781.0, "step": 12320 }, { "epoch": 3.263241525423729, "grad_norm": 1.7582303285598755, "learning_rate": 8.368511652542373e-06, "loss": 0.9173, "mean_token_accuracy": 0.7513798177242279, "num_tokens": 9918542.0, "step": 12322 }, { "epoch": 3.263771186440678, "grad_norm": 1.6993250846862793, "learning_rate": 8.368246822033898e-06, "loss": 1.3961, "mean_token_accuracy": 0.7002343758940697, "num_tokens": 9920211.0, "step": 12324 }, { "epoch": 3.264300847457627, "grad_norm": 1.838181495666504, "learning_rate": 8.367981991525424e-06, "loss": 0.9979, "mean_token_accuracy": 0.7748257145285606, "num_tokens": 9921614.0, "step": 12326 }, { "epoch": 3.264830508474576, "grad_norm": 1.4528899192810059, "learning_rate": 8.36771716101695e-06, "loss": 1.1669, "mean_token_accuracy": 0.7297734543681145, "num_tokens": 9923422.0, "step": 12328 }, { "epoch": 3.2653601694915255, "grad_norm": 1.7454861402511597, "learning_rate": 8.367452330508476e-06, "loss": 1.0735, "mean_token_accuracy": 0.7400733679533005, "num_tokens": 9924870.0, "step": 12330 }, { "epoch": 3.2658898305084745, "grad_norm": 1.5867445468902588, "learning_rate": 8.367187500000001e-06, "loss": 0.9197, "mean_token_accuracy": 0.7638425976037979, "num_tokens": 9926454.0, "step": 12332 }, { "epoch": 3.266419491525424, "grad_norm": 2.104691982269287, "learning_rate": 8.366922669491526e-06, "loss": 1.7192, "mean_token_accuracy": 0.6567023284733295, "num_tokens": 9928244.0, "step": 12334 }, { "epoch": 3.266949152542373, "grad_norm": 2.130551338195801, "learning_rate": 8.36665783898305e-06, "loss": 1.0765, "mean_token_accuracy": 0.7461052685976028, "num_tokens": 9929594.0, "step": 12336 }, { "epoch": 3.267478813559322, "grad_norm": 1.8616467714309692, "learning_rate": 8.366393008474577e-06, "loss": 1.2969, "mean_token_accuracy": 0.7143687829375267, "num_tokens": 9931060.0, "step": 12338 }, { "epoch": 3.268008474576271, "grad_norm": 2.0160584449768066, "learning_rate": 8.366128177966102e-06, "loss": 1.4879, "mean_token_accuracy": 0.6726089715957642, "num_tokens": 9932787.0, "step": 12340 }, { "epoch": 3.26853813559322, "grad_norm": 1.6687357425689697, "learning_rate": 8.365863347457627e-06, "loss": 1.1997, "mean_token_accuracy": 0.7165228873491287, "num_tokens": 9934500.0, "step": 12342 }, { "epoch": 3.2690677966101696, "grad_norm": 2.186950206756592, "learning_rate": 8.365598516949152e-06, "loss": 1.4362, "mean_token_accuracy": 0.6773470714688301, "num_tokens": 9935946.0, "step": 12344 }, { "epoch": 3.2695974576271185, "grad_norm": 1.8557332754135132, "learning_rate": 8.365333686440679e-06, "loss": 1.3724, "mean_token_accuracy": 0.7167525365948677, "num_tokens": 9937382.0, "step": 12346 }, { "epoch": 3.270127118644068, "grad_norm": 1.893709421157837, "learning_rate": 8.365068855932204e-06, "loss": 1.4702, "mean_token_accuracy": 0.6764837801456451, "num_tokens": 9939092.0, "step": 12348 }, { "epoch": 3.270656779661017, "grad_norm": 1.9817758798599243, "learning_rate": 8.364804025423729e-06, "loss": 1.3229, "mean_token_accuracy": 0.6799673065543175, "num_tokens": 9940547.0, "step": 12350 }, { "epoch": 3.2711864406779663, "grad_norm": 2.1570205688476562, "learning_rate": 8.364539194915255e-06, "loss": 1.496, "mean_token_accuracy": 0.6641899198293686, "num_tokens": 9942286.0, "step": 12352 }, { "epoch": 3.2717161016949152, "grad_norm": 1.6431910991668701, "learning_rate": 8.36427436440678e-06, "loss": 1.4203, "mean_token_accuracy": 0.6848629415035248, "num_tokens": 9943895.0, "step": 12354 }, { "epoch": 3.272245762711864, "grad_norm": 1.8749576807022095, "learning_rate": 8.364009533898307e-06, "loss": 1.1485, "mean_token_accuracy": 0.7357017248868942, "num_tokens": 9945155.0, "step": 12356 }, { "epoch": 3.2727754237288136, "grad_norm": 1.8818135261535645, "learning_rate": 8.363744703389832e-06, "loss": 1.3722, "mean_token_accuracy": 0.6860484629869461, "num_tokens": 9946736.0, "step": 12358 }, { "epoch": 3.2733050847457625, "grad_norm": 1.591488242149353, "learning_rate": 8.363479872881357e-06, "loss": 1.1493, "mean_token_accuracy": 0.713990107178688, "num_tokens": 9948252.0, "step": 12360 }, { "epoch": 3.273834745762712, "grad_norm": 1.7973519563674927, "learning_rate": 8.363215042372882e-06, "loss": 1.5364, "mean_token_accuracy": 0.674049586057663, "num_tokens": 9949892.0, "step": 12362 }, { "epoch": 3.274364406779661, "grad_norm": 2.1917779445648193, "learning_rate": 8.362950211864408e-06, "loss": 1.1668, "mean_token_accuracy": 0.7258040904998779, "num_tokens": 9951209.0, "step": 12364 }, { "epoch": 3.2748940677966103, "grad_norm": 1.564570426940918, "learning_rate": 8.362685381355933e-06, "loss": 1.2256, "mean_token_accuracy": 0.7064462713897228, "num_tokens": 9952924.0, "step": 12366 }, { "epoch": 3.2754237288135593, "grad_norm": 1.7441409826278687, "learning_rate": 8.362420550847458e-06, "loss": 1.4443, "mean_token_accuracy": 0.6579414084553719, "num_tokens": 9954749.0, "step": 12368 }, { "epoch": 3.2759533898305087, "grad_norm": 1.5552972555160522, "learning_rate": 8.362155720338983e-06, "loss": 1.0204, "mean_token_accuracy": 0.7507077530026436, "num_tokens": 9956533.0, "step": 12370 }, { "epoch": 3.2764830508474576, "grad_norm": 1.7682734727859497, "learning_rate": 8.36189088983051e-06, "loss": 1.1542, "mean_token_accuracy": 0.7192934602499008, "num_tokens": 9958259.0, "step": 12372 }, { "epoch": 3.2770127118644066, "grad_norm": 2.187647581100464, "learning_rate": 8.361626059322035e-06, "loss": 1.7825, "mean_token_accuracy": 0.610062301158905, "num_tokens": 9959732.0, "step": 12374 }, { "epoch": 3.277542372881356, "grad_norm": 1.9624027013778687, "learning_rate": 8.36136122881356e-06, "loss": 1.2697, "mean_token_accuracy": 0.7185536921024323, "num_tokens": 9961174.0, "step": 12376 }, { "epoch": 3.278072033898305, "grad_norm": 1.7755502462387085, "learning_rate": 8.361096398305084e-06, "loss": 1.2829, "mean_token_accuracy": 0.7035958468914032, "num_tokens": 9962877.0, "step": 12378 }, { "epoch": 3.2786016949152543, "grad_norm": 2.0790655612945557, "learning_rate": 8.360831567796611e-06, "loss": 1.4246, "mean_token_accuracy": 0.6966647729277611, "num_tokens": 9964411.0, "step": 12380 }, { "epoch": 3.2791313559322033, "grad_norm": 2.2533388137817383, "learning_rate": 8.360566737288136e-06, "loss": 1.3564, "mean_token_accuracy": 0.7000368610024452, "num_tokens": 9966021.0, "step": 12382 }, { "epoch": 3.2796610169491527, "grad_norm": 1.9395318031311035, "learning_rate": 8.360301906779663e-06, "loss": 1.3448, "mean_token_accuracy": 0.7283437550067902, "num_tokens": 9967519.0, "step": 12384 }, { "epoch": 3.2801906779661016, "grad_norm": 2.0197806358337402, "learning_rate": 8.360037076271188e-06, "loss": 1.0151, "mean_token_accuracy": 0.7610657438635826, "num_tokens": 9968955.0, "step": 12386 }, { "epoch": 3.280720338983051, "grad_norm": 2.0188865661621094, "learning_rate": 8.359772245762713e-06, "loss": 1.5127, "mean_token_accuracy": 0.6605795919895172, "num_tokens": 9970549.0, "step": 12388 }, { "epoch": 3.28125, "grad_norm": 1.667232871055603, "learning_rate": 8.359507415254237e-06, "loss": 1.0816, "mean_token_accuracy": 0.7314116209745407, "num_tokens": 9972218.0, "step": 12390 }, { "epoch": 3.281779661016949, "grad_norm": 2.1987199783325195, "learning_rate": 8.359242584745764e-06, "loss": 1.5573, "mean_token_accuracy": 0.6810683757066727, "num_tokens": 9973581.0, "step": 12392 }, { "epoch": 3.2823093220338984, "grad_norm": 2.0816571712493896, "learning_rate": 8.358977754237289e-06, "loss": 1.1524, "mean_token_accuracy": 0.7187567204236984, "num_tokens": 9975559.0, "step": 12394 }, { "epoch": 3.2828389830508473, "grad_norm": 1.8676106929779053, "learning_rate": 8.358712923728814e-06, "loss": 1.2745, "mean_token_accuracy": 0.7185271754860878, "num_tokens": 9977094.0, "step": 12396 }, { "epoch": 3.2833686440677967, "grad_norm": 2.2681853771209717, "learning_rate": 8.358448093220339e-06, "loss": 1.177, "mean_token_accuracy": 0.7249861136078835, "num_tokens": 9978349.0, "step": 12398 }, { "epoch": 3.2838983050847457, "grad_norm": 1.768416404724121, "learning_rate": 8.358183262711865e-06, "loss": 1.1762, "mean_token_accuracy": 0.7326437458395958, "num_tokens": 9980154.0, "step": 12400 }, { "epoch": 3.284427966101695, "grad_norm": 1.9846304655075073, "learning_rate": 8.35791843220339e-06, "loss": 1.7826, "mean_token_accuracy": 0.6232509724795818, "num_tokens": 9982155.0, "step": 12402 }, { "epoch": 3.284957627118644, "grad_norm": 1.6992230415344238, "learning_rate": 8.357653601694915e-06, "loss": 1.0205, "mean_token_accuracy": 0.7418700009584427, "num_tokens": 9983584.0, "step": 12404 }, { "epoch": 3.2854872881355934, "grad_norm": 1.9091917276382446, "learning_rate": 8.35738877118644e-06, "loss": 1.6625, "mean_token_accuracy": 0.6491169258952141, "num_tokens": 9985129.0, "step": 12406 }, { "epoch": 3.2860169491525424, "grad_norm": 1.4057785272598267, "learning_rate": 8.357123940677967e-06, "loss": 0.8064, "mean_token_accuracy": 0.7866930738091469, "num_tokens": 9986718.0, "step": 12408 }, { "epoch": 3.2865466101694913, "grad_norm": 1.7174782752990723, "learning_rate": 8.356859110169492e-06, "loss": 1.1651, "mean_token_accuracy": 0.7037729024887085, "num_tokens": 9988232.0, "step": 12410 }, { "epoch": 3.2870762711864407, "grad_norm": 1.7190639972686768, "learning_rate": 8.356594279661018e-06, "loss": 0.8719, "mean_token_accuracy": 0.782897375524044, "num_tokens": 9989783.0, "step": 12412 }, { "epoch": 3.2876059322033897, "grad_norm": 2.219393491744995, "learning_rate": 8.356329449152543e-06, "loss": 1.5813, "mean_token_accuracy": 0.6340217664837837, "num_tokens": 9991297.0, "step": 12414 }, { "epoch": 3.288135593220339, "grad_norm": 1.7480863332748413, "learning_rate": 8.356064618644068e-06, "loss": 1.3854, "mean_token_accuracy": 0.6992168724536896, "num_tokens": 9992791.0, "step": 12416 }, { "epoch": 3.288665254237288, "grad_norm": 1.6815725564956665, "learning_rate": 8.355799788135593e-06, "loss": 1.6317, "mean_token_accuracy": 0.6636384427547455, "num_tokens": 9994765.0, "step": 12418 }, { "epoch": 3.2891949152542375, "grad_norm": 1.9500000476837158, "learning_rate": 8.35553495762712e-06, "loss": 1.3019, "mean_token_accuracy": 0.6936494708061218, "num_tokens": 9996305.0, "step": 12420 }, { "epoch": 3.2897245762711864, "grad_norm": 1.764570713043213, "learning_rate": 8.355270127118645e-06, "loss": 1.1914, "mean_token_accuracy": 0.7361847013235092, "num_tokens": 9997785.0, "step": 12422 }, { "epoch": 3.290254237288136, "grad_norm": 1.8112627267837524, "learning_rate": 8.35500529661017e-06, "loss": 1.4053, "mean_token_accuracy": 0.6904130131006241, "num_tokens": 9999326.0, "step": 12424 }, { "epoch": 3.2907838983050848, "grad_norm": 1.6547297239303589, "learning_rate": 8.354740466101695e-06, "loss": 1.3037, "mean_token_accuracy": 0.7021523788571358, "num_tokens": 10000990.0, "step": 12426 }, { "epoch": 3.2913135593220337, "grad_norm": 1.573514461517334, "learning_rate": 8.354475635593221e-06, "loss": 1.0623, "mean_token_accuracy": 0.7414620444178581, "num_tokens": 10002857.0, "step": 12428 }, { "epoch": 3.291843220338983, "grad_norm": 1.6572391986846924, "learning_rate": 8.354210805084746e-06, "loss": 1.0909, "mean_token_accuracy": 0.7563014477491379, "num_tokens": 10004395.0, "step": 12430 }, { "epoch": 3.292372881355932, "grad_norm": 1.797343373298645, "learning_rate": 8.353945974576271e-06, "loss": 1.0758, "mean_token_accuracy": 0.7354396507143974, "num_tokens": 10005819.0, "step": 12432 }, { "epoch": 3.2929025423728815, "grad_norm": 1.7266149520874023, "learning_rate": 8.353681144067798e-06, "loss": 1.2437, "mean_token_accuracy": 0.6994047090411186, "num_tokens": 10007397.0, "step": 12434 }, { "epoch": 3.2934322033898304, "grad_norm": 1.5528558492660522, "learning_rate": 8.353416313559323e-06, "loss": 1.0245, "mean_token_accuracy": 0.7561518922448158, "num_tokens": 10009124.0, "step": 12436 }, { "epoch": 3.29396186440678, "grad_norm": 1.6543209552764893, "learning_rate": 8.35315148305085e-06, "loss": 0.9936, "mean_token_accuracy": 0.7560097649693489, "num_tokens": 10010636.0, "step": 12438 }, { "epoch": 3.294491525423729, "grad_norm": 2.1389145851135254, "learning_rate": 8.352886652542374e-06, "loss": 1.536, "mean_token_accuracy": 0.6893701702356339, "num_tokens": 10012093.0, "step": 12440 }, { "epoch": 3.295021186440678, "grad_norm": 1.9255170822143555, "learning_rate": 8.352621822033899e-06, "loss": 1.2033, "mean_token_accuracy": 0.7040275409817696, "num_tokens": 10013546.0, "step": 12442 }, { "epoch": 3.295550847457627, "grad_norm": 1.79538094997406, "learning_rate": 8.352356991525424e-06, "loss": 1.3775, "mean_token_accuracy": 0.6937122195959091, "num_tokens": 10015104.0, "step": 12444 }, { "epoch": 3.296080508474576, "grad_norm": 1.7823588848114014, "learning_rate": 8.35209216101695e-06, "loss": 1.4165, "mean_token_accuracy": 0.6797821298241615, "num_tokens": 10016825.0, "step": 12446 }, { "epoch": 3.2966101694915255, "grad_norm": 2.172679901123047, "learning_rate": 8.351827330508476e-06, "loss": 1.3637, "mean_token_accuracy": 0.6877687722444534, "num_tokens": 10018268.0, "step": 12448 }, { "epoch": 3.2971398305084745, "grad_norm": 1.560598373413086, "learning_rate": 8.3515625e-06, "loss": 1.0205, "mean_token_accuracy": 0.7428949028253555, "num_tokens": 10019950.0, "step": 12450 }, { "epoch": 3.297669491525424, "grad_norm": 2.3553621768951416, "learning_rate": 8.351297669491525e-06, "loss": 1.524, "mean_token_accuracy": 0.6618661284446716, "num_tokens": 10021564.0, "step": 12452 }, { "epoch": 3.298199152542373, "grad_norm": 2.06825852394104, "learning_rate": 8.351032838983052e-06, "loss": 1.0057, "mean_token_accuracy": 0.7493012621998787, "num_tokens": 10023038.0, "step": 12454 }, { "epoch": 3.298728813559322, "grad_norm": 1.693405270576477, "learning_rate": 8.350768008474577e-06, "loss": 1.2531, "mean_token_accuracy": 0.6840023174881935, "num_tokens": 10024482.0, "step": 12456 }, { "epoch": 3.299258474576271, "grad_norm": 2.246415615081787, "learning_rate": 8.350503177966102e-06, "loss": 1.5415, "mean_token_accuracy": 0.6674579307436943, "num_tokens": 10026060.0, "step": 12458 }, { "epoch": 3.29978813559322, "grad_norm": 1.7576581239700317, "learning_rate": 8.350238347457627e-06, "loss": 1.1073, "mean_token_accuracy": 0.7474962770938873, "num_tokens": 10027747.0, "step": 12460 }, { "epoch": 3.3003177966101696, "grad_norm": 2.161479949951172, "learning_rate": 8.349973516949154e-06, "loss": 1.3662, "mean_token_accuracy": 0.68038559705019, "num_tokens": 10029218.0, "step": 12462 }, { "epoch": 3.3008474576271185, "grad_norm": 1.702489972114563, "learning_rate": 8.349708686440678e-06, "loss": 1.1278, "mean_token_accuracy": 0.7177342474460602, "num_tokens": 10030955.0, "step": 12464 }, { "epoch": 3.301377118644068, "grad_norm": 1.958000898361206, "learning_rate": 8.349443855932205e-06, "loss": 1.8232, "mean_token_accuracy": 0.6280074864625931, "num_tokens": 10032585.0, "step": 12466 }, { "epoch": 3.301906779661017, "grad_norm": 2.0537586212158203, "learning_rate": 8.34917902542373e-06, "loss": 1.2047, "mean_token_accuracy": 0.7123869732022285, "num_tokens": 10034170.0, "step": 12468 }, { "epoch": 3.3024364406779663, "grad_norm": 2.157137155532837, "learning_rate": 8.348914194915255e-06, "loss": 1.4997, "mean_token_accuracy": 0.6683617532253265, "num_tokens": 10035709.0, "step": 12470 }, { "epoch": 3.3029661016949152, "grad_norm": 1.310558795928955, "learning_rate": 8.34864936440678e-06, "loss": 0.9362, "mean_token_accuracy": 0.7720804214477539, "num_tokens": 10037440.0, "step": 12472 }, { "epoch": 3.303495762711864, "grad_norm": 1.936568260192871, "learning_rate": 8.348384533898306e-06, "loss": 1.5634, "mean_token_accuracy": 0.6225731670856476, "num_tokens": 10039146.0, "step": 12474 }, { "epoch": 3.3040254237288136, "grad_norm": 2.153857469558716, "learning_rate": 8.348119703389831e-06, "loss": 1.4257, "mean_token_accuracy": 0.6911552138626575, "num_tokens": 10040652.0, "step": 12476 }, { "epoch": 3.3045550847457625, "grad_norm": 2.034923791885376, "learning_rate": 8.347854872881356e-06, "loss": 1.2527, "mean_token_accuracy": 0.7127060666680336, "num_tokens": 10041816.0, "step": 12478 }, { "epoch": 3.305084745762712, "grad_norm": 1.8022246360778809, "learning_rate": 8.347590042372881e-06, "loss": 1.4789, "mean_token_accuracy": 0.6930824294686317, "num_tokens": 10043291.0, "step": 12480 }, { "epoch": 3.305614406779661, "grad_norm": 1.4781688451766968, "learning_rate": 8.347325211864408e-06, "loss": 0.7923, "mean_token_accuracy": 0.7917792946100235, "num_tokens": 10044906.0, "step": 12482 }, { "epoch": 3.3061440677966103, "grad_norm": 1.7177094221115112, "learning_rate": 8.347060381355933e-06, "loss": 1.0595, "mean_token_accuracy": 0.7315410524606705, "num_tokens": 10046836.0, "step": 12484 }, { "epoch": 3.3066737288135593, "grad_norm": 1.6909123659133911, "learning_rate": 8.346795550847458e-06, "loss": 1.3516, "mean_token_accuracy": 0.7197216153144836, "num_tokens": 10048743.0, "step": 12486 }, { "epoch": 3.3072033898305087, "grad_norm": 2.2010231018066406, "learning_rate": 8.346530720338983e-06, "loss": 1.3437, "mean_token_accuracy": 0.7016001045703888, "num_tokens": 10050101.0, "step": 12488 }, { "epoch": 3.3077330508474576, "grad_norm": 1.7139256000518799, "learning_rate": 8.34626588983051e-06, "loss": 1.5134, "mean_token_accuracy": 0.6929753422737122, "num_tokens": 10051819.0, "step": 12490 }, { "epoch": 3.3082627118644066, "grad_norm": 1.5315700769424438, "learning_rate": 8.346001059322034e-06, "loss": 1.2349, "mean_token_accuracy": 0.740762609988451, "num_tokens": 10053450.0, "step": 12492 }, { "epoch": 3.308792372881356, "grad_norm": 2.039677381515503, "learning_rate": 8.345736228813561e-06, "loss": 1.1976, "mean_token_accuracy": 0.7539186328649521, "num_tokens": 10055334.0, "step": 12494 }, { "epoch": 3.309322033898305, "grad_norm": 2.0406441688537598, "learning_rate": 8.345471398305086e-06, "loss": 1.4592, "mean_token_accuracy": 0.6777561157941818, "num_tokens": 10057213.0, "step": 12496 }, { "epoch": 3.3098516949152543, "grad_norm": 1.7123554944992065, "learning_rate": 8.34520656779661e-06, "loss": 1.076, "mean_token_accuracy": 0.7460861504077911, "num_tokens": 10058452.0, "step": 12498 }, { "epoch": 3.3103813559322033, "grad_norm": 2.12703800201416, "learning_rate": 8.344941737288136e-06, "loss": 1.2269, "step": 12500 }, { "epoch": 3.3103813559322033, "eval_loss": 1.3114397525787354, "eval_mean_token_accuracy": 0.7013189061121508, "eval_num_tokens": 10059957.0, "eval_runtime": 48.1606, "eval_samples_per_second": 6.395, "eval_steps_per_second": 6.395, "step": 12500 }, { "epoch": 3.3109110169491527, "grad_norm": 2.249854564666748, "learning_rate": 8.344676906779662e-06, "loss": 1.3441, "mean_token_accuracy": 0.7161918617784977, "num_tokens": 10061720.0, "step": 12502 }, { "epoch": 3.3114406779661016, "grad_norm": 1.5058568716049194, "learning_rate": 8.344412076271187e-06, "loss": 1.0791, "mean_token_accuracy": 0.7275108471512794, "num_tokens": 10063755.0, "step": 12504 }, { "epoch": 3.311970338983051, "grad_norm": 2.1491541862487793, "learning_rate": 8.344147245762712e-06, "loss": 1.3072, "mean_token_accuracy": 0.7072028368711472, "num_tokens": 10065265.0, "step": 12506 }, { "epoch": 3.3125, "grad_norm": 2.3168933391571045, "learning_rate": 8.343882415254237e-06, "loss": 1.6003, "mean_token_accuracy": 0.6431705392897129, "num_tokens": 10066968.0, "step": 12508 }, { "epoch": 3.313029661016949, "grad_norm": 1.7153770923614502, "learning_rate": 8.343617584745764e-06, "loss": 1.2021, "mean_token_accuracy": 0.735790491104126, "num_tokens": 10068154.0, "step": 12510 }, { "epoch": 3.3135593220338984, "grad_norm": 1.9553529024124146, "learning_rate": 8.343352754237289e-06, "loss": 1.5351, "mean_token_accuracy": 0.6433973163366318, "num_tokens": 10070190.0, "step": 12512 }, { "epoch": 3.3140889830508473, "grad_norm": 2.3236653804779053, "learning_rate": 8.343087923728814e-06, "loss": 1.2801, "mean_token_accuracy": 0.720662958920002, "num_tokens": 10071657.0, "step": 12514 }, { "epoch": 3.3146186440677967, "grad_norm": 1.6473509073257446, "learning_rate": 8.342823093220338e-06, "loss": 0.9342, "mean_token_accuracy": 0.757447212934494, "num_tokens": 10073320.0, "step": 12516 }, { "epoch": 3.3151483050847457, "grad_norm": 1.9818886518478394, "learning_rate": 8.342558262711865e-06, "loss": 1.1517, "mean_token_accuracy": 0.725915290415287, "num_tokens": 10074952.0, "step": 12518 }, { "epoch": 3.315677966101695, "grad_norm": 2.0923643112182617, "learning_rate": 8.342293432203392e-06, "loss": 1.7036, "mean_token_accuracy": 0.6468258202075958, "num_tokens": 10076715.0, "step": 12520 }, { "epoch": 3.316207627118644, "grad_norm": 1.8199220895767212, "learning_rate": 8.342028601694917e-06, "loss": 1.3926, "mean_token_accuracy": 0.6764410361647606, "num_tokens": 10078456.0, "step": 12522 }, { "epoch": 3.3167372881355934, "grad_norm": 1.3833609819412231, "learning_rate": 8.341763771186442e-06, "loss": 0.759, "mean_token_accuracy": 0.8037121742963791, "num_tokens": 10080275.0, "step": 12524 }, { "epoch": 3.3172669491525424, "grad_norm": 1.5389878749847412, "learning_rate": 8.341498940677967e-06, "loss": 1.4549, "mean_token_accuracy": 0.691177673637867, "num_tokens": 10081924.0, "step": 12526 }, { "epoch": 3.3177966101694913, "grad_norm": 2.0761377811431885, "learning_rate": 8.341234110169493e-06, "loss": 1.7503, "mean_token_accuracy": 0.6181731522083282, "num_tokens": 10083641.0, "step": 12528 }, { "epoch": 3.3183262711864407, "grad_norm": 1.5947813987731934, "learning_rate": 8.340969279661018e-06, "loss": 0.9564, "mean_token_accuracy": 0.7640422247350216, "num_tokens": 10085182.0, "step": 12530 }, { "epoch": 3.3188559322033897, "grad_norm": 1.7510274648666382, "learning_rate": 8.340704449152543e-06, "loss": 1.4122, "mean_token_accuracy": 0.6667098626494408, "num_tokens": 10086741.0, "step": 12532 }, { "epoch": 3.319385593220339, "grad_norm": 2.1037392616271973, "learning_rate": 8.340439618644068e-06, "loss": 1.5856, "mean_token_accuracy": 0.6700781807303429, "num_tokens": 10088283.0, "step": 12534 }, { "epoch": 3.319915254237288, "grad_norm": 2.214664936065674, "learning_rate": 8.340174788135595e-06, "loss": 1.3316, "mean_token_accuracy": 0.6973356157541275, "num_tokens": 10089720.0, "step": 12536 }, { "epoch": 3.3204449152542375, "grad_norm": 1.6454880237579346, "learning_rate": 8.33990995762712e-06, "loss": 1.0879, "mean_token_accuracy": 0.726103700697422, "num_tokens": 10091360.0, "step": 12538 }, { "epoch": 3.3209745762711864, "grad_norm": 1.6388871669769287, "learning_rate": 8.339645127118644e-06, "loss": 1.3397, "mean_token_accuracy": 0.692733496427536, "num_tokens": 10093122.0, "step": 12540 }, { "epoch": 3.321504237288136, "grad_norm": 1.8113043308258057, "learning_rate": 8.33938029661017e-06, "loss": 1.2154, "mean_token_accuracy": 0.7146070897579193, "num_tokens": 10094681.0, "step": 12542 }, { "epoch": 3.3220338983050848, "grad_norm": 2.184185266494751, "learning_rate": 8.339115466101696e-06, "loss": 1.3737, "mean_token_accuracy": 0.6877282336354256, "num_tokens": 10096334.0, "step": 12544 }, { "epoch": 3.3225635593220337, "grad_norm": 1.8941465616226196, "learning_rate": 8.338850635593221e-06, "loss": 1.3526, "mean_token_accuracy": 0.6905824393033981, "num_tokens": 10097995.0, "step": 12546 }, { "epoch": 3.323093220338983, "grad_norm": 1.6735352277755737, "learning_rate": 8.338585805084748e-06, "loss": 1.3762, "mean_token_accuracy": 0.6871441975235939, "num_tokens": 10099543.0, "step": 12548 }, { "epoch": 3.323622881355932, "grad_norm": 2.4584264755249023, "learning_rate": 8.338320974576272e-06, "loss": 1.9068, "mean_token_accuracy": 0.6145181208848953, "num_tokens": 10101102.0, "step": 12550 }, { "epoch": 3.3241525423728815, "grad_norm": 1.9356768131256104, "learning_rate": 8.338056144067797e-06, "loss": 0.8244, "mean_token_accuracy": 0.7825702801346779, "num_tokens": 10102636.0, "step": 12552 }, { "epoch": 3.3246822033898304, "grad_norm": 1.8512756824493408, "learning_rate": 8.337791313559322e-06, "loss": 1.3178, "mean_token_accuracy": 0.7060783840715885, "num_tokens": 10104224.0, "step": 12554 }, { "epoch": 3.32521186440678, "grad_norm": 1.99457585811615, "learning_rate": 8.337526483050849e-06, "loss": 1.2783, "mean_token_accuracy": 0.7268588244915009, "num_tokens": 10105847.0, "step": 12556 }, { "epoch": 3.325741525423729, "grad_norm": 1.7281959056854248, "learning_rate": 8.337261652542374e-06, "loss": 1.3898, "mean_token_accuracy": 0.6754611730575562, "num_tokens": 10107374.0, "step": 12558 }, { "epoch": 3.326271186440678, "grad_norm": 2.0928139686584473, "learning_rate": 8.336996822033899e-06, "loss": 1.1555, "mean_token_accuracy": 0.7186137139797211, "num_tokens": 10109027.0, "step": 12560 }, { "epoch": 3.326800847457627, "grad_norm": 1.9291479587554932, "learning_rate": 8.336731991525424e-06, "loss": 0.8619, "mean_token_accuracy": 0.7901335656642914, "num_tokens": 10110436.0, "step": 12562 }, { "epoch": 3.327330508474576, "grad_norm": 2.069978952407837, "learning_rate": 8.33646716101695e-06, "loss": 1.5787, "mean_token_accuracy": 0.6814093217253685, "num_tokens": 10111757.0, "step": 12564 }, { "epoch": 3.3278601694915255, "grad_norm": 2.1145238876342773, "learning_rate": 8.336202330508475e-06, "loss": 1.6049, "mean_token_accuracy": 0.6539304405450821, "num_tokens": 10113232.0, "step": 12566 }, { "epoch": 3.3283898305084745, "grad_norm": 2.064863920211792, "learning_rate": 8.3359375e-06, "loss": 1.2017, "mean_token_accuracy": 0.7560161165893078, "num_tokens": 10114630.0, "step": 12568 }, { "epoch": 3.328919491525424, "grad_norm": 1.700682282447815, "learning_rate": 8.335672669491525e-06, "loss": 1.405, "mean_token_accuracy": 0.6842199340462685, "num_tokens": 10116121.0, "step": 12570 }, { "epoch": 3.329449152542373, "grad_norm": 2.096673011779785, "learning_rate": 8.335407838983052e-06, "loss": 1.377, "mean_token_accuracy": 0.7005037069320679, "num_tokens": 10117535.0, "step": 12572 }, { "epoch": 3.329978813559322, "grad_norm": 2.107417583465576, "learning_rate": 8.335143008474577e-06, "loss": 1.6168, "mean_token_accuracy": 0.6422978416085243, "num_tokens": 10119145.0, "step": 12574 }, { "epoch": 3.330508474576271, "grad_norm": 1.9830125570297241, "learning_rate": 8.334878177966103e-06, "loss": 1.5067, "mean_token_accuracy": 0.6443231180310249, "num_tokens": 10120711.0, "step": 12576 }, { "epoch": 3.33103813559322, "grad_norm": 1.9474166631698608, "learning_rate": 8.334613347457628e-06, "loss": 1.4629, "mean_token_accuracy": 0.658123604953289, "num_tokens": 10122476.0, "step": 12578 }, { "epoch": 3.3315677966101696, "grad_norm": 1.7776286602020264, "learning_rate": 8.334348516949153e-06, "loss": 1.5921, "mean_token_accuracy": 0.6551658287644386, "num_tokens": 10124124.0, "step": 12580 }, { "epoch": 3.3320974576271185, "grad_norm": 2.5864219665527344, "learning_rate": 8.334083686440678e-06, "loss": 1.1922, "mean_token_accuracy": 0.7380006164312363, "num_tokens": 10125643.0, "step": 12582 }, { "epoch": 3.332627118644068, "grad_norm": 1.9379178285598755, "learning_rate": 8.333818855932205e-06, "loss": 1.6505, "mean_token_accuracy": 0.6651154160499573, "num_tokens": 10127227.0, "step": 12584 }, { "epoch": 3.333156779661017, "grad_norm": 1.9466686248779297, "learning_rate": 8.33355402542373e-06, "loss": 1.4355, "mean_token_accuracy": 0.6917852684855461, "num_tokens": 10128998.0, "step": 12586 }, { "epoch": 3.3336864406779663, "grad_norm": 1.7764174938201904, "learning_rate": 8.333289194915255e-06, "loss": 1.0002, "mean_token_accuracy": 0.7466037943959236, "num_tokens": 10130368.0, "step": 12588 }, { "epoch": 3.3342161016949152, "grad_norm": 1.5707193613052368, "learning_rate": 8.33302436440678e-06, "loss": 1.2658, "mean_token_accuracy": 0.7059864103794098, "num_tokens": 10132529.0, "step": 12590 }, { "epoch": 3.334745762711864, "grad_norm": 1.5989384651184082, "learning_rate": 8.332759533898306e-06, "loss": 0.8988, "mean_token_accuracy": 0.7649412080645561, "num_tokens": 10134205.0, "step": 12592 }, { "epoch": 3.3352754237288136, "grad_norm": 2.306032657623291, "learning_rate": 8.332494703389831e-06, "loss": 1.5749, "mean_token_accuracy": 0.6625603660941124, "num_tokens": 10135494.0, "step": 12594 }, { "epoch": 3.3358050847457625, "grad_norm": 2.0157365798950195, "learning_rate": 8.332229872881356e-06, "loss": 1.1754, "mean_token_accuracy": 0.7304239794611931, "num_tokens": 10136899.0, "step": 12596 }, { "epoch": 3.336334745762712, "grad_norm": 1.3400932550430298, "learning_rate": 8.331965042372881e-06, "loss": 0.8846, "mean_token_accuracy": 0.7713016495108604, "num_tokens": 10138732.0, "step": 12598 }, { "epoch": 3.336864406779661, "grad_norm": 1.9199773073196411, "learning_rate": 8.331700211864408e-06, "loss": 1.4739, "mean_token_accuracy": 0.6757242158055305, "num_tokens": 10140288.0, "step": 12600 }, { "epoch": 3.3373940677966103, "grad_norm": 2.6580071449279785, "learning_rate": 8.331435381355934e-06, "loss": 1.2741, "mean_token_accuracy": 0.7150172367691994, "num_tokens": 10141650.0, "step": 12602 }, { "epoch": 3.3379237288135593, "grad_norm": 1.9649585485458374, "learning_rate": 8.331170550847459e-06, "loss": 1.3724, "mean_token_accuracy": 0.7029895484447479, "num_tokens": 10143322.0, "step": 12604 }, { "epoch": 3.3384533898305087, "grad_norm": 1.9461885690689087, "learning_rate": 8.330905720338984e-06, "loss": 1.4511, "mean_token_accuracy": 0.6822459623217583, "num_tokens": 10144960.0, "step": 12606 }, { "epoch": 3.3389830508474576, "grad_norm": 2.1047134399414062, "learning_rate": 8.330640889830509e-06, "loss": 1.2238, "mean_token_accuracy": 0.7245667949318886, "num_tokens": 10146374.0, "step": 12608 }, { "epoch": 3.3395127118644066, "grad_norm": 2.0182411670684814, "learning_rate": 8.330376059322036e-06, "loss": 1.4161, "mean_token_accuracy": 0.6877226307988167, "num_tokens": 10147752.0, "step": 12610 }, { "epoch": 3.340042372881356, "grad_norm": 2.1141481399536133, "learning_rate": 8.33011122881356e-06, "loss": 1.9423, "mean_token_accuracy": 0.5956268310546875, "num_tokens": 10149369.0, "step": 12612 }, { "epoch": 3.340572033898305, "grad_norm": 2.08259916305542, "learning_rate": 8.329846398305085e-06, "loss": 1.6902, "mean_token_accuracy": 0.6310912445187569, "num_tokens": 10150939.0, "step": 12614 }, { "epoch": 3.3411016949152543, "grad_norm": 2.2070019245147705, "learning_rate": 8.32958156779661e-06, "loss": 1.3014, "mean_token_accuracy": 0.7094997689127922, "num_tokens": 10152282.0, "step": 12616 }, { "epoch": 3.3416313559322033, "grad_norm": 1.5083699226379395, "learning_rate": 8.329316737288137e-06, "loss": 0.9312, "mean_token_accuracy": 0.7739696055650711, "num_tokens": 10153791.0, "step": 12618 }, { "epoch": 3.3421610169491527, "grad_norm": 2.1406090259552, "learning_rate": 8.329051906779662e-06, "loss": 1.9058, "mean_token_accuracy": 0.6345521546900272, "num_tokens": 10155597.0, "step": 12620 }, { "epoch": 3.3426906779661016, "grad_norm": 1.8380554914474487, "learning_rate": 8.328787076271187e-06, "loss": 0.8307, "mean_token_accuracy": 0.7883079126477242, "num_tokens": 10156842.0, "step": 12622 }, { "epoch": 3.343220338983051, "grad_norm": 2.1539900302886963, "learning_rate": 8.328522245762712e-06, "loss": 1.2095, "mean_token_accuracy": 0.7276935130357742, "num_tokens": 10158154.0, "step": 12624 }, { "epoch": 3.34375, "grad_norm": 1.9116451740264893, "learning_rate": 8.328257415254238e-06, "loss": 1.4099, "mean_token_accuracy": 0.6660432741045952, "num_tokens": 10159787.0, "step": 12626 }, { "epoch": 3.344279661016949, "grad_norm": 2.387211322784424, "learning_rate": 8.327992584745763e-06, "loss": 1.3859, "mean_token_accuracy": 0.6876217871904373, "num_tokens": 10161297.0, "step": 12628 }, { "epoch": 3.3448093220338984, "grad_norm": 1.910582184791565, "learning_rate": 8.32772775423729e-06, "loss": 1.2313, "mean_token_accuracy": 0.7536084726452827, "num_tokens": 10163086.0, "step": 12630 }, { "epoch": 3.3453389830508473, "grad_norm": 2.1950695514678955, "learning_rate": 8.327462923728815e-06, "loss": 1.6541, "mean_token_accuracy": 0.6290609613060951, "num_tokens": 10164583.0, "step": 12632 }, { "epoch": 3.3458686440677967, "grad_norm": 1.8905625343322754, "learning_rate": 8.32719809322034e-06, "loss": 1.329, "mean_token_accuracy": 0.7161627411842346, "num_tokens": 10166270.0, "step": 12634 }, { "epoch": 3.3463983050847457, "grad_norm": 2.092804193496704, "learning_rate": 8.326933262711865e-06, "loss": 1.5076, "mean_token_accuracy": 0.6605455577373505, "num_tokens": 10167731.0, "step": 12636 }, { "epoch": 3.346927966101695, "grad_norm": 2.139545202255249, "learning_rate": 8.326668432203391e-06, "loss": 1.4017, "mean_token_accuracy": 0.673332117497921, "num_tokens": 10169225.0, "step": 12638 }, { "epoch": 3.347457627118644, "grad_norm": 1.9001497030258179, "learning_rate": 8.326403601694916e-06, "loss": 1.3481, "mean_token_accuracy": 0.6664182916283607, "num_tokens": 10170671.0, "step": 12640 }, { "epoch": 3.3479872881355934, "grad_norm": 1.8910741806030273, "learning_rate": 8.326138771186441e-06, "loss": 0.9774, "mean_token_accuracy": 0.7575536742806435, "num_tokens": 10172078.0, "step": 12642 }, { "epoch": 3.3485169491525424, "grad_norm": 2.496443510055542, "learning_rate": 8.325873940677966e-06, "loss": 1.2365, "mean_token_accuracy": 0.710359625518322, "num_tokens": 10173423.0, "step": 12644 }, { "epoch": 3.3490466101694913, "grad_norm": 1.9230111837387085, "learning_rate": 8.325609110169493e-06, "loss": 1.1654, "mean_token_accuracy": 0.732866682112217, "num_tokens": 10175017.0, "step": 12646 }, { "epoch": 3.3495762711864407, "grad_norm": 1.8711258172988892, "learning_rate": 8.325344279661018e-06, "loss": 1.4064, "mean_token_accuracy": 0.6867628619074821, "num_tokens": 10176398.0, "step": 12648 }, { "epoch": 3.3501059322033897, "grad_norm": 1.8690180778503418, "learning_rate": 8.325079449152543e-06, "loss": 1.1413, "mean_token_accuracy": 0.7120084911584854, "num_tokens": 10177854.0, "step": 12650 }, { "epoch": 3.350635593220339, "grad_norm": 1.9727848768234253, "learning_rate": 8.324814618644068e-06, "loss": 1.344, "mean_token_accuracy": 0.6789861097931862, "num_tokens": 10179341.0, "step": 12652 }, { "epoch": 3.351165254237288, "grad_norm": 2.134415864944458, "learning_rate": 8.324549788135594e-06, "loss": 0.9297, "mean_token_accuracy": 0.7681689709424973, "num_tokens": 10180600.0, "step": 12654 }, { "epoch": 3.3516949152542375, "grad_norm": 1.8107692003250122, "learning_rate": 8.324284957627119e-06, "loss": 1.1004, "mean_token_accuracy": 0.7370665371417999, "num_tokens": 10182334.0, "step": 12656 }, { "epoch": 3.3522245762711864, "grad_norm": 1.5468733310699463, "learning_rate": 8.324020127118646e-06, "loss": 1.1965, "mean_token_accuracy": 0.7442027181386948, "num_tokens": 10184021.0, "step": 12658 }, { "epoch": 3.352754237288136, "grad_norm": 1.4178600311279297, "learning_rate": 8.323755296610169e-06, "loss": 1.2137, "mean_token_accuracy": 0.730838842689991, "num_tokens": 10186109.0, "step": 12660 }, { "epoch": 3.3532838983050848, "grad_norm": 1.6063870191574097, "learning_rate": 8.323490466101696e-06, "loss": 1.1362, "mean_token_accuracy": 0.7560147047042847, "num_tokens": 10187945.0, "step": 12662 }, { "epoch": 3.3538135593220337, "grad_norm": 1.7908575534820557, "learning_rate": 8.32322563559322e-06, "loss": 1.5913, "mean_token_accuracy": 0.654128897935152, "num_tokens": 10189708.0, "step": 12664 }, { "epoch": 3.354343220338983, "grad_norm": 1.7416752576828003, "learning_rate": 8.322960805084747e-06, "loss": 1.0653, "mean_token_accuracy": 0.7504926472902298, "num_tokens": 10191542.0, "step": 12666 }, { "epoch": 3.354872881355932, "grad_norm": 2.152681589126587, "learning_rate": 8.322695974576272e-06, "loss": 1.635, "mean_token_accuracy": 0.66298583522439, "num_tokens": 10193011.0, "step": 12668 }, { "epoch": 3.3554025423728815, "grad_norm": 1.4934370517730713, "learning_rate": 8.322431144067797e-06, "loss": 0.8525, "mean_token_accuracy": 0.7641373574733734, "num_tokens": 10194616.0, "step": 12670 }, { "epoch": 3.3559322033898304, "grad_norm": 2.0047378540039062, "learning_rate": 8.322166313559322e-06, "loss": 1.2808, "mean_token_accuracy": 0.7221788763999939, "num_tokens": 10196158.0, "step": 12672 }, { "epoch": 3.35646186440678, "grad_norm": 1.9199519157409668, "learning_rate": 8.321901483050849e-06, "loss": 1.324, "mean_token_accuracy": 0.7043797075748444, "num_tokens": 10197603.0, "step": 12674 }, { "epoch": 3.356991525423729, "grad_norm": 1.6706463098526, "learning_rate": 8.321636652542373e-06, "loss": 1.3152, "mean_token_accuracy": 0.7296693623065948, "num_tokens": 10199399.0, "step": 12676 }, { "epoch": 3.357521186440678, "grad_norm": 1.756508231163025, "learning_rate": 8.321371822033898e-06, "loss": 1.2138, "mean_token_accuracy": 0.722723200917244, "num_tokens": 10200946.0, "step": 12678 }, { "epoch": 3.358050847457627, "grad_norm": 1.7992873191833496, "learning_rate": 8.321106991525423e-06, "loss": 1.5263, "mean_token_accuracy": 0.6550123132765293, "num_tokens": 10202477.0, "step": 12680 }, { "epoch": 3.358580508474576, "grad_norm": 1.8444061279296875, "learning_rate": 8.32084216101695e-06, "loss": 0.9294, "mean_token_accuracy": 0.767836295068264, "num_tokens": 10204263.0, "step": 12682 }, { "epoch": 3.3591101694915255, "grad_norm": 1.7192583084106445, "learning_rate": 8.320577330508475e-06, "loss": 1.2289, "mean_token_accuracy": 0.7009698562324047, "num_tokens": 10206356.0, "step": 12684 }, { "epoch": 3.3596398305084745, "grad_norm": 1.3396167755126953, "learning_rate": 8.320312500000001e-06, "loss": 0.8558, "mean_token_accuracy": 0.7790722548961639, "num_tokens": 10208183.0, "step": 12686 }, { "epoch": 3.360169491525424, "grad_norm": 1.7208033800125122, "learning_rate": 8.320047669491526e-06, "loss": 1.3294, "mean_token_accuracy": 0.7007024437189102, "num_tokens": 10209597.0, "step": 12688 }, { "epoch": 3.360699152542373, "grad_norm": 1.8723965883255005, "learning_rate": 8.319782838983051e-06, "loss": 1.2271, "mean_token_accuracy": 0.7266354560852051, "num_tokens": 10211401.0, "step": 12690 }, { "epoch": 3.361228813559322, "grad_norm": 1.7429838180541992, "learning_rate": 8.319518008474578e-06, "loss": 1.2175, "mean_token_accuracy": 0.7081640064716339, "num_tokens": 10213298.0, "step": 12692 }, { "epoch": 3.361758474576271, "grad_norm": 2.094641923904419, "learning_rate": 8.319253177966103e-06, "loss": 1.6004, "mean_token_accuracy": 0.664365753531456, "num_tokens": 10214709.0, "step": 12694 }, { "epoch": 3.36228813559322, "grad_norm": 1.934831142425537, "learning_rate": 8.318988347457628e-06, "loss": 1.0897, "mean_token_accuracy": 0.7300724983215332, "num_tokens": 10216278.0, "step": 12696 }, { "epoch": 3.3628177966101696, "grad_norm": 1.6815581321716309, "learning_rate": 8.318723516949153e-06, "loss": 1.0232, "mean_token_accuracy": 0.7693808004260063, "num_tokens": 10217660.0, "step": 12698 }, { "epoch": 3.3633474576271185, "grad_norm": 1.7171275615692139, "learning_rate": 8.31845868644068e-06, "loss": 1.2411, "mean_token_accuracy": 0.6961178034543991, "num_tokens": 10219366.0, "step": 12700 }, { "epoch": 3.363877118644068, "grad_norm": 1.969842553138733, "learning_rate": 8.318193855932204e-06, "loss": 1.2394, "mean_token_accuracy": 0.7118778824806213, "num_tokens": 10220862.0, "step": 12702 }, { "epoch": 3.364406779661017, "grad_norm": 2.0475683212280273, "learning_rate": 8.31792902542373e-06, "loss": 1.4188, "mean_token_accuracy": 0.704345315694809, "num_tokens": 10222451.0, "step": 12704 }, { "epoch": 3.3649364406779663, "grad_norm": 1.80470609664917, "learning_rate": 8.317664194915254e-06, "loss": 0.9783, "mean_token_accuracy": 0.7717979699373245, "num_tokens": 10223903.0, "step": 12706 }, { "epoch": 3.3654661016949152, "grad_norm": 1.9302722215652466, "learning_rate": 8.31739936440678e-06, "loss": 1.3233, "mean_token_accuracy": 0.6937280148267746, "num_tokens": 10225674.0, "step": 12708 }, { "epoch": 3.365995762711864, "grad_norm": 2.0087249279022217, "learning_rate": 8.317134533898306e-06, "loss": 1.3681, "mean_token_accuracy": 0.7017062157392502, "num_tokens": 10227267.0, "step": 12710 }, { "epoch": 3.3665254237288136, "grad_norm": 1.4752014875411987, "learning_rate": 8.316869703389832e-06, "loss": 0.8079, "mean_token_accuracy": 0.799886517226696, "num_tokens": 10228632.0, "step": 12712 }, { "epoch": 3.3670550847457625, "grad_norm": 1.6860952377319336, "learning_rate": 8.316604872881356e-06, "loss": 1.2074, "mean_token_accuracy": 0.732621043920517, "num_tokens": 10230262.0, "step": 12714 }, { "epoch": 3.367584745762712, "grad_norm": 2.360192060470581, "learning_rate": 8.316340042372882e-06, "loss": 1.2825, "mean_token_accuracy": 0.6981389224529266, "num_tokens": 10231838.0, "step": 12716 }, { "epoch": 3.368114406779661, "grad_norm": 1.6772938966751099, "learning_rate": 8.316075211864407e-06, "loss": 1.4313, "mean_token_accuracy": 0.6518514826893806, "num_tokens": 10233687.0, "step": 12718 }, { "epoch": 3.3686440677966103, "grad_norm": 2.1759192943573, "learning_rate": 8.315810381355934e-06, "loss": 1.0437, "mean_token_accuracy": 0.7661308944225311, "num_tokens": 10235030.0, "step": 12720 }, { "epoch": 3.3691737288135593, "grad_norm": 1.8709264993667603, "learning_rate": 8.315545550847459e-06, "loss": 1.4798, "mean_token_accuracy": 0.6796977519989014, "num_tokens": 10236668.0, "step": 12722 }, { "epoch": 3.3697033898305087, "grad_norm": 2.119109869003296, "learning_rate": 8.315280720338984e-06, "loss": 1.3904, "mean_token_accuracy": 0.6773199439048767, "num_tokens": 10238321.0, "step": 12724 }, { "epoch": 3.3702330508474576, "grad_norm": 2.0208442211151123, "learning_rate": 8.315015889830509e-06, "loss": 1.7623, "mean_token_accuracy": 0.5955960527062416, "num_tokens": 10239871.0, "step": 12726 }, { "epoch": 3.3707627118644066, "grad_norm": 1.7198313474655151, "learning_rate": 8.314751059322035e-06, "loss": 1.0387, "mean_token_accuracy": 0.7596453949809074, "num_tokens": 10241485.0, "step": 12728 }, { "epoch": 3.371292372881356, "grad_norm": 1.9714621305465698, "learning_rate": 8.31448622881356e-06, "loss": 1.2604, "mean_token_accuracy": 0.6809565722942352, "num_tokens": 10243216.0, "step": 12730 }, { "epoch": 3.371822033898305, "grad_norm": 1.81134033203125, "learning_rate": 8.314221398305085e-06, "loss": 1.0314, "mean_token_accuracy": 0.7398376688361168, "num_tokens": 10244629.0, "step": 12732 }, { "epoch": 3.3723516949152543, "grad_norm": 2.0260169506073, "learning_rate": 8.31395656779661e-06, "loss": 1.3683, "mean_token_accuracy": 0.70931476354599, "num_tokens": 10246133.0, "step": 12734 }, { "epoch": 3.3728813559322033, "grad_norm": 2.1044981479644775, "learning_rate": 8.313691737288137e-06, "loss": 0.7612, "mean_token_accuracy": 0.7940522879362106, "num_tokens": 10247641.0, "step": 12736 }, { "epoch": 3.3734110169491527, "grad_norm": 1.6949025392532349, "learning_rate": 8.313426906779662e-06, "loss": 1.4135, "mean_token_accuracy": 0.669059082865715, "num_tokens": 10249279.0, "step": 12738 }, { "epoch": 3.3739406779661016, "grad_norm": 1.9626187086105347, "learning_rate": 8.313162076271188e-06, "loss": 1.3249, "mean_token_accuracy": 0.7063435465097427, "num_tokens": 10250710.0, "step": 12740 }, { "epoch": 3.374470338983051, "grad_norm": 1.9571624994277954, "learning_rate": 8.312897245762711e-06, "loss": 1.269, "mean_token_accuracy": 0.7048767507076263, "num_tokens": 10252351.0, "step": 12742 }, { "epoch": 3.375, "grad_norm": 1.7985833883285522, "learning_rate": 8.312632415254238e-06, "loss": 0.8061, "mean_token_accuracy": 0.7898903116583824, "num_tokens": 10253812.0, "step": 12744 }, { "epoch": 3.375529661016949, "grad_norm": 1.9321975708007812, "learning_rate": 8.312367584745763e-06, "loss": 1.4459, "mean_token_accuracy": 0.6937912106513977, "num_tokens": 10255461.0, "step": 12746 }, { "epoch": 3.3760593220338984, "grad_norm": 2.0074262619018555, "learning_rate": 8.31210275423729e-06, "loss": 1.5035, "mean_token_accuracy": 0.6726662293076515, "num_tokens": 10256847.0, "step": 12748 }, { "epoch": 3.3765889830508473, "grad_norm": 1.692520022392273, "learning_rate": 8.311837923728814e-06, "loss": 1.0683, "step": 12750 }, { "epoch": 3.3765889830508473, "eval_loss": 1.311171531677246, "eval_mean_token_accuracy": 0.7015790755485559, "eval_num_tokens": 10258521.0, "eval_runtime": 48.3073, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 12750 }, { "epoch": 3.3771186440677967, "grad_norm": 1.8255146741867065, "learning_rate": 8.31157309322034e-06, "loss": 1.2588, "mean_token_accuracy": 0.71269291639328, "num_tokens": 10260469.0, "step": 12752 }, { "epoch": 3.3776483050847457, "grad_norm": 1.8316903114318848, "learning_rate": 8.311308262711864e-06, "loss": 1.6438, "mean_token_accuracy": 0.6196768581867218, "num_tokens": 10262168.0, "step": 12754 }, { "epoch": 3.378177966101695, "grad_norm": 2.090460777282715, "learning_rate": 8.311043432203391e-06, "loss": 1.5426, "mean_token_accuracy": 0.6613346561789513, "num_tokens": 10263711.0, "step": 12756 }, { "epoch": 3.378707627118644, "grad_norm": 1.9070007801055908, "learning_rate": 8.310778601694916e-06, "loss": 1.398, "mean_token_accuracy": 0.7038167864084244, "num_tokens": 10265191.0, "step": 12758 }, { "epoch": 3.3792372881355934, "grad_norm": 1.4884384870529175, "learning_rate": 8.31051377118644e-06, "loss": 1.2032, "mean_token_accuracy": 0.7179280444979668, "num_tokens": 10267068.0, "step": 12760 }, { "epoch": 3.3797669491525424, "grad_norm": 2.307217836380005, "learning_rate": 8.310248940677966e-06, "loss": 1.3349, "mean_token_accuracy": 0.6851356849074364, "num_tokens": 10268546.0, "step": 12762 }, { "epoch": 3.3802966101694913, "grad_norm": 2.061955213546753, "learning_rate": 8.309984110169492e-06, "loss": 1.432, "mean_token_accuracy": 0.7076505422592163, "num_tokens": 10270060.0, "step": 12764 }, { "epoch": 3.3808262711864407, "grad_norm": 2.378539562225342, "learning_rate": 8.309719279661017e-06, "loss": 1.716, "mean_token_accuracy": 0.6353047490119934, "num_tokens": 10271514.0, "step": 12766 }, { "epoch": 3.3813559322033897, "grad_norm": 1.8483461141586304, "learning_rate": 8.309454449152542e-06, "loss": 1.2815, "mean_token_accuracy": 0.6896428242325783, "num_tokens": 10273254.0, "step": 12768 }, { "epoch": 3.381885593220339, "grad_norm": 1.7823805809020996, "learning_rate": 8.309189618644067e-06, "loss": 1.3553, "mean_token_accuracy": 0.674164168536663, "num_tokens": 10274973.0, "step": 12770 }, { "epoch": 3.382415254237288, "grad_norm": 1.5639129877090454, "learning_rate": 8.308924788135594e-06, "loss": 1.0554, "mean_token_accuracy": 0.7532432675361633, "num_tokens": 10276812.0, "step": 12772 }, { "epoch": 3.3829449152542375, "grad_norm": 2.176567554473877, "learning_rate": 8.30865995762712e-06, "loss": 1.4799, "mean_token_accuracy": 0.6693031489849091, "num_tokens": 10278553.0, "step": 12774 }, { "epoch": 3.3834745762711864, "grad_norm": 1.9710129499435425, "learning_rate": 8.308395127118645e-06, "loss": 1.3608, "mean_token_accuracy": 0.7052203938364983, "num_tokens": 10279831.0, "step": 12776 }, { "epoch": 3.384004237288136, "grad_norm": 2.013740062713623, "learning_rate": 8.30813029661017e-06, "loss": 1.6636, "mean_token_accuracy": 0.6376052722334862, "num_tokens": 10281408.0, "step": 12778 }, { "epoch": 3.3845338983050848, "grad_norm": 1.4738107919692993, "learning_rate": 8.307865466101695e-06, "loss": 1.2126, "mean_token_accuracy": 0.7255878522992134, "num_tokens": 10282991.0, "step": 12780 }, { "epoch": 3.3850635593220337, "grad_norm": 1.7437489032745361, "learning_rate": 8.307600635593222e-06, "loss": 1.112, "mean_token_accuracy": 0.73700051009655, "num_tokens": 10284635.0, "step": 12782 }, { "epoch": 3.385593220338983, "grad_norm": 2.3338286876678467, "learning_rate": 8.307335805084747e-06, "loss": 1.1637, "mean_token_accuracy": 0.7078008651733398, "num_tokens": 10285886.0, "step": 12784 }, { "epoch": 3.386122881355932, "grad_norm": 1.658070683479309, "learning_rate": 8.307070974576272e-06, "loss": 0.9806, "mean_token_accuracy": 0.7502828538417816, "num_tokens": 10287277.0, "step": 12786 }, { "epoch": 3.3866525423728815, "grad_norm": 1.5841020345687866, "learning_rate": 8.306806144067797e-06, "loss": 1.2944, "mean_token_accuracy": 0.7092228196561337, "num_tokens": 10289060.0, "step": 12788 }, { "epoch": 3.3871822033898304, "grad_norm": 1.8254488706588745, "learning_rate": 8.306541313559323e-06, "loss": 1.5366, "mean_token_accuracy": 0.659178152680397, "num_tokens": 10290850.0, "step": 12790 }, { "epoch": 3.38771186440678, "grad_norm": 1.780615210533142, "learning_rate": 8.306276483050848e-06, "loss": 1.3342, "mean_token_accuracy": 0.7043256089091301, "num_tokens": 10292439.0, "step": 12792 }, { "epoch": 3.388241525423729, "grad_norm": 1.553112268447876, "learning_rate": 8.306011652542375e-06, "loss": 1.333, "mean_token_accuracy": 0.7035698741674423, "num_tokens": 10294279.0, "step": 12794 }, { "epoch": 3.388771186440678, "grad_norm": 2.013949394226074, "learning_rate": 8.305746822033898e-06, "loss": 0.9599, "mean_token_accuracy": 0.7750746458768845, "num_tokens": 10295768.0, "step": 12796 }, { "epoch": 3.389300847457627, "grad_norm": 1.5943635702133179, "learning_rate": 8.305481991525425e-06, "loss": 1.2246, "mean_token_accuracy": 0.7152341157197952, "num_tokens": 10297197.0, "step": 12798 }, { "epoch": 3.389830508474576, "grad_norm": 1.9926598072052002, "learning_rate": 8.30521716101695e-06, "loss": 1.3246, "mean_token_accuracy": 0.707488551735878, "num_tokens": 10298622.0, "step": 12800 }, { "epoch": 3.3903601694915255, "grad_norm": 1.7151919603347778, "learning_rate": 8.304952330508476e-06, "loss": 1.1236, "mean_token_accuracy": 0.7403682395815849, "num_tokens": 10300168.0, "step": 12802 }, { "epoch": 3.3908898305084745, "grad_norm": 1.8133540153503418, "learning_rate": 8.304687500000001e-06, "loss": 1.7693, "mean_token_accuracy": 0.6256222203373909, "num_tokens": 10301654.0, "step": 12804 }, { "epoch": 3.391419491525424, "grad_norm": 1.784037470817566, "learning_rate": 8.304422669491526e-06, "loss": 1.3122, "mean_token_accuracy": 0.7076726257801056, "num_tokens": 10303246.0, "step": 12806 }, { "epoch": 3.391949152542373, "grad_norm": 1.9123578071594238, "learning_rate": 8.304157838983051e-06, "loss": 1.1319, "mean_token_accuracy": 0.7269704043865204, "num_tokens": 10304712.0, "step": 12808 }, { "epoch": 3.392478813559322, "grad_norm": 1.757908582687378, "learning_rate": 8.303893008474578e-06, "loss": 0.871, "mean_token_accuracy": 0.7822316884994507, "num_tokens": 10306129.0, "step": 12810 }, { "epoch": 3.393008474576271, "grad_norm": 1.9725422859191895, "learning_rate": 8.303628177966103e-06, "loss": 1.5088, "mean_token_accuracy": 0.6556626856327057, "num_tokens": 10307543.0, "step": 12812 }, { "epoch": 3.39353813559322, "grad_norm": 1.6458042860031128, "learning_rate": 8.303363347457627e-06, "loss": 0.9251, "mean_token_accuracy": 0.7692833617329597, "num_tokens": 10309146.0, "step": 12814 }, { "epoch": 3.3940677966101696, "grad_norm": 1.8893332481384277, "learning_rate": 8.303098516949152e-06, "loss": 1.2951, "mean_token_accuracy": 0.6994799003005028, "num_tokens": 10310890.0, "step": 12816 }, { "epoch": 3.3945974576271185, "grad_norm": 1.8328675031661987, "learning_rate": 8.302833686440679e-06, "loss": 1.6152, "mean_token_accuracy": 0.6467064693570137, "num_tokens": 10312524.0, "step": 12818 }, { "epoch": 3.395127118644068, "grad_norm": 1.9249953031539917, "learning_rate": 8.302568855932204e-06, "loss": 1.6328, "mean_token_accuracy": 0.6346104368567467, "num_tokens": 10314218.0, "step": 12820 }, { "epoch": 3.395656779661017, "grad_norm": 1.497555136680603, "learning_rate": 8.302304025423729e-06, "loss": 0.9126, "mean_token_accuracy": 0.7781171277165413, "num_tokens": 10315895.0, "step": 12822 }, { "epoch": 3.3961864406779663, "grad_norm": 2.6776275634765625, "learning_rate": 8.302039194915254e-06, "loss": 1.7376, "mean_token_accuracy": 0.6413085609674454, "num_tokens": 10317106.0, "step": 12824 }, { "epoch": 3.3967161016949152, "grad_norm": 2.462104558944702, "learning_rate": 8.30177436440678e-06, "loss": 1.227, "mean_token_accuracy": 0.690782755613327, "num_tokens": 10318535.0, "step": 12826 }, { "epoch": 3.397245762711864, "grad_norm": 1.6709389686584473, "learning_rate": 8.301509533898305e-06, "loss": 1.2858, "mean_token_accuracy": 0.6969425566494465, "num_tokens": 10320213.0, "step": 12828 }, { "epoch": 3.3977754237288136, "grad_norm": 1.7814583778381348, "learning_rate": 8.301244703389832e-06, "loss": 1.1397, "mean_token_accuracy": 0.7294984236359596, "num_tokens": 10322006.0, "step": 12830 }, { "epoch": 3.3983050847457625, "grad_norm": 2.4718055725097656, "learning_rate": 8.300979872881357e-06, "loss": 1.3235, "mean_token_accuracy": 0.7282906919717789, "num_tokens": 10324239.0, "step": 12832 }, { "epoch": 3.398834745762712, "grad_norm": 1.6135159730911255, "learning_rate": 8.300715042372882e-06, "loss": 1.3867, "mean_token_accuracy": 0.6702160909771919, "num_tokens": 10325844.0, "step": 12834 }, { "epoch": 3.399364406779661, "grad_norm": 2.2447493076324463, "learning_rate": 8.300450211864407e-06, "loss": 1.4083, "mean_token_accuracy": 0.670173205435276, "num_tokens": 10327286.0, "step": 12836 }, { "epoch": 3.3998940677966103, "grad_norm": 2.313483476638794, "learning_rate": 8.300185381355933e-06, "loss": 1.4598, "mean_token_accuracy": 0.6907989978790283, "num_tokens": 10328903.0, "step": 12838 }, { "epoch": 3.4004237288135593, "grad_norm": 1.6120729446411133, "learning_rate": 8.299920550847458e-06, "loss": 1.4057, "mean_token_accuracy": 0.6665667742490768, "num_tokens": 10330695.0, "step": 12840 }, { "epoch": 3.4009533898305087, "grad_norm": 1.8532156944274902, "learning_rate": 8.299655720338983e-06, "loss": 1.582, "mean_token_accuracy": 0.6407218053936958, "num_tokens": 10332667.0, "step": 12842 }, { "epoch": 3.4014830508474576, "grad_norm": 1.5431312322616577, "learning_rate": 8.299390889830508e-06, "loss": 1.2283, "mean_token_accuracy": 0.7321295216679573, "num_tokens": 10334333.0, "step": 12844 }, { "epoch": 3.4020127118644066, "grad_norm": 1.523193359375, "learning_rate": 8.299126059322035e-06, "loss": 0.8318, "mean_token_accuracy": 0.785195454955101, "num_tokens": 10336181.0, "step": 12846 }, { "epoch": 3.402542372881356, "grad_norm": 1.6466032266616821, "learning_rate": 8.29886122881356e-06, "loss": 1.3681, "mean_token_accuracy": 0.6770189329981804, "num_tokens": 10337755.0, "step": 12848 }, { "epoch": 3.403072033898305, "grad_norm": 2.4442367553710938, "learning_rate": 8.298596398305085e-06, "loss": 1.4727, "mean_token_accuracy": 0.6711128354072571, "num_tokens": 10339181.0, "step": 12850 }, { "epoch": 3.4036016949152543, "grad_norm": 1.8355960845947266, "learning_rate": 8.29833156779661e-06, "loss": 1.2927, "mean_token_accuracy": 0.6977462023496628, "num_tokens": 10340769.0, "step": 12852 }, { "epoch": 3.4041313559322033, "grad_norm": 2.209930181503296, "learning_rate": 8.298066737288136e-06, "loss": 1.4821, "mean_token_accuracy": 0.6827152743935585, "num_tokens": 10342354.0, "step": 12854 }, { "epoch": 3.4046610169491527, "grad_norm": 2.0208046436309814, "learning_rate": 8.297801906779663e-06, "loss": 1.454, "mean_token_accuracy": 0.6831777319312096, "num_tokens": 10344037.0, "step": 12856 }, { "epoch": 3.4051906779661016, "grad_norm": 1.7353442907333374, "learning_rate": 8.297537076271188e-06, "loss": 1.2672, "mean_token_accuracy": 0.717613123357296, "num_tokens": 10345559.0, "step": 12858 }, { "epoch": 3.405720338983051, "grad_norm": 1.9350028038024902, "learning_rate": 8.297272245762713e-06, "loss": 1.3212, "mean_token_accuracy": 0.6952374503016472, "num_tokens": 10347031.0, "step": 12860 }, { "epoch": 3.40625, "grad_norm": 2.220304489135742, "learning_rate": 8.297007415254238e-06, "loss": 1.4071, "mean_token_accuracy": 0.682589054107666, "num_tokens": 10348463.0, "step": 12862 }, { "epoch": 3.406779661016949, "grad_norm": 1.7415367364883423, "learning_rate": 8.296742584745764e-06, "loss": 1.3057, "mean_token_accuracy": 0.7046136409044266, "num_tokens": 10350149.0, "step": 12864 }, { "epoch": 3.4073093220338984, "grad_norm": 1.820966362953186, "learning_rate": 8.29647775423729e-06, "loss": 1.274, "mean_token_accuracy": 0.6869437471032143, "num_tokens": 10351686.0, "step": 12866 }, { "epoch": 3.4078389830508473, "grad_norm": 1.9315211772918701, "learning_rate": 8.296212923728814e-06, "loss": 1.0663, "mean_token_accuracy": 0.7368319183588028, "num_tokens": 10353097.0, "step": 12868 }, { "epoch": 3.4083686440677967, "grad_norm": 1.9366023540496826, "learning_rate": 8.295948093220339e-06, "loss": 1.1944, "mean_token_accuracy": 0.7131597064435482, "num_tokens": 10354973.0, "step": 12870 }, { "epoch": 3.4088983050847457, "grad_norm": 2.050577163696289, "learning_rate": 8.295683262711866e-06, "loss": 1.3956, "mean_token_accuracy": 0.6972233057022095, "num_tokens": 10356566.0, "step": 12872 }, { "epoch": 3.409427966101695, "grad_norm": 1.7317782640457153, "learning_rate": 8.29541843220339e-06, "loss": 1.2077, "mean_token_accuracy": 0.7165325656533241, "num_tokens": 10358363.0, "step": 12874 }, { "epoch": 3.409957627118644, "grad_norm": 1.7330846786499023, "learning_rate": 8.295153601694916e-06, "loss": 1.4616, "mean_token_accuracy": 0.6744001284241676, "num_tokens": 10360029.0, "step": 12876 }, { "epoch": 3.4104872881355934, "grad_norm": 2.171806573867798, "learning_rate": 8.29488877118644e-06, "loss": 1.4436, "mean_token_accuracy": 0.692703403532505, "num_tokens": 10361402.0, "step": 12878 }, { "epoch": 3.4110169491525424, "grad_norm": 1.6464639902114868, "learning_rate": 8.294623940677967e-06, "loss": 0.9942, "mean_token_accuracy": 0.7412682771682739, "num_tokens": 10363182.0, "step": 12880 }, { "epoch": 3.4115466101694913, "grad_norm": 1.7028712034225464, "learning_rate": 8.294359110169492e-06, "loss": 1.0719, "mean_token_accuracy": 0.7385574504733086, "num_tokens": 10364837.0, "step": 12882 }, { "epoch": 3.4120762711864407, "grad_norm": 2.103191614151001, "learning_rate": 8.294094279661019e-06, "loss": 1.418, "mean_token_accuracy": 0.6713576838374138, "num_tokens": 10366411.0, "step": 12884 }, { "epoch": 3.4126059322033897, "grad_norm": 1.6654508113861084, "learning_rate": 8.293829449152544e-06, "loss": 1.2439, "mean_token_accuracy": 0.720228835940361, "num_tokens": 10368384.0, "step": 12886 }, { "epoch": 3.413135593220339, "grad_norm": 2.1510164737701416, "learning_rate": 8.293564618644068e-06, "loss": 1.6511, "mean_token_accuracy": 0.6416747868061066, "num_tokens": 10370381.0, "step": 12888 }, { "epoch": 3.413665254237288, "grad_norm": 1.921846628189087, "learning_rate": 8.293299788135593e-06, "loss": 1.3315, "mean_token_accuracy": 0.6790933385491371, "num_tokens": 10372171.0, "step": 12890 }, { "epoch": 3.4141949152542375, "grad_norm": 1.8870439529418945, "learning_rate": 8.29303495762712e-06, "loss": 1.402, "mean_token_accuracy": 0.6899830102920532, "num_tokens": 10373570.0, "step": 12892 }, { "epoch": 3.4147245762711864, "grad_norm": 2.2609994411468506, "learning_rate": 8.292770127118645e-06, "loss": 1.5188, "mean_token_accuracy": 0.6532096713781357, "num_tokens": 10374963.0, "step": 12894 }, { "epoch": 3.415254237288136, "grad_norm": 1.9240362644195557, "learning_rate": 8.29250529661017e-06, "loss": 1.0485, "mean_token_accuracy": 0.7522028125822544, "num_tokens": 10376419.0, "step": 12896 }, { "epoch": 3.4157838983050848, "grad_norm": 1.581980586051941, "learning_rate": 8.292240466101695e-06, "loss": 1.244, "mean_token_accuracy": 0.7143819108605385, "num_tokens": 10377897.0, "step": 12898 }, { "epoch": 3.4163135593220337, "grad_norm": 1.5757213830947876, "learning_rate": 8.291975635593221e-06, "loss": 1.0913, "mean_token_accuracy": 0.7541089989244938, "num_tokens": 10379672.0, "step": 12900 }, { "epoch": 3.416843220338983, "grad_norm": 2.0283586978912354, "learning_rate": 8.291710805084746e-06, "loss": 1.2554, "mean_token_accuracy": 0.7187947258353233, "num_tokens": 10381125.0, "step": 12902 }, { "epoch": 3.417372881355932, "grad_norm": 1.6834996938705444, "learning_rate": 8.291445974576271e-06, "loss": 1.2241, "mean_token_accuracy": 0.7181835025548935, "num_tokens": 10382761.0, "step": 12904 }, { "epoch": 3.4179025423728815, "grad_norm": 1.5902968645095825, "learning_rate": 8.291181144067796e-06, "loss": 1.0591, "mean_token_accuracy": 0.7815548181533813, "num_tokens": 10384227.0, "step": 12906 }, { "epoch": 3.4184322033898304, "grad_norm": 1.8879728317260742, "learning_rate": 8.290916313559323e-06, "loss": 1.4557, "mean_token_accuracy": 0.6866942346096039, "num_tokens": 10385565.0, "step": 12908 }, { "epoch": 3.41896186440678, "grad_norm": 1.8632731437683105, "learning_rate": 8.290651483050848e-06, "loss": 1.0333, "mean_token_accuracy": 0.7436107322573662, "num_tokens": 10386880.0, "step": 12910 }, { "epoch": 3.419491525423729, "grad_norm": 1.5336984395980835, "learning_rate": 8.290386652542374e-06, "loss": 0.9655, "mean_token_accuracy": 0.7741387709975243, "num_tokens": 10388353.0, "step": 12912 }, { "epoch": 3.420021186440678, "grad_norm": 2.0633256435394287, "learning_rate": 8.2901218220339e-06, "loss": 1.1282, "mean_token_accuracy": 0.7395230531692505, "num_tokens": 10389842.0, "step": 12914 }, { "epoch": 3.420550847457627, "grad_norm": 1.9227975606918335, "learning_rate": 8.289856991525424e-06, "loss": 1.3825, "mean_token_accuracy": 0.7009122967720032, "num_tokens": 10391317.0, "step": 12916 }, { "epoch": 3.421080508474576, "grad_norm": 1.7882381677627563, "learning_rate": 8.28959216101695e-06, "loss": 1.1197, "mean_token_accuracy": 0.7386717721819878, "num_tokens": 10392861.0, "step": 12918 }, { "epoch": 3.4216101694915255, "grad_norm": 1.9017318487167358, "learning_rate": 8.289327330508476e-06, "loss": 1.3442, "mean_token_accuracy": 0.6970523223280907, "num_tokens": 10394275.0, "step": 12920 }, { "epoch": 3.4221398305084745, "grad_norm": 2.5325539112091064, "learning_rate": 8.2890625e-06, "loss": 1.3482, "mean_token_accuracy": 0.7062062844634056, "num_tokens": 10395754.0, "step": 12922 }, { "epoch": 3.422669491525424, "grad_norm": 1.8124326467514038, "learning_rate": 8.288797669491526e-06, "loss": 1.0739, "mean_token_accuracy": 0.7506642043590546, "num_tokens": 10397483.0, "step": 12924 }, { "epoch": 3.423199152542373, "grad_norm": 2.0346765518188477, "learning_rate": 8.28853283898305e-06, "loss": 1.1056, "mean_token_accuracy": 0.7345326840877533, "num_tokens": 10399184.0, "step": 12926 }, { "epoch": 3.423728813559322, "grad_norm": 2.019791603088379, "learning_rate": 8.288268008474577e-06, "loss": 1.3898, "mean_token_accuracy": 0.7124179527163506, "num_tokens": 10400658.0, "step": 12928 }, { "epoch": 3.424258474576271, "grad_norm": 1.74848473072052, "learning_rate": 8.288003177966102e-06, "loss": 1.2171, "mean_token_accuracy": 0.7466215379536152, "num_tokens": 10402016.0, "step": 12930 }, { "epoch": 3.42478813559322, "grad_norm": 1.97738778591156, "learning_rate": 8.287738347457627e-06, "loss": 1.1734, "mean_token_accuracy": 0.7293604090809822, "num_tokens": 10403509.0, "step": 12932 }, { "epoch": 3.4253177966101696, "grad_norm": 2.3548812866210938, "learning_rate": 8.287473516949152e-06, "loss": 1.3092, "mean_token_accuracy": 0.7186909690499306, "num_tokens": 10404805.0, "step": 12934 }, { "epoch": 3.4258474576271185, "grad_norm": 1.8794548511505127, "learning_rate": 8.287208686440679e-06, "loss": 0.7417, "mean_token_accuracy": 0.8173755183815956, "num_tokens": 10406373.0, "step": 12936 }, { "epoch": 3.426377118644068, "grad_norm": 2.5365676879882812, "learning_rate": 8.286943855932204e-06, "loss": 1.5461, "mean_token_accuracy": 0.6599818617105484, "num_tokens": 10407863.0, "step": 12938 }, { "epoch": 3.426906779661017, "grad_norm": 2.5435094833374023, "learning_rate": 8.28667902542373e-06, "loss": 1.0559, "mean_token_accuracy": 0.7655859589576721, "num_tokens": 10409279.0, "step": 12940 }, { "epoch": 3.4274364406779663, "grad_norm": 1.9913419485092163, "learning_rate": 8.286414194915255e-06, "loss": 1.3674, "mean_token_accuracy": 0.6982802152633667, "num_tokens": 10410798.0, "step": 12942 }, { "epoch": 3.4279661016949152, "grad_norm": 2.140856981277466, "learning_rate": 8.28614936440678e-06, "loss": 1.7648, "mean_token_accuracy": 0.6277670040726662, "num_tokens": 10412103.0, "step": 12944 }, { "epoch": 3.428495762711864, "grad_norm": 1.6180659532546997, "learning_rate": 8.285884533898307e-06, "loss": 1.0545, "mean_token_accuracy": 0.7449265941977501, "num_tokens": 10413882.0, "step": 12946 }, { "epoch": 3.4290254237288136, "grad_norm": 1.6515716314315796, "learning_rate": 8.285619703389832e-06, "loss": 1.08, "mean_token_accuracy": 0.7330101430416107, "num_tokens": 10415538.0, "step": 12948 }, { "epoch": 3.4295550847457625, "grad_norm": 2.0118188858032227, "learning_rate": 8.285354872881357e-06, "loss": 1.5801, "mean_token_accuracy": 0.6450097039341927, "num_tokens": 10417044.0, "step": 12950 }, { "epoch": 3.430084745762712, "grad_norm": 2.061330556869507, "learning_rate": 8.285090042372881e-06, "loss": 1.3457, "mean_token_accuracy": 0.6934249550104141, "num_tokens": 10418684.0, "step": 12952 }, { "epoch": 3.430614406779661, "grad_norm": 2.0832808017730713, "learning_rate": 8.284825211864408e-06, "loss": 1.8584, "mean_token_accuracy": 0.6200214363634586, "num_tokens": 10420994.0, "step": 12954 }, { "epoch": 3.4311440677966103, "grad_norm": 1.209366798400879, "learning_rate": 8.284560381355933e-06, "loss": 1.2923, "mean_token_accuracy": 0.6914543509483337, "num_tokens": 10423213.0, "step": 12956 }, { "epoch": 3.4316737288135593, "grad_norm": 1.573043704032898, "learning_rate": 8.284295550847458e-06, "loss": 1.2697, "mean_token_accuracy": 0.7236777804791927, "num_tokens": 10425052.0, "step": 12958 }, { "epoch": 3.4322033898305087, "grad_norm": 1.881339430809021, "learning_rate": 8.284030720338983e-06, "loss": 1.1837, "mean_token_accuracy": 0.7328031212091446, "num_tokens": 10426900.0, "step": 12960 }, { "epoch": 3.4327330508474576, "grad_norm": 2.0602705478668213, "learning_rate": 8.28376588983051e-06, "loss": 1.62, "mean_token_accuracy": 0.6582523137331009, "num_tokens": 10428542.0, "step": 12962 }, { "epoch": 3.4332627118644066, "grad_norm": 1.9825770854949951, "learning_rate": 8.283501059322034e-06, "loss": 1.3845, "mean_token_accuracy": 0.6892084404826164, "num_tokens": 10429916.0, "step": 12964 }, { "epoch": 3.433792372881356, "grad_norm": 1.9765254259109497, "learning_rate": 8.283236228813561e-06, "loss": 1.8936, "mean_token_accuracy": 0.6029234267771244, "num_tokens": 10431572.0, "step": 12966 }, { "epoch": 3.434322033898305, "grad_norm": 1.767958641052246, "learning_rate": 8.282971398305086e-06, "loss": 1.1135, "mean_token_accuracy": 0.7341849356889725, "num_tokens": 10433129.0, "step": 12968 }, { "epoch": 3.4348516949152543, "grad_norm": 1.7709577083587646, "learning_rate": 8.282706567796611e-06, "loss": 1.1544, "mean_token_accuracy": 0.7453578487038612, "num_tokens": 10434804.0, "step": 12970 }, { "epoch": 3.4353813559322033, "grad_norm": 1.8287688493728638, "learning_rate": 8.282441737288136e-06, "loss": 0.7919, "mean_token_accuracy": 0.8028540089726448, "num_tokens": 10436253.0, "step": 12972 }, { "epoch": 3.4359110169491527, "grad_norm": 1.7175819873809814, "learning_rate": 8.282176906779662e-06, "loss": 1.0401, "mean_token_accuracy": 0.748590387403965, "num_tokens": 10437946.0, "step": 12974 }, { "epoch": 3.4364406779661016, "grad_norm": 1.606542944908142, "learning_rate": 8.281912076271187e-06, "loss": 1.2112, "mean_token_accuracy": 0.7168659269809723, "num_tokens": 10439426.0, "step": 12976 }, { "epoch": 3.436970338983051, "grad_norm": 1.8140877485275269, "learning_rate": 8.281647245762712e-06, "loss": 1.3164, "mean_token_accuracy": 0.6847967356443405, "num_tokens": 10441168.0, "step": 12978 }, { "epoch": 3.4375, "grad_norm": 2.044790506362915, "learning_rate": 8.281382415254237e-06, "loss": 1.5547, "mean_token_accuracy": 0.6495349779725075, "num_tokens": 10442712.0, "step": 12980 }, { "epoch": 3.438029661016949, "grad_norm": 1.811712384223938, "learning_rate": 8.281117584745764e-06, "loss": 1.2308, "mean_token_accuracy": 0.7125914767384529, "num_tokens": 10444267.0, "step": 12982 }, { "epoch": 3.4385593220338984, "grad_norm": 1.7867751121520996, "learning_rate": 8.280852754237289e-06, "loss": 1.1703, "mean_token_accuracy": 0.7394389733672142, "num_tokens": 10445734.0, "step": 12984 }, { "epoch": 3.4390889830508473, "grad_norm": 2.1751253604888916, "learning_rate": 8.280587923728814e-06, "loss": 0.9257, "mean_token_accuracy": 0.7672459930181503, "num_tokens": 10447145.0, "step": 12986 }, { "epoch": 3.4396186440677967, "grad_norm": 1.792797327041626, "learning_rate": 8.280323093220339e-06, "loss": 1.0277, "mean_token_accuracy": 0.7399735078215599, "num_tokens": 10448610.0, "step": 12988 }, { "epoch": 3.4401483050847457, "grad_norm": 2.0742411613464355, "learning_rate": 8.280058262711865e-06, "loss": 1.5324, "mean_token_accuracy": 0.6922419108450413, "num_tokens": 10450374.0, "step": 12990 }, { "epoch": 3.440677966101695, "grad_norm": 2.268423557281494, "learning_rate": 8.27979343220339e-06, "loss": 1.2713, "mean_token_accuracy": 0.7099012732505798, "num_tokens": 10451847.0, "step": 12992 }, { "epoch": 3.441207627118644, "grad_norm": 1.6364902257919312, "learning_rate": 8.279528601694917e-06, "loss": 1.3575, "mean_token_accuracy": 0.6837932020425797, "num_tokens": 10453744.0, "step": 12994 }, { "epoch": 3.4417372881355934, "grad_norm": 1.8360825777053833, "learning_rate": 8.279263771186442e-06, "loss": 1.129, "mean_token_accuracy": 0.7720850631594658, "num_tokens": 10455264.0, "step": 12996 }, { "epoch": 3.4422669491525424, "grad_norm": 2.0815844535827637, "learning_rate": 8.278998940677967e-06, "loss": 1.2127, "mean_token_accuracy": 0.7106116786599159, "num_tokens": 10456803.0, "step": 12998 }, { "epoch": 3.4427966101694913, "grad_norm": 1.8313878774642944, "learning_rate": 8.278734110169492e-06, "loss": 1.5548, "step": 13000 }, { "epoch": 3.4427966101694913, "eval_loss": 1.310645341873169, "eval_mean_token_accuracy": 0.7017144021856321, "eval_num_tokens": 10458780.0, "eval_runtime": 48.194, "eval_samples_per_second": 6.391, "eval_steps_per_second": 6.391, "step": 13000 }, { "epoch": 3.4433262711864407, "grad_norm": 2.1797068119049072, "learning_rate": 8.278469279661018e-06, "loss": 1.2287, "mean_token_accuracy": 0.6864385455846786, "num_tokens": 10460568.0, "step": 13002 }, { "epoch": 3.4438559322033897, "grad_norm": 1.9431488513946533, "learning_rate": 8.278204449152543e-06, "loss": 0.9652, "mean_token_accuracy": 0.7847785726189613, "num_tokens": 10462134.0, "step": 13004 }, { "epoch": 3.444385593220339, "grad_norm": 1.791777491569519, "learning_rate": 8.277939618644068e-06, "loss": 1.1361, "mean_token_accuracy": 0.7292910143733025, "num_tokens": 10463999.0, "step": 13006 }, { "epoch": 3.444915254237288, "grad_norm": 1.9557585716247559, "learning_rate": 8.277674788135593e-06, "loss": 1.4583, "mean_token_accuracy": 0.689028587192297, "num_tokens": 10465666.0, "step": 13008 }, { "epoch": 3.4454449152542375, "grad_norm": 1.8772448301315308, "learning_rate": 8.27740995762712e-06, "loss": 1.4197, "mean_token_accuracy": 0.6838599219918251, "num_tokens": 10467210.0, "step": 13010 }, { "epoch": 3.4459745762711864, "grad_norm": 2.1045989990234375, "learning_rate": 8.277145127118645e-06, "loss": 1.2661, "mean_token_accuracy": 0.735530436038971, "num_tokens": 10468842.0, "step": 13012 }, { "epoch": 3.446504237288136, "grad_norm": 1.7472889423370361, "learning_rate": 8.27688029661017e-06, "loss": 1.428, "mean_token_accuracy": 0.6908543929457664, "num_tokens": 10470556.0, "step": 13014 }, { "epoch": 3.4470338983050848, "grad_norm": 2.13354229927063, "learning_rate": 8.276615466101694e-06, "loss": 1.276, "mean_token_accuracy": 0.7331805303692818, "num_tokens": 10472019.0, "step": 13016 }, { "epoch": 3.4475635593220337, "grad_norm": 1.1170711517333984, "learning_rate": 8.276350635593221e-06, "loss": 1.0789, "mean_token_accuracy": 0.7446741238236427, "num_tokens": 10474529.0, "step": 13018 }, { "epoch": 3.448093220338983, "grad_norm": 1.7709896564483643, "learning_rate": 8.276085805084746e-06, "loss": 1.1995, "mean_token_accuracy": 0.689132459461689, "num_tokens": 10476893.0, "step": 13020 }, { "epoch": 3.448622881355932, "grad_norm": 2.0504934787750244, "learning_rate": 8.275820974576273e-06, "loss": 1.263, "mean_token_accuracy": 0.6956541463732719, "num_tokens": 10478174.0, "step": 13022 }, { "epoch": 3.4491525423728815, "grad_norm": 1.9140677452087402, "learning_rate": 8.275556144067798e-06, "loss": 1.4296, "mean_token_accuracy": 0.6744173541665077, "num_tokens": 10479750.0, "step": 13024 }, { "epoch": 3.4496822033898304, "grad_norm": 2.0297353267669678, "learning_rate": 8.275291313559322e-06, "loss": 1.5109, "mean_token_accuracy": 0.6557889208197594, "num_tokens": 10481154.0, "step": 13026 }, { "epoch": 3.45021186440678, "grad_norm": 1.5856108665466309, "learning_rate": 8.275026483050849e-06, "loss": 0.7928, "mean_token_accuracy": 0.7903392538428307, "num_tokens": 10482832.0, "step": 13028 }, { "epoch": 3.450741525423729, "grad_norm": 1.8807226419448853, "learning_rate": 8.274761652542374e-06, "loss": 1.0829, "mean_token_accuracy": 0.7314765304327011, "num_tokens": 10484246.0, "step": 13030 }, { "epoch": 3.451271186440678, "grad_norm": 1.916080355644226, "learning_rate": 8.274496822033899e-06, "loss": 1.4491, "mean_token_accuracy": 0.6772411838173866, "num_tokens": 10485865.0, "step": 13032 }, { "epoch": 3.451800847457627, "grad_norm": 1.5636835098266602, "learning_rate": 8.274231991525424e-06, "loss": 1.4142, "mean_token_accuracy": 0.6923833638429642, "num_tokens": 10487621.0, "step": 13034 }, { "epoch": 3.452330508474576, "grad_norm": 2.3098113536834717, "learning_rate": 8.27396716101695e-06, "loss": 1.238, "mean_token_accuracy": 0.7219346687197685, "num_tokens": 10489075.0, "step": 13036 }, { "epoch": 3.4528601694915255, "grad_norm": 1.73994779586792, "learning_rate": 8.273702330508475e-06, "loss": 1.2917, "mean_token_accuracy": 0.7056285813450813, "num_tokens": 10490697.0, "step": 13038 }, { "epoch": 3.4533898305084745, "grad_norm": 2.1585421562194824, "learning_rate": 8.2734375e-06, "loss": 1.1068, "mean_token_accuracy": 0.7261818423867226, "num_tokens": 10492191.0, "step": 13040 }, { "epoch": 3.453919491525424, "grad_norm": 1.6350573301315308, "learning_rate": 8.273172669491525e-06, "loss": 1.0972, "mean_token_accuracy": 0.7453628107905388, "num_tokens": 10493803.0, "step": 13042 }, { "epoch": 3.454449152542373, "grad_norm": 1.6759135723114014, "learning_rate": 8.272907838983052e-06, "loss": 1.1819, "mean_token_accuracy": 0.7193966507911682, "num_tokens": 10495482.0, "step": 13044 }, { "epoch": 3.454978813559322, "grad_norm": 1.9630471467971802, "learning_rate": 8.272643008474577e-06, "loss": 1.457, "mean_token_accuracy": 0.6982052624225616, "num_tokens": 10497215.0, "step": 13046 }, { "epoch": 3.455508474576271, "grad_norm": 1.7841085195541382, "learning_rate": 8.272378177966103e-06, "loss": 1.416, "mean_token_accuracy": 0.7067699022591114, "num_tokens": 10499169.0, "step": 13048 }, { "epoch": 3.45603813559322, "grad_norm": 1.7862619161605835, "learning_rate": 8.272113347457628e-06, "loss": 1.3654, "mean_token_accuracy": 0.6908344179391861, "num_tokens": 10500921.0, "step": 13050 }, { "epoch": 3.4565677966101696, "grad_norm": 1.4877010583877563, "learning_rate": 8.271848516949153e-06, "loss": 1.0815, "mean_token_accuracy": 0.7194997146725655, "num_tokens": 10503027.0, "step": 13052 }, { "epoch": 3.4570974576271185, "grad_norm": 2.077451229095459, "learning_rate": 8.271583686440678e-06, "loss": 1.2686, "mean_token_accuracy": 0.7132699489593506, "num_tokens": 10504470.0, "step": 13054 }, { "epoch": 3.457627118644068, "grad_norm": 1.9655637741088867, "learning_rate": 8.271318855932205e-06, "loss": 1.4948, "mean_token_accuracy": 0.6636830568313599, "num_tokens": 10506002.0, "step": 13056 }, { "epoch": 3.458156779661017, "grad_norm": 1.925148367881775, "learning_rate": 8.27105402542373e-06, "loss": 1.3959, "mean_token_accuracy": 0.6809230670332909, "num_tokens": 10507511.0, "step": 13058 }, { "epoch": 3.4586864406779663, "grad_norm": 1.8341516256332397, "learning_rate": 8.270789194915255e-06, "loss": 1.4709, "mean_token_accuracy": 0.6786301210522652, "num_tokens": 10509003.0, "step": 13060 }, { "epoch": 3.4592161016949152, "grad_norm": 1.7810657024383545, "learning_rate": 8.27052436440678e-06, "loss": 1.6012, "mean_token_accuracy": 0.6290342658758163, "num_tokens": 10510661.0, "step": 13062 }, { "epoch": 3.459745762711864, "grad_norm": 1.5646297931671143, "learning_rate": 8.270259533898306e-06, "loss": 0.9672, "mean_token_accuracy": 0.7521173879504204, "num_tokens": 10512514.0, "step": 13064 }, { "epoch": 3.4602754237288136, "grad_norm": 1.6229146718978882, "learning_rate": 8.269994703389831e-06, "loss": 1.1769, "mean_token_accuracy": 0.7089956700801849, "num_tokens": 10514218.0, "step": 13066 }, { "epoch": 3.4608050847457625, "grad_norm": 1.1782082319259644, "learning_rate": 8.269729872881356e-06, "loss": 1.0595, "mean_token_accuracy": 0.7410941980779171, "num_tokens": 10515977.0, "step": 13068 }, { "epoch": 3.461334745762712, "grad_norm": 1.6824220418930054, "learning_rate": 8.269465042372881e-06, "loss": 1.4102, "mean_token_accuracy": 0.6901431605219841, "num_tokens": 10517745.0, "step": 13070 }, { "epoch": 3.461864406779661, "grad_norm": 1.980832576751709, "learning_rate": 8.269200211864408e-06, "loss": 1.4383, "mean_token_accuracy": 0.693170502781868, "num_tokens": 10519356.0, "step": 13072 }, { "epoch": 3.4623940677966103, "grad_norm": 1.9376530647277832, "learning_rate": 8.268935381355933e-06, "loss": 1.4247, "mean_token_accuracy": 0.663268081843853, "num_tokens": 10521055.0, "step": 13074 }, { "epoch": 3.4629237288135593, "grad_norm": 1.4256597757339478, "learning_rate": 8.26867055084746e-06, "loss": 1.1587, "mean_token_accuracy": 0.7414375469088554, "num_tokens": 10522596.0, "step": 13076 }, { "epoch": 3.4634533898305087, "grad_norm": 1.9609451293945312, "learning_rate": 8.268405720338984e-06, "loss": 1.3328, "mean_token_accuracy": 0.7021761536598206, "num_tokens": 10524102.0, "step": 13078 }, { "epoch": 3.4639830508474576, "grad_norm": 1.7058576345443726, "learning_rate": 8.268140889830509e-06, "loss": 1.2339, "mean_token_accuracy": 0.7134525999426842, "num_tokens": 10525672.0, "step": 13080 }, { "epoch": 3.4645127118644066, "grad_norm": 1.5704489946365356, "learning_rate": 8.267876059322034e-06, "loss": 0.967, "mean_token_accuracy": 0.7706740275025368, "num_tokens": 10527206.0, "step": 13082 }, { "epoch": 3.465042372881356, "grad_norm": 1.6518962383270264, "learning_rate": 8.26761122881356e-06, "loss": 1.3589, "mean_token_accuracy": 0.6828956305980682, "num_tokens": 10528997.0, "step": 13084 }, { "epoch": 3.465572033898305, "grad_norm": 1.695525884628296, "learning_rate": 8.267346398305086e-06, "loss": 1.3131, "mean_token_accuracy": 0.6826454699039459, "num_tokens": 10530720.0, "step": 13086 }, { "epoch": 3.4661016949152543, "grad_norm": 1.864788293838501, "learning_rate": 8.26708156779661e-06, "loss": 1.1783, "mean_token_accuracy": 0.7372708469629288, "num_tokens": 10532245.0, "step": 13088 }, { "epoch": 3.4666313559322033, "grad_norm": 1.5929676294326782, "learning_rate": 8.266816737288135e-06, "loss": 1.2608, "mean_token_accuracy": 0.6951405853033066, "num_tokens": 10534127.0, "step": 13090 }, { "epoch": 3.4671610169491527, "grad_norm": 1.6114022731781006, "learning_rate": 8.266551906779662e-06, "loss": 1.4038, "mean_token_accuracy": 0.6600561812520027, "num_tokens": 10535921.0, "step": 13092 }, { "epoch": 3.4676906779661016, "grad_norm": 1.4328635931015015, "learning_rate": 8.266287076271187e-06, "loss": 0.8551, "mean_token_accuracy": 0.7881293967366219, "num_tokens": 10537558.0, "step": 13094 }, { "epoch": 3.468220338983051, "grad_norm": 1.8775405883789062, "learning_rate": 8.266022245762712e-06, "loss": 1.1869, "mean_token_accuracy": 0.7078190222382545, "num_tokens": 10539210.0, "step": 13096 }, { "epoch": 3.46875, "grad_norm": 1.8840502500534058, "learning_rate": 8.265757415254237e-06, "loss": 1.1257, "mean_token_accuracy": 0.7219685390591621, "num_tokens": 10540389.0, "step": 13098 }, { "epoch": 3.469279661016949, "grad_norm": 1.7458263635635376, "learning_rate": 8.265492584745763e-06, "loss": 1.5083, "mean_token_accuracy": 0.6538460776209831, "num_tokens": 10541958.0, "step": 13100 }, { "epoch": 3.4698093220338984, "grad_norm": 1.434319257736206, "learning_rate": 8.265227754237288e-06, "loss": 1.1028, "mean_token_accuracy": 0.7384070530533791, "num_tokens": 10543822.0, "step": 13102 }, { "epoch": 3.4703389830508473, "grad_norm": 2.1678173542022705, "learning_rate": 8.264962923728815e-06, "loss": 1.3209, "mean_token_accuracy": 0.7069011181592941, "num_tokens": 10545470.0, "step": 13104 }, { "epoch": 3.4708686440677967, "grad_norm": 1.8587511777877808, "learning_rate": 8.26469809322034e-06, "loss": 1.3228, "mean_token_accuracy": 0.7001443728804588, "num_tokens": 10546953.0, "step": 13106 }, { "epoch": 3.4713983050847457, "grad_norm": 1.8374366760253906, "learning_rate": 8.264433262711865e-06, "loss": 1.3409, "mean_token_accuracy": 0.6945038810372353, "num_tokens": 10548625.0, "step": 13108 }, { "epoch": 3.471927966101695, "grad_norm": 1.9000085592269897, "learning_rate": 8.264168432203392e-06, "loss": 1.1589, "mean_token_accuracy": 0.7574789598584175, "num_tokens": 10550187.0, "step": 13110 }, { "epoch": 3.472457627118644, "grad_norm": 2.0528340339660645, "learning_rate": 8.263903601694916e-06, "loss": 1.2273, "mean_token_accuracy": 0.7350626066327095, "num_tokens": 10551840.0, "step": 13112 }, { "epoch": 3.4729872881355934, "grad_norm": 1.9005753993988037, "learning_rate": 8.263638771186441e-06, "loss": 1.3469, "mean_token_accuracy": 0.6895119547843933, "num_tokens": 10553326.0, "step": 13114 }, { "epoch": 3.4735169491525424, "grad_norm": 1.674780011177063, "learning_rate": 8.263373940677966e-06, "loss": 1.2782, "mean_token_accuracy": 0.7114256732165813, "num_tokens": 10555171.0, "step": 13116 }, { "epoch": 3.4740466101694913, "grad_norm": 2.0294220447540283, "learning_rate": 8.263109110169493e-06, "loss": 1.2998, "mean_token_accuracy": 0.692367821931839, "num_tokens": 10556814.0, "step": 13118 }, { "epoch": 3.4745762711864407, "grad_norm": 2.0775911808013916, "learning_rate": 8.262844279661018e-06, "loss": 0.8831, "mean_token_accuracy": 0.7753245532512665, "num_tokens": 10558121.0, "step": 13120 }, { "epoch": 3.4751059322033897, "grad_norm": 2.046546697616577, "learning_rate": 8.262579449152543e-06, "loss": 1.286, "mean_token_accuracy": 0.7122275233268738, "num_tokens": 10559695.0, "step": 13122 }, { "epoch": 3.475635593220339, "grad_norm": 1.7030961513519287, "learning_rate": 8.262314618644068e-06, "loss": 1.2217, "mean_token_accuracy": 0.7158290147781372, "num_tokens": 10561457.0, "step": 13124 }, { "epoch": 3.476165254237288, "grad_norm": 1.680734395980835, "learning_rate": 8.262049788135594e-06, "loss": 1.3311, "mean_token_accuracy": 0.676736369729042, "num_tokens": 10563395.0, "step": 13126 }, { "epoch": 3.4766949152542375, "grad_norm": 2.5306007862091064, "learning_rate": 8.26178495762712e-06, "loss": 1.3317, "mean_token_accuracy": 0.7035850137472153, "num_tokens": 10564903.0, "step": 13128 }, { "epoch": 3.4772245762711864, "grad_norm": 1.7181117534637451, "learning_rate": 8.261520127118646e-06, "loss": 1.3045, "mean_token_accuracy": 0.693541869521141, "num_tokens": 10566522.0, "step": 13130 }, { "epoch": 3.477754237288136, "grad_norm": 2.1028172969818115, "learning_rate": 8.26125529661017e-06, "loss": 1.7469, "mean_token_accuracy": 0.6361057087779045, "num_tokens": 10568833.0, "step": 13132 }, { "epoch": 3.4782838983050848, "grad_norm": 1.922373652458191, "learning_rate": 8.260990466101696e-06, "loss": 1.0889, "mean_token_accuracy": 0.7468216493725777, "num_tokens": 10570177.0, "step": 13134 }, { "epoch": 3.4788135593220337, "grad_norm": 1.7479944229125977, "learning_rate": 8.26072563559322e-06, "loss": 1.2341, "mean_token_accuracy": 0.71548692882061, "num_tokens": 10571627.0, "step": 13136 }, { "epoch": 3.479343220338983, "grad_norm": 1.7807484865188599, "learning_rate": 8.260460805084747e-06, "loss": 1.2427, "mean_token_accuracy": 0.7171688079833984, "num_tokens": 10573279.0, "step": 13138 }, { "epoch": 3.479872881355932, "grad_norm": 1.6197677850723267, "learning_rate": 8.260195974576272e-06, "loss": 1.6914, "mean_token_accuracy": 0.6285839080810547, "num_tokens": 10575421.0, "step": 13140 }, { "epoch": 3.4804025423728815, "grad_norm": 1.6174237728118896, "learning_rate": 8.259931144067797e-06, "loss": 1.1769, "mean_token_accuracy": 0.7021891474723816, "num_tokens": 10576986.0, "step": 13142 }, { "epoch": 3.4809322033898304, "grad_norm": 1.6648008823394775, "learning_rate": 8.259666313559322e-06, "loss": 1.4683, "mean_token_accuracy": 0.6553217247128487, "num_tokens": 10578830.0, "step": 13144 }, { "epoch": 3.48146186440678, "grad_norm": 2.151860237121582, "learning_rate": 8.259401483050849e-06, "loss": 1.8811, "mean_token_accuracy": 0.6043797805905342, "num_tokens": 10580506.0, "step": 13146 }, { "epoch": 3.481991525423729, "grad_norm": 1.595218539237976, "learning_rate": 8.259136652542374e-06, "loss": 1.2151, "mean_token_accuracy": 0.725726343691349, "num_tokens": 10582081.0, "step": 13148 }, { "epoch": 3.482521186440678, "grad_norm": 1.6062264442443848, "learning_rate": 8.258871822033899e-06, "loss": 1.143, "mean_token_accuracy": 0.7388476058840752, "num_tokens": 10583692.0, "step": 13150 }, { "epoch": 3.483050847457627, "grad_norm": 1.7076385021209717, "learning_rate": 8.258606991525423e-06, "loss": 1.281, "mean_token_accuracy": 0.7181855365633965, "num_tokens": 10585278.0, "step": 13152 }, { "epoch": 3.483580508474576, "grad_norm": 1.678084135055542, "learning_rate": 8.25834216101695e-06, "loss": 1.1918, "mean_token_accuracy": 0.728847436606884, "num_tokens": 10586802.0, "step": 13154 }, { "epoch": 3.4841101694915255, "grad_norm": 2.5606698989868164, "learning_rate": 8.258077330508475e-06, "loss": 1.2488, "mean_token_accuracy": 0.6964931860566139, "num_tokens": 10588418.0, "step": 13156 }, { "epoch": 3.4846398305084745, "grad_norm": 1.9822635650634766, "learning_rate": 8.257812500000002e-06, "loss": 1.5188, "mean_token_accuracy": 0.6643277406692505, "num_tokens": 10589931.0, "step": 13158 }, { "epoch": 3.485169491525424, "grad_norm": 2.2545173168182373, "learning_rate": 8.257547669491527e-06, "loss": 1.5722, "mean_token_accuracy": 0.6613527983427048, "num_tokens": 10591280.0, "step": 13160 }, { "epoch": 3.485699152542373, "grad_norm": 1.8617291450500488, "learning_rate": 8.257282838983052e-06, "loss": 1.4206, "mean_token_accuracy": 0.67804966121912, "num_tokens": 10593009.0, "step": 13162 }, { "epoch": 3.486228813559322, "grad_norm": 1.8567509651184082, "learning_rate": 8.257018008474576e-06, "loss": 0.8456, "mean_token_accuracy": 0.7922595739364624, "num_tokens": 10594518.0, "step": 13164 }, { "epoch": 3.486758474576271, "grad_norm": 1.9506220817565918, "learning_rate": 8.256753177966103e-06, "loss": 1.3473, "mean_token_accuracy": 0.7025995030999184, "num_tokens": 10596147.0, "step": 13166 }, { "epoch": 3.48728813559322, "grad_norm": 1.8258060216903687, "learning_rate": 8.256488347457628e-06, "loss": 0.9018, "mean_token_accuracy": 0.7743038535118103, "num_tokens": 10597697.0, "step": 13168 }, { "epoch": 3.4878177966101696, "grad_norm": 2.1038784980773926, "learning_rate": 8.256223516949153e-06, "loss": 1.3296, "mean_token_accuracy": 0.6894765198230743, "num_tokens": 10599463.0, "step": 13170 }, { "epoch": 3.4883474576271185, "grad_norm": 2.148909330368042, "learning_rate": 8.255958686440678e-06, "loss": 1.5934, "mean_token_accuracy": 0.641620971262455, "num_tokens": 10601173.0, "step": 13172 }, { "epoch": 3.488877118644068, "grad_norm": 1.911750078201294, "learning_rate": 8.255693855932204e-06, "loss": 1.2842, "mean_token_accuracy": 0.7215376570820808, "num_tokens": 10602593.0, "step": 13174 }, { "epoch": 3.489406779661017, "grad_norm": 1.848777413368225, "learning_rate": 8.25542902542373e-06, "loss": 1.2977, "mean_token_accuracy": 0.6981778740882874, "num_tokens": 10604229.0, "step": 13176 }, { "epoch": 3.4899364406779663, "grad_norm": 1.8905740976333618, "learning_rate": 8.255164194915254e-06, "loss": 1.023, "mean_token_accuracy": 0.73322743922472, "num_tokens": 10605918.0, "step": 13178 }, { "epoch": 3.4904661016949152, "grad_norm": 1.888792634010315, "learning_rate": 8.25489936440678e-06, "loss": 1.3509, "mean_token_accuracy": 0.7299085333943367, "num_tokens": 10607342.0, "step": 13180 }, { "epoch": 3.490995762711864, "grad_norm": 1.7856720685958862, "learning_rate": 8.254634533898306e-06, "loss": 1.5758, "mean_token_accuracy": 0.6506693512201309, "num_tokens": 10609025.0, "step": 13182 }, { "epoch": 3.4915254237288136, "grad_norm": 2.1392009258270264, "learning_rate": 8.25436970338983e-06, "loss": 1.5966, "mean_token_accuracy": 0.6595222502946854, "num_tokens": 10610559.0, "step": 13184 }, { "epoch": 3.4920550847457625, "grad_norm": 2.224156618118286, "learning_rate": 8.254104872881357e-06, "loss": 1.4601, "mean_token_accuracy": 0.6837983727455139, "num_tokens": 10611939.0, "step": 13186 }, { "epoch": 3.492584745762712, "grad_norm": 1.716469645500183, "learning_rate": 8.25384004237288e-06, "loss": 1.2467, "mean_token_accuracy": 0.6873857602477074, "num_tokens": 10613785.0, "step": 13188 }, { "epoch": 3.493114406779661, "grad_norm": 1.8826899528503418, "learning_rate": 8.253575211864407e-06, "loss": 1.3703, "mean_token_accuracy": 0.6855285614728928, "num_tokens": 10615240.0, "step": 13190 }, { "epoch": 3.4936440677966103, "grad_norm": 1.6588828563690186, "learning_rate": 8.253310381355934e-06, "loss": 1.2386, "mean_token_accuracy": 0.7069371752440929, "num_tokens": 10617021.0, "step": 13192 }, { "epoch": 3.4941737288135593, "grad_norm": 1.7519311904907227, "learning_rate": 8.253045550847459e-06, "loss": 0.8453, "mean_token_accuracy": 0.7765101715922356, "num_tokens": 10618509.0, "step": 13194 }, { "epoch": 3.4947033898305087, "grad_norm": 2.241044521331787, "learning_rate": 8.252780720338984e-06, "loss": 1.9216, "mean_token_accuracy": 0.5952250212430954, "num_tokens": 10620235.0, "step": 13196 }, { "epoch": 3.4952330508474576, "grad_norm": 2.052398204803467, "learning_rate": 8.252515889830509e-06, "loss": 1.3546, "mean_token_accuracy": 0.6959373280405998, "num_tokens": 10621745.0, "step": 13198 }, { "epoch": 3.4957627118644066, "grad_norm": 2.1458499431610107, "learning_rate": 8.252251059322035e-06, "loss": 1.4528, "mean_token_accuracy": 0.6762711778283119, "num_tokens": 10623290.0, "step": 13200 }, { "epoch": 3.496292372881356, "grad_norm": 1.7890734672546387, "learning_rate": 8.25198622881356e-06, "loss": 0.939, "mean_token_accuracy": 0.7705183625221252, "num_tokens": 10624450.0, "step": 13202 }, { "epoch": 3.496822033898305, "grad_norm": 1.7395180463790894, "learning_rate": 8.251721398305085e-06, "loss": 1.1824, "mean_token_accuracy": 0.7332126945257187, "num_tokens": 10625875.0, "step": 13204 }, { "epoch": 3.4973516949152543, "grad_norm": 1.753780722618103, "learning_rate": 8.25145656779661e-06, "loss": 1.1241, "mean_token_accuracy": 0.7246500849723816, "num_tokens": 10627213.0, "step": 13206 }, { "epoch": 3.4978813559322033, "grad_norm": 1.528324007987976, "learning_rate": 8.251191737288137e-06, "loss": 1.2307, "mean_token_accuracy": 0.6930754706263542, "num_tokens": 10628911.0, "step": 13208 }, { "epoch": 3.4984110169491527, "grad_norm": 1.9074091911315918, "learning_rate": 8.250926906779662e-06, "loss": 1.4145, "mean_token_accuracy": 0.6987354308366776, "num_tokens": 10630495.0, "step": 13210 }, { "epoch": 3.4989406779661016, "grad_norm": 2.1220438480377197, "learning_rate": 8.250662076271188e-06, "loss": 1.4894, "mean_token_accuracy": 0.6699493527412415, "num_tokens": 10631846.0, "step": 13212 }, { "epoch": 3.499470338983051, "grad_norm": 1.969866394996643, "learning_rate": 8.250397245762713e-06, "loss": 1.3206, "mean_token_accuracy": 0.7109638750553131, "num_tokens": 10633408.0, "step": 13214 }, { "epoch": 3.5, "grad_norm": 1.8863699436187744, "learning_rate": 8.250132415254238e-06, "loss": 1.3151, "mean_token_accuracy": 0.7027370110154152, "num_tokens": 10634831.0, "step": 13216 }, { "epoch": 3.500529661016949, "grad_norm": 1.6021132469177246, "learning_rate": 8.249867584745763e-06, "loss": 0.8374, "mean_token_accuracy": 0.7557870149612427, "num_tokens": 10636320.0, "step": 13218 }, { "epoch": 3.5010593220338984, "grad_norm": 2.3104641437530518, "learning_rate": 8.24960275423729e-06, "loss": 1.5703, "mean_token_accuracy": 0.6575090289115906, "num_tokens": 10637669.0, "step": 13220 }, { "epoch": 3.5015889830508473, "grad_norm": 1.6515865325927734, "learning_rate": 8.249337923728815e-06, "loss": 1.1582, "mean_token_accuracy": 0.7178553193807602, "num_tokens": 10639287.0, "step": 13222 }, { "epoch": 3.5021186440677967, "grad_norm": 1.9023005962371826, "learning_rate": 8.24907309322034e-06, "loss": 1.0796, "mean_token_accuracy": 0.7709198594093323, "num_tokens": 10640709.0, "step": 13224 }, { "epoch": 3.5026483050847457, "grad_norm": 2.544189453125, "learning_rate": 8.248808262711865e-06, "loss": 1.7975, "mean_token_accuracy": 0.5954028144478798, "num_tokens": 10642402.0, "step": 13226 }, { "epoch": 3.503177966101695, "grad_norm": 2.002861261367798, "learning_rate": 8.248543432203391e-06, "loss": 0.901, "mean_token_accuracy": 0.7884142324328423, "num_tokens": 10643789.0, "step": 13228 }, { "epoch": 3.503707627118644, "grad_norm": 1.9287807941436768, "learning_rate": 8.248278601694916e-06, "loss": 1.5327, "mean_token_accuracy": 0.6722670793533325, "num_tokens": 10645288.0, "step": 13230 }, { "epoch": 3.5042372881355934, "grad_norm": 1.8683863878250122, "learning_rate": 8.248013771186441e-06, "loss": 1.1616, "mean_token_accuracy": 0.7184152975678444, "num_tokens": 10647086.0, "step": 13232 }, { "epoch": 3.5047669491525424, "grad_norm": 1.7158868312835693, "learning_rate": 8.247748940677966e-06, "loss": 1.4033, "mean_token_accuracy": 0.6735134050250053, "num_tokens": 10648979.0, "step": 13234 }, { "epoch": 3.5052966101694913, "grad_norm": 1.7291450500488281, "learning_rate": 8.247484110169493e-06, "loss": 1.383, "mean_token_accuracy": 0.6713089346885681, "num_tokens": 10650682.0, "step": 13236 }, { "epoch": 3.5058262711864407, "grad_norm": 2.0229365825653076, "learning_rate": 8.247219279661017e-06, "loss": 1.4956, "mean_token_accuracy": 0.6758146062493324, "num_tokens": 10652065.0, "step": 13238 }, { "epoch": 3.5063559322033897, "grad_norm": 1.6939914226531982, "learning_rate": 8.246954449152544e-06, "loss": 1.2466, "mean_token_accuracy": 0.7202899530529976, "num_tokens": 10653546.0, "step": 13240 }, { "epoch": 3.506885593220339, "grad_norm": 2.0284907817840576, "learning_rate": 8.246689618644067e-06, "loss": 1.6129, "mean_token_accuracy": 0.669470950961113, "num_tokens": 10655176.0, "step": 13242 }, { "epoch": 3.507415254237288, "grad_norm": 1.560608983039856, "learning_rate": 8.246424788135594e-06, "loss": 1.2463, "mean_token_accuracy": 0.7120642587542534, "num_tokens": 10656881.0, "step": 13244 }, { "epoch": 3.507944915254237, "grad_norm": 1.8064305782318115, "learning_rate": 8.246159957627119e-06, "loss": 1.27, "mean_token_accuracy": 0.7082061469554901, "num_tokens": 10658494.0, "step": 13246 }, { "epoch": 3.5084745762711864, "grad_norm": 2.44028902053833, "learning_rate": 8.245895127118646e-06, "loss": 1.3573, "mean_token_accuracy": 0.6555696278810501, "num_tokens": 10660652.0, "step": 13248 }, { "epoch": 3.509004237288136, "grad_norm": 1.9979572296142578, "learning_rate": 8.24563029661017e-06, "loss": 1.0828, "step": 13250 }, { "epoch": 3.509004237288136, "eval_loss": 1.3100718259811401, "eval_mean_token_accuracy": 0.7010830896241325, "eval_num_tokens": 10662251.0, "eval_runtime": 48.2821, "eval_samples_per_second": 6.379, "eval_steps_per_second": 6.379, "step": 13250 }, { "epoch": 3.5095338983050848, "grad_norm": 1.749677300453186, "learning_rate": 8.245365466101695e-06, "loss": 1.3052, "mean_token_accuracy": 0.7229509018361568, "num_tokens": 10663984.0, "step": 13252 }, { "epoch": 3.5100635593220337, "grad_norm": 1.8085395097732544, "learning_rate": 8.24510063559322e-06, "loss": 1.2786, "mean_token_accuracy": 0.7184727117419243, "num_tokens": 10665533.0, "step": 13254 }, { "epoch": 3.510593220338983, "grad_norm": 1.6927273273468018, "learning_rate": 8.244835805084747e-06, "loss": 1.1255, "mean_token_accuracy": 0.7575406134128571, "num_tokens": 10667080.0, "step": 13256 }, { "epoch": 3.511122881355932, "grad_norm": 1.6062800884246826, "learning_rate": 8.244570974576272e-06, "loss": 0.9715, "mean_token_accuracy": 0.7762316688895226, "num_tokens": 10669050.0, "step": 13258 }, { "epoch": 3.5116525423728815, "grad_norm": 2.0480213165283203, "learning_rate": 8.244306144067797e-06, "loss": 1.4339, "mean_token_accuracy": 0.6788604855537415, "num_tokens": 10670861.0, "step": 13260 }, { "epoch": 3.5121822033898304, "grad_norm": 1.870255470275879, "learning_rate": 8.244041313559322e-06, "loss": 1.3507, "mean_token_accuracy": 0.675292931497097, "num_tokens": 10672488.0, "step": 13262 }, { "epoch": 3.5127118644067794, "grad_norm": 1.8843178749084473, "learning_rate": 8.243776483050848e-06, "loss": 1.4395, "mean_token_accuracy": 0.6869976744055748, "num_tokens": 10673926.0, "step": 13264 }, { "epoch": 3.513241525423729, "grad_norm": 1.9686936140060425, "learning_rate": 8.243511652542373e-06, "loss": 1.6413, "mean_token_accuracy": 0.6657541245222092, "num_tokens": 10675402.0, "step": 13266 }, { "epoch": 3.513771186440678, "grad_norm": 1.928969383239746, "learning_rate": 8.2432468220339e-06, "loss": 1.2832, "mean_token_accuracy": 0.7044920101761818, "num_tokens": 10676959.0, "step": 13268 }, { "epoch": 3.514300847457627, "grad_norm": 1.883424162864685, "learning_rate": 8.242981991525423e-06, "loss": 0.8459, "mean_token_accuracy": 0.7837560698390007, "num_tokens": 10678411.0, "step": 13270 }, { "epoch": 3.514830508474576, "grad_norm": 1.8806736469268799, "learning_rate": 8.24271716101695e-06, "loss": 1.1139, "mean_token_accuracy": 0.743362307548523, "num_tokens": 10680283.0, "step": 13272 }, { "epoch": 3.5153601694915255, "grad_norm": 1.8363057374954224, "learning_rate": 8.242452330508475e-06, "loss": 1.2877, "mean_token_accuracy": 0.7084954082965851, "num_tokens": 10681828.0, "step": 13274 }, { "epoch": 3.5158898305084745, "grad_norm": 1.8135288953781128, "learning_rate": 8.242187500000001e-06, "loss": 1.219, "mean_token_accuracy": 0.7199567779898643, "num_tokens": 10683334.0, "step": 13276 }, { "epoch": 3.516419491525424, "grad_norm": 1.988912582397461, "learning_rate": 8.241922669491526e-06, "loss": 1.3759, "mean_token_accuracy": 0.711820624768734, "num_tokens": 10684832.0, "step": 13278 }, { "epoch": 3.516949152542373, "grad_norm": 1.906620979309082, "learning_rate": 8.241657838983051e-06, "loss": 1.2757, "mean_token_accuracy": 0.7293715327978134, "num_tokens": 10686059.0, "step": 13280 }, { "epoch": 3.517478813559322, "grad_norm": 2.592434883117676, "learning_rate": 8.241393008474578e-06, "loss": 1.4806, "mean_token_accuracy": 0.6416138038039207, "num_tokens": 10687714.0, "step": 13282 }, { "epoch": 3.518008474576271, "grad_norm": 2.318420648574829, "learning_rate": 8.241128177966103e-06, "loss": 1.2298, "mean_token_accuracy": 0.7397220730781555, "num_tokens": 10689287.0, "step": 13284 }, { "epoch": 3.5185381355932206, "grad_norm": 1.9218934774398804, "learning_rate": 8.240863347457628e-06, "loss": 1.1868, "mean_token_accuracy": 0.7257023677229881, "num_tokens": 10690753.0, "step": 13286 }, { "epoch": 3.5190677966101696, "grad_norm": 2.0939435958862305, "learning_rate": 8.240598516949153e-06, "loss": 2.1936, "mean_token_accuracy": 0.555132869631052, "num_tokens": 10692603.0, "step": 13288 }, { "epoch": 3.5195974576271185, "grad_norm": 1.9039138555526733, "learning_rate": 8.24033368644068e-06, "loss": 1.3836, "mean_token_accuracy": 0.7037742808461189, "num_tokens": 10694260.0, "step": 13290 }, { "epoch": 3.520127118644068, "grad_norm": 2.194427013397217, "learning_rate": 8.240068855932204e-06, "loss": 1.1524, "mean_token_accuracy": 0.7444288954138756, "num_tokens": 10695755.0, "step": 13292 }, { "epoch": 3.520656779661017, "grad_norm": 1.597272515296936, "learning_rate": 8.23980402542373e-06, "loss": 0.9831, "mean_token_accuracy": 0.7585911452770233, "num_tokens": 10697504.0, "step": 13294 }, { "epoch": 3.5211864406779663, "grad_norm": 1.866727352142334, "learning_rate": 8.239539194915254e-06, "loss": 1.2448, "mean_token_accuracy": 0.7124374806880951, "num_tokens": 10698811.0, "step": 13296 }, { "epoch": 3.5217161016949152, "grad_norm": 2.0816543102264404, "learning_rate": 8.23927436440678e-06, "loss": 1.4375, "mean_token_accuracy": 0.6685559302568436, "num_tokens": 10700321.0, "step": 13298 }, { "epoch": 3.522245762711864, "grad_norm": 1.725329875946045, "learning_rate": 8.239009533898306e-06, "loss": 1.217, "mean_token_accuracy": 0.7220877632498741, "num_tokens": 10701751.0, "step": 13300 }, { "epoch": 3.5227754237288136, "grad_norm": 2.035457134246826, "learning_rate": 8.238744703389832e-06, "loss": 1.2967, "mean_token_accuracy": 0.7017441987991333, "num_tokens": 10703257.0, "step": 13302 }, { "epoch": 3.523305084745763, "grad_norm": 2.1564009189605713, "learning_rate": 8.238479872881357e-06, "loss": 1.4484, "mean_token_accuracy": 0.6671537831425667, "num_tokens": 10705028.0, "step": 13304 }, { "epoch": 3.523834745762712, "grad_norm": 2.0277137756347656, "learning_rate": 8.238215042372882e-06, "loss": 1.1036, "mean_token_accuracy": 0.7250204011797905, "num_tokens": 10706538.0, "step": 13306 }, { "epoch": 3.524364406779661, "grad_norm": 1.8861042261123657, "learning_rate": 8.237950211864407e-06, "loss": 1.3021, "mean_token_accuracy": 0.7154451906681061, "num_tokens": 10707954.0, "step": 13308 }, { "epoch": 3.5248940677966103, "grad_norm": 1.6800239086151123, "learning_rate": 8.237685381355934e-06, "loss": 0.8863, "mean_token_accuracy": 0.7522469162940979, "num_tokens": 10709562.0, "step": 13310 }, { "epoch": 3.5254237288135593, "grad_norm": 1.5447195768356323, "learning_rate": 8.237420550847458e-06, "loss": 1.6312, "mean_token_accuracy": 0.6378841549158096, "num_tokens": 10711721.0, "step": 13312 }, { "epoch": 3.5259533898305087, "grad_norm": 1.9652249813079834, "learning_rate": 8.237155720338983e-06, "loss": 1.2175, "mean_token_accuracy": 0.7111961096525192, "num_tokens": 10713237.0, "step": 13314 }, { "epoch": 3.5264830508474576, "grad_norm": 1.9347257614135742, "learning_rate": 8.236890889830508e-06, "loss": 1.3158, "mean_token_accuracy": 0.6961941421031952, "num_tokens": 10714853.0, "step": 13316 }, { "epoch": 3.5270127118644066, "grad_norm": 1.9439030885696411, "learning_rate": 8.236626059322035e-06, "loss": 1.5187, "mean_token_accuracy": 0.6566945910453796, "num_tokens": 10716509.0, "step": 13318 }, { "epoch": 3.527542372881356, "grad_norm": 1.6869797706604004, "learning_rate": 8.23636122881356e-06, "loss": 0.7636, "mean_token_accuracy": 0.8092575967311859, "num_tokens": 10718137.0, "step": 13320 }, { "epoch": 3.528072033898305, "grad_norm": 1.7639014720916748, "learning_rate": 8.236096398305085e-06, "loss": 1.4866, "mean_token_accuracy": 0.6614471971988678, "num_tokens": 10719796.0, "step": 13322 }, { "epoch": 3.5286016949152543, "grad_norm": 2.6296234130859375, "learning_rate": 8.23583156779661e-06, "loss": 1.3687, "mean_token_accuracy": 0.6907523199915886, "num_tokens": 10721174.0, "step": 13324 }, { "epoch": 3.5291313559322033, "grad_norm": 2.1988298892974854, "learning_rate": 8.235566737288136e-06, "loss": 1.4695, "mean_token_accuracy": 0.6643546149134636, "num_tokens": 10722630.0, "step": 13326 }, { "epoch": 3.5296610169491527, "grad_norm": 1.5073158740997314, "learning_rate": 8.235301906779661e-06, "loss": 0.8934, "mean_token_accuracy": 0.798778772354126, "num_tokens": 10724115.0, "step": 13328 }, { "epoch": 3.5301906779661016, "grad_norm": 1.6555042266845703, "learning_rate": 8.235037076271188e-06, "loss": 1.252, "mean_token_accuracy": 0.7091092318296432, "num_tokens": 10725653.0, "step": 13330 }, { "epoch": 3.530720338983051, "grad_norm": 2.067505359649658, "learning_rate": 8.234772245762713e-06, "loss": 1.1434, "mean_token_accuracy": 0.7085304111242294, "num_tokens": 10727268.0, "step": 13332 }, { "epoch": 3.53125, "grad_norm": 1.8184701204299927, "learning_rate": 8.234507415254238e-06, "loss": 1.3354, "mean_token_accuracy": 0.7071575745940208, "num_tokens": 10728719.0, "step": 13334 }, { "epoch": 3.531779661016949, "grad_norm": 1.6763452291488647, "learning_rate": 8.234242584745763e-06, "loss": 1.8179, "mean_token_accuracy": 0.6037650555372238, "num_tokens": 10730467.0, "step": 13336 }, { "epoch": 3.5323093220338984, "grad_norm": 1.7755489349365234, "learning_rate": 8.23397775423729e-06, "loss": 1.2441, "mean_token_accuracy": 0.7013202905654907, "num_tokens": 10732102.0, "step": 13338 }, { "epoch": 3.5328389830508473, "grad_norm": 2.086812734603882, "learning_rate": 8.233712923728814e-06, "loss": 1.4038, "mean_token_accuracy": 0.6894936785101891, "num_tokens": 10733660.0, "step": 13340 }, { "epoch": 3.5333686440677967, "grad_norm": 1.625893473625183, "learning_rate": 8.23344809322034e-06, "loss": 1.2785, "mean_token_accuracy": 0.7278029173612595, "num_tokens": 10735118.0, "step": 13342 }, { "epoch": 3.5338983050847457, "grad_norm": 1.995287299156189, "learning_rate": 8.233183262711864e-06, "loss": 1.5394, "mean_token_accuracy": 0.6747815981507301, "num_tokens": 10736784.0, "step": 13344 }, { "epoch": 3.534427966101695, "grad_norm": 1.708109974861145, "learning_rate": 8.23291843220339e-06, "loss": 1.2507, "mean_token_accuracy": 0.6925106421113014, "num_tokens": 10738567.0, "step": 13346 }, { "epoch": 3.534957627118644, "grad_norm": 1.7787203788757324, "learning_rate": 8.232653601694916e-06, "loss": 1.1868, "mean_token_accuracy": 0.7015012726187706, "num_tokens": 10740110.0, "step": 13348 }, { "epoch": 3.5354872881355934, "grad_norm": 1.8618532419204712, "learning_rate": 8.23238877118644e-06, "loss": 1.3499, "mean_token_accuracy": 0.6832246333360672, "num_tokens": 10741710.0, "step": 13350 }, { "epoch": 3.5360169491525424, "grad_norm": 2.079439640045166, "learning_rate": 8.232123940677966e-06, "loss": 1.6449, "mean_token_accuracy": 0.6610636711120605, "num_tokens": 10743147.0, "step": 13352 }, { "epoch": 3.5365466101694913, "grad_norm": 1.809263825416565, "learning_rate": 8.231859110169492e-06, "loss": 1.3133, "mean_token_accuracy": 0.7101559564471245, "num_tokens": 10744566.0, "step": 13354 }, { "epoch": 3.5370762711864407, "grad_norm": 1.833042025566101, "learning_rate": 8.231594279661017e-06, "loss": 0.8593, "mean_token_accuracy": 0.7837989553809166, "num_tokens": 10746352.0, "step": 13356 }, { "epoch": 3.5376059322033897, "grad_norm": 2.0344951152801514, "learning_rate": 8.231329449152544e-06, "loss": 1.2258, "mean_token_accuracy": 0.7284309789538383, "num_tokens": 10747771.0, "step": 13358 }, { "epoch": 3.538135593220339, "grad_norm": 1.747380256652832, "learning_rate": 8.231064618644069e-06, "loss": 1.1516, "mean_token_accuracy": 0.7388100698590279, "num_tokens": 10749243.0, "step": 13360 }, { "epoch": 3.538665254237288, "grad_norm": 1.9886088371276855, "learning_rate": 8.230799788135594e-06, "loss": 1.0482, "mean_token_accuracy": 0.7453457489609718, "num_tokens": 10750856.0, "step": 13362 }, { "epoch": 3.539194915254237, "grad_norm": 2.198617458343506, "learning_rate": 8.23053495762712e-06, "loss": 1.2042, "mean_token_accuracy": 0.7253762409090996, "num_tokens": 10752400.0, "step": 13364 }, { "epoch": 3.5397245762711864, "grad_norm": 2.0058445930480957, "learning_rate": 8.230270127118645e-06, "loss": 1.4427, "mean_token_accuracy": 0.6803516410291195, "num_tokens": 10753687.0, "step": 13366 }, { "epoch": 3.540254237288136, "grad_norm": 2.016289234161377, "learning_rate": 8.23000529661017e-06, "loss": 1.2548, "mean_token_accuracy": 0.6846458502113819, "num_tokens": 10755297.0, "step": 13368 }, { "epoch": 3.5407838983050848, "grad_norm": 2.0166099071502686, "learning_rate": 8.229740466101695e-06, "loss": 1.4746, "mean_token_accuracy": 0.6919164955615997, "num_tokens": 10757010.0, "step": 13370 }, { "epoch": 3.5413135593220337, "grad_norm": 1.616791009902954, "learning_rate": 8.229475635593222e-06, "loss": 1.1495, "mean_token_accuracy": 0.7258393242955208, "num_tokens": 10758661.0, "step": 13372 }, { "epoch": 3.541843220338983, "grad_norm": 2.15727162361145, "learning_rate": 8.229210805084747e-06, "loss": 1.6136, "mean_token_accuracy": 0.6533482931554317, "num_tokens": 10760236.0, "step": 13374 }, { "epoch": 3.542372881355932, "grad_norm": 1.837425947189331, "learning_rate": 8.228945974576271e-06, "loss": 1.2199, "mean_token_accuracy": 0.7079746648669243, "num_tokens": 10761534.0, "step": 13376 }, { "epoch": 3.5429025423728815, "grad_norm": 1.8594387769699097, "learning_rate": 8.228681144067796e-06, "loss": 1.5555, "mean_token_accuracy": 0.6642794981598854, "num_tokens": 10763216.0, "step": 13378 }, { "epoch": 3.5434322033898304, "grad_norm": 1.9598125219345093, "learning_rate": 8.228416313559323e-06, "loss": 1.4648, "mean_token_accuracy": 0.6953888386487961, "num_tokens": 10764874.0, "step": 13380 }, { "epoch": 3.5439618644067794, "grad_norm": 1.9642360210418701, "learning_rate": 8.228151483050848e-06, "loss": 1.0799, "mean_token_accuracy": 0.7399513497948647, "num_tokens": 10766381.0, "step": 13382 }, { "epoch": 3.544491525423729, "grad_norm": 1.9583288431167603, "learning_rate": 8.227886652542375e-06, "loss": 1.5376, "mean_token_accuracy": 0.6510750725865364, "num_tokens": 10767925.0, "step": 13384 }, { "epoch": 3.545021186440678, "grad_norm": 1.887448787689209, "learning_rate": 8.2276218220339e-06, "loss": 1.0986, "mean_token_accuracy": 0.7260660529136658, "num_tokens": 10769371.0, "step": 13386 }, { "epoch": 3.545550847457627, "grad_norm": 2.512995481491089, "learning_rate": 8.227356991525424e-06, "loss": 1.3845, "mean_token_accuracy": 0.699582751840353, "num_tokens": 10770829.0, "step": 13388 }, { "epoch": 3.546080508474576, "grad_norm": 1.9974466562271118, "learning_rate": 8.22709216101695e-06, "loss": 1.3356, "mean_token_accuracy": 0.6989789083600044, "num_tokens": 10772627.0, "step": 13390 }, { "epoch": 3.5466101694915255, "grad_norm": 1.8667644262313843, "learning_rate": 8.226827330508476e-06, "loss": 1.3299, "mean_token_accuracy": 0.6724659726023674, "num_tokens": 10774134.0, "step": 13392 }, { "epoch": 3.5471398305084745, "grad_norm": 1.8250582218170166, "learning_rate": 8.226562500000001e-06, "loss": 1.0191, "mean_token_accuracy": 0.7704078406095505, "num_tokens": 10775597.0, "step": 13394 }, { "epoch": 3.547669491525424, "grad_norm": 1.904219388961792, "learning_rate": 8.226297669491526e-06, "loss": 1.6165, "mean_token_accuracy": 0.6410400494933128, "num_tokens": 10777347.0, "step": 13396 }, { "epoch": 3.548199152542373, "grad_norm": 1.6983369588851929, "learning_rate": 8.22603283898305e-06, "loss": 1.0942, "mean_token_accuracy": 0.7348994091153145, "num_tokens": 10778934.0, "step": 13398 }, { "epoch": 3.548728813559322, "grad_norm": 1.9779102802276611, "learning_rate": 8.225768008474577e-06, "loss": 1.3776, "mean_token_accuracy": 0.6811240166425705, "num_tokens": 10780450.0, "step": 13400 }, { "epoch": 3.549258474576271, "grad_norm": 1.6321756839752197, "learning_rate": 8.225503177966102e-06, "loss": 1.2883, "mean_token_accuracy": 0.7098223641514778, "num_tokens": 10782038.0, "step": 13402 }, { "epoch": 3.5497881355932206, "grad_norm": 1.7129192352294922, "learning_rate": 8.225238347457627e-06, "loss": 1.4582, "mean_token_accuracy": 0.6961373686790466, "num_tokens": 10783746.0, "step": 13404 }, { "epoch": 3.5503177966101696, "grad_norm": 1.7092928886413574, "learning_rate": 8.224973516949152e-06, "loss": 1.2998, "mean_token_accuracy": 0.6903307139873505, "num_tokens": 10785466.0, "step": 13406 }, { "epoch": 3.5508474576271185, "grad_norm": 1.6876591444015503, "learning_rate": 8.224708686440679e-06, "loss": 0.9362, "mean_token_accuracy": 0.7834205776453018, "num_tokens": 10786951.0, "step": 13408 }, { "epoch": 3.551377118644068, "grad_norm": 1.7769039869308472, "learning_rate": 8.224443855932204e-06, "loss": 1.0593, "mean_token_accuracy": 0.7411770820617676, "num_tokens": 10788678.0, "step": 13410 }, { "epoch": 3.551906779661017, "grad_norm": 1.6476644277572632, "learning_rate": 8.22417902542373e-06, "loss": 1.4207, "mean_token_accuracy": 0.6947625353932381, "num_tokens": 10790374.0, "step": 13412 }, { "epoch": 3.5524364406779663, "grad_norm": 1.6465809345245361, "learning_rate": 8.223914194915255e-06, "loss": 1.1436, "mean_token_accuracy": 0.7291205450892448, "num_tokens": 10791921.0, "step": 13414 }, { "epoch": 3.5529661016949152, "grad_norm": 1.9406403303146362, "learning_rate": 8.22364936440678e-06, "loss": 1.6772, "mean_token_accuracy": 0.6598027125000954, "num_tokens": 10793496.0, "step": 13416 }, { "epoch": 3.553495762711864, "grad_norm": 1.8654600381851196, "learning_rate": 8.223384533898305e-06, "loss": 1.178, "mean_token_accuracy": 0.7180019915103912, "num_tokens": 10795011.0, "step": 13418 }, { "epoch": 3.5540254237288136, "grad_norm": 2.0090677738189697, "learning_rate": 8.223119703389832e-06, "loss": 1.4718, "mean_token_accuracy": 0.6666365340352058, "num_tokens": 10796732.0, "step": 13420 }, { "epoch": 3.554555084745763, "grad_norm": 1.9443731307983398, "learning_rate": 8.222854872881357e-06, "loss": 1.4653, "mean_token_accuracy": 0.6530750542879105, "num_tokens": 10799099.0, "step": 13422 }, { "epoch": 3.555084745762712, "grad_norm": 1.828228235244751, "learning_rate": 8.222590042372882e-06, "loss": 1.3183, "mean_token_accuracy": 0.7124413251876831, "num_tokens": 10800611.0, "step": 13424 }, { "epoch": 3.555614406779661, "grad_norm": 2.332998752593994, "learning_rate": 8.222325211864407e-06, "loss": 1.5163, "mean_token_accuracy": 0.6842521280050278, "num_tokens": 10802091.0, "step": 13426 }, { "epoch": 3.5561440677966103, "grad_norm": 1.788047432899475, "learning_rate": 8.222060381355933e-06, "loss": 1.2066, "mean_token_accuracy": 0.7054114267230034, "num_tokens": 10803720.0, "step": 13428 }, { "epoch": 3.5566737288135593, "grad_norm": 1.3876893520355225, "learning_rate": 8.221795550847458e-06, "loss": 0.9513, "mean_token_accuracy": 0.7807170376181602, "num_tokens": 10805215.0, "step": 13430 }, { "epoch": 3.5572033898305087, "grad_norm": 1.9014877080917358, "learning_rate": 8.221530720338983e-06, "loss": 1.2334, "mean_token_accuracy": 0.7146954387426376, "num_tokens": 10806948.0, "step": 13432 }, { "epoch": 3.5577330508474576, "grad_norm": 1.93606698513031, "learning_rate": 8.221265889830508e-06, "loss": 1.1979, "mean_token_accuracy": 0.7102033346891403, "num_tokens": 10808308.0, "step": 13434 }, { "epoch": 3.5582627118644066, "grad_norm": 2.44405198097229, "learning_rate": 8.221001059322035e-06, "loss": 1.7998, "mean_token_accuracy": 0.6399377100169659, "num_tokens": 10809809.0, "step": 13436 }, { "epoch": 3.558792372881356, "grad_norm": 1.7089351415634155, "learning_rate": 8.22073622881356e-06, "loss": 1.442, "mean_token_accuracy": 0.7009197324514389, "num_tokens": 10811556.0, "step": 13438 }, { "epoch": 3.559322033898305, "grad_norm": 1.9390665292739868, "learning_rate": 8.220471398305086e-06, "loss": 1.0813, "mean_token_accuracy": 0.7319923639297485, "num_tokens": 10812958.0, "step": 13440 }, { "epoch": 3.5598516949152543, "grad_norm": 1.7126600742340088, "learning_rate": 8.220206567796611e-06, "loss": 1.2117, "mean_token_accuracy": 0.7158762589097023, "num_tokens": 10814980.0, "step": 13442 }, { "epoch": 3.5603813559322033, "grad_norm": 1.8931151628494263, "learning_rate": 8.219941737288136e-06, "loss": 1.4841, "mean_token_accuracy": 0.6617616340517998, "num_tokens": 10816774.0, "step": 13444 }, { "epoch": 3.5609110169491527, "grad_norm": 2.0150749683380127, "learning_rate": 8.219676906779663e-06, "loss": 1.1236, "mean_token_accuracy": 0.734690360724926, "num_tokens": 10818167.0, "step": 13446 }, { "epoch": 3.5614406779661016, "grad_norm": 2.1152615547180176, "learning_rate": 8.219412076271188e-06, "loss": 1.2904, "mean_token_accuracy": 0.6998625919222832, "num_tokens": 10819654.0, "step": 13448 }, { "epoch": 3.561970338983051, "grad_norm": 2.085948944091797, "learning_rate": 8.219147245762712e-06, "loss": 1.4358, "mean_token_accuracy": 0.6599043309688568, "num_tokens": 10821061.0, "step": 13450 }, { "epoch": 3.5625, "grad_norm": 1.818713903427124, "learning_rate": 8.218882415254237e-06, "loss": 0.8805, "mean_token_accuracy": 0.7841843664646149, "num_tokens": 10822635.0, "step": 13452 }, { "epoch": 3.563029661016949, "grad_norm": 1.5310008525848389, "learning_rate": 8.218617584745764e-06, "loss": 0.9978, "mean_token_accuracy": 0.7351838015019894, "num_tokens": 10824501.0, "step": 13454 }, { "epoch": 3.5635593220338984, "grad_norm": 1.8635677099227905, "learning_rate": 8.218352754237289e-06, "loss": 1.3981, "mean_token_accuracy": 0.6728459522128105, "num_tokens": 10826121.0, "step": 13456 }, { "epoch": 3.5640889830508473, "grad_norm": 1.919830322265625, "learning_rate": 8.218087923728814e-06, "loss": 1.3715, "mean_token_accuracy": 0.7072495371103287, "num_tokens": 10827623.0, "step": 13458 }, { "epoch": 3.5646186440677967, "grad_norm": 1.9272119998931885, "learning_rate": 8.217823093220339e-06, "loss": 1.426, "mean_token_accuracy": 0.6723345816135406, "num_tokens": 10829321.0, "step": 13460 }, { "epoch": 3.5651483050847457, "grad_norm": 1.9472228288650513, "learning_rate": 8.217558262711865e-06, "loss": 1.5784, "mean_token_accuracy": 0.6465662159025669, "num_tokens": 10830936.0, "step": 13462 }, { "epoch": 3.565677966101695, "grad_norm": 2.101994276046753, "learning_rate": 8.21729343220339e-06, "loss": 1.467, "mean_token_accuracy": 0.692322313785553, "num_tokens": 10832532.0, "step": 13464 }, { "epoch": 3.566207627118644, "grad_norm": 1.4612421989440918, "learning_rate": 8.217028601694917e-06, "loss": 0.863, "mean_token_accuracy": 0.7781677544116974, "num_tokens": 10834222.0, "step": 13466 }, { "epoch": 3.5667372881355934, "grad_norm": 2.1296439170837402, "learning_rate": 8.216763771186442e-06, "loss": 1.4448, "mean_token_accuracy": 0.6799489930272102, "num_tokens": 10835713.0, "step": 13468 }, { "epoch": 3.5672669491525424, "grad_norm": 1.6895910501480103, "learning_rate": 8.216498940677967e-06, "loss": 1.1433, "mean_token_accuracy": 0.7285958155989647, "num_tokens": 10837205.0, "step": 13470 }, { "epoch": 3.5677966101694913, "grad_norm": 2.138428211212158, "learning_rate": 8.216234110169492e-06, "loss": 1.6961, "mean_token_accuracy": 0.6617362648248672, "num_tokens": 10838696.0, "step": 13472 }, { "epoch": 3.5683262711864407, "grad_norm": 1.9951061010360718, "learning_rate": 8.215969279661018e-06, "loss": 1.4423, "mean_token_accuracy": 0.6877996698021889, "num_tokens": 10840055.0, "step": 13474 }, { "epoch": 3.5688559322033897, "grad_norm": 1.9626445770263672, "learning_rate": 8.215704449152543e-06, "loss": 1.473, "mean_token_accuracy": 0.6655876822769642, "num_tokens": 10841812.0, "step": 13476 }, { "epoch": 3.569385593220339, "grad_norm": 2.076968193054199, "learning_rate": 8.215439618644068e-06, "loss": 1.0487, "mean_token_accuracy": 0.7365303635597229, "num_tokens": 10843085.0, "step": 13478 }, { "epoch": 3.569915254237288, "grad_norm": 2.165508508682251, "learning_rate": 8.215174788135593e-06, "loss": 1.2767, "mean_token_accuracy": 0.6908841952681541, "num_tokens": 10844606.0, "step": 13480 }, { "epoch": 3.570444915254237, "grad_norm": 1.882871389389038, "learning_rate": 8.21490995762712e-06, "loss": 1.4503, "mean_token_accuracy": 0.6718093454837799, "num_tokens": 10846184.0, "step": 13482 }, { "epoch": 3.5709745762711864, "grad_norm": 2.128309726715088, "learning_rate": 8.214645127118645e-06, "loss": 1.3825, "mean_token_accuracy": 0.6887283399701118, "num_tokens": 10847485.0, "step": 13484 }, { "epoch": 3.571504237288136, "grad_norm": 1.6886541843414307, "learning_rate": 8.21438029661017e-06, "loss": 1.2455, "mean_token_accuracy": 0.7338745072484016, "num_tokens": 10848854.0, "step": 13486 }, { "epoch": 3.5720338983050848, "grad_norm": 1.7826569080352783, "learning_rate": 8.214115466101695e-06, "loss": 0.9212, "mean_token_accuracy": 0.779525876045227, "num_tokens": 10850536.0, "step": 13488 }, { "epoch": 3.5725635593220337, "grad_norm": 1.896155595779419, "learning_rate": 8.213850635593221e-06, "loss": 1.1576, "mean_token_accuracy": 0.7339943423867226, "num_tokens": 10852181.0, "step": 13490 }, { "epoch": 3.573093220338983, "grad_norm": 1.7648662328720093, "learning_rate": 8.213585805084746e-06, "loss": 1.2713, "mean_token_accuracy": 0.7064106091856956, "num_tokens": 10853689.0, "step": 13492 }, { "epoch": 3.573622881355932, "grad_norm": 1.9358065128326416, "learning_rate": 8.213320974576273e-06, "loss": 1.4547, "mean_token_accuracy": 0.7045027166604996, "num_tokens": 10855365.0, "step": 13494 }, { "epoch": 3.5741525423728815, "grad_norm": 1.8393725156784058, "learning_rate": 8.213056144067798e-06, "loss": 0.9486, "mean_token_accuracy": 0.757656417787075, "num_tokens": 10856709.0, "step": 13496 }, { "epoch": 3.5746822033898304, "grad_norm": 1.577390193939209, "learning_rate": 8.212791313559323e-06, "loss": 0.9469, "mean_token_accuracy": 0.8035549148917198, "num_tokens": 10858181.0, "step": 13498 }, { "epoch": 3.5752118644067794, "grad_norm": 1.6985869407653809, "learning_rate": 8.212526483050848e-06, "loss": 1.5252, "step": 13500 }, { "epoch": 3.5752118644067794, "eval_loss": 1.3085060119628906, "eval_mean_token_accuracy": 0.701580651201211, "eval_num_tokens": 10859869.0, "eval_runtime": 48.4828, "eval_samples_per_second": 6.353, "eval_steps_per_second": 6.353, "step": 13500 }, { "epoch": 3.575741525423729, "grad_norm": 1.9871031045913696, "learning_rate": 8.212261652542374e-06, "loss": 1.0819, "mean_token_accuracy": 0.711592972278595, "num_tokens": 10861369.0, "step": 13502 }, { "epoch": 3.576271186440678, "grad_norm": 1.6535991430282593, "learning_rate": 8.211996822033899e-06, "loss": 1.2835, "mean_token_accuracy": 0.7129891142249107, "num_tokens": 10862947.0, "step": 13504 }, { "epoch": 3.576800847457627, "grad_norm": 2.183149576187134, "learning_rate": 8.211731991525424e-06, "loss": 1.2314, "mean_token_accuracy": 0.7364420667290688, "num_tokens": 10864447.0, "step": 13506 }, { "epoch": 3.577330508474576, "grad_norm": 1.8753434419631958, "learning_rate": 8.211467161016949e-06, "loss": 1.1807, "mean_token_accuracy": 0.6966322511434555, "num_tokens": 10866131.0, "step": 13508 }, { "epoch": 3.5778601694915255, "grad_norm": 1.5401743650436401, "learning_rate": 8.211202330508476e-06, "loss": 1.406, "mean_token_accuracy": 0.6835026517510414, "num_tokens": 10867958.0, "step": 13510 }, { "epoch": 3.5783898305084745, "grad_norm": 1.7797738313674927, "learning_rate": 8.2109375e-06, "loss": 1.1372, "mean_token_accuracy": 0.7233162149786949, "num_tokens": 10869666.0, "step": 13512 }, { "epoch": 3.578919491525424, "grad_norm": 1.9261655807495117, "learning_rate": 8.210672669491525e-06, "loss": 1.4783, "mean_token_accuracy": 0.6651974767446518, "num_tokens": 10871067.0, "step": 13514 }, { "epoch": 3.579449152542373, "grad_norm": 1.2526339292526245, "learning_rate": 8.21040783898305e-06, "loss": 0.878, "mean_token_accuracy": 0.7777158096432686, "num_tokens": 10872675.0, "step": 13516 }, { "epoch": 3.579978813559322, "grad_norm": 1.8564260005950928, "learning_rate": 8.210143008474577e-06, "loss": 1.2579, "mean_token_accuracy": 0.700028084218502, "num_tokens": 10874338.0, "step": 13518 }, { "epoch": 3.580508474576271, "grad_norm": 1.978430986404419, "learning_rate": 8.209878177966102e-06, "loss": 1.2957, "mean_token_accuracy": 0.7040638588368893, "num_tokens": 10875841.0, "step": 13520 }, { "epoch": 3.5810381355932206, "grad_norm": 2.2596616744995117, "learning_rate": 8.209613347457629e-06, "loss": 1.4569, "mean_token_accuracy": 0.6833451502025127, "num_tokens": 10877274.0, "step": 13522 }, { "epoch": 3.5815677966101696, "grad_norm": 2.3276822566986084, "learning_rate": 8.209348516949153e-06, "loss": 1.3318, "mean_token_accuracy": 0.7045488134026527, "num_tokens": 10878666.0, "step": 13524 }, { "epoch": 3.5820974576271185, "grad_norm": 1.855378270149231, "learning_rate": 8.209083686440678e-06, "loss": 1.35, "mean_token_accuracy": 0.6932595297694206, "num_tokens": 10880325.0, "step": 13526 }, { "epoch": 3.582627118644068, "grad_norm": 1.931313157081604, "learning_rate": 8.208818855932203e-06, "loss": 1.0577, "mean_token_accuracy": 0.7490171939134598, "num_tokens": 10881679.0, "step": 13528 }, { "epoch": 3.583156779661017, "grad_norm": 2.216810703277588, "learning_rate": 8.20855402542373e-06, "loss": 1.1595, "mean_token_accuracy": 0.7209995537996292, "num_tokens": 10883098.0, "step": 13530 }, { "epoch": 3.5836864406779663, "grad_norm": 2.3440792560577393, "learning_rate": 8.208289194915255e-06, "loss": 1.2358, "mean_token_accuracy": 0.7213219255208969, "num_tokens": 10884751.0, "step": 13532 }, { "epoch": 3.5842161016949152, "grad_norm": 2.336156129837036, "learning_rate": 8.20802436440678e-06, "loss": 1.3669, "mean_token_accuracy": 0.6954498887062073, "num_tokens": 10886338.0, "step": 13534 }, { "epoch": 3.584745762711864, "grad_norm": 1.867287516593933, "learning_rate": 8.207759533898306e-06, "loss": 1.4607, "mean_token_accuracy": 0.6858243383467197, "num_tokens": 10887839.0, "step": 13536 }, { "epoch": 3.5852754237288136, "grad_norm": 2.01713490486145, "learning_rate": 8.207494703389831e-06, "loss": 1.1023, "mean_token_accuracy": 0.7420279905200005, "num_tokens": 10889245.0, "step": 13538 }, { "epoch": 3.585805084745763, "grad_norm": 3.0740978717803955, "learning_rate": 8.207229872881356e-06, "loss": 0.8933, "mean_token_accuracy": 0.7756135910749435, "num_tokens": 10890946.0, "step": 13540 }, { "epoch": 3.586334745762712, "grad_norm": 1.8978323936462402, "learning_rate": 8.206965042372881e-06, "loss": 1.4397, "mean_token_accuracy": 0.6910000741481781, "num_tokens": 10892439.0, "step": 13542 }, { "epoch": 3.586864406779661, "grad_norm": 1.7227286100387573, "learning_rate": 8.206700211864408e-06, "loss": 0.7784, "mean_token_accuracy": 0.7984903901815414, "num_tokens": 10894016.0, "step": 13544 }, { "epoch": 3.5873940677966103, "grad_norm": 1.309086799621582, "learning_rate": 8.206435381355933e-06, "loss": 1.0286, "mean_token_accuracy": 0.7594940289855003, "num_tokens": 10895738.0, "step": 13546 }, { "epoch": 3.5879237288135593, "grad_norm": 1.468056321144104, "learning_rate": 8.20617055084746e-06, "loss": 1.1125, "mean_token_accuracy": 0.7398029118776321, "num_tokens": 10897193.0, "step": 13548 }, { "epoch": 3.5884533898305087, "grad_norm": 1.8120381832122803, "learning_rate": 8.205905720338984e-06, "loss": 0.7978, "mean_token_accuracy": 0.8010961711406708, "num_tokens": 10898960.0, "step": 13550 }, { "epoch": 3.5889830508474576, "grad_norm": 1.6657804250717163, "learning_rate": 8.20564088983051e-06, "loss": 1.2859, "mean_token_accuracy": 0.7071790546178818, "num_tokens": 10900416.0, "step": 13552 }, { "epoch": 3.5895127118644066, "grad_norm": 1.9078409671783447, "learning_rate": 8.205376059322034e-06, "loss": 1.535, "mean_token_accuracy": 0.6533746346831322, "num_tokens": 10902122.0, "step": 13554 }, { "epoch": 3.590042372881356, "grad_norm": 1.5407516956329346, "learning_rate": 8.20511122881356e-06, "loss": 0.7301, "mean_token_accuracy": 0.8034521788358688, "num_tokens": 10903681.0, "step": 13556 }, { "epoch": 3.590572033898305, "grad_norm": 1.752056360244751, "learning_rate": 8.204846398305086e-06, "loss": 1.0707, "mean_token_accuracy": 0.7719429954886436, "num_tokens": 10905011.0, "step": 13558 }, { "epoch": 3.5911016949152543, "grad_norm": 2.128624677658081, "learning_rate": 8.20458156779661e-06, "loss": 1.307, "mean_token_accuracy": 0.689398743212223, "num_tokens": 10906662.0, "step": 13560 }, { "epoch": 3.5916313559322033, "grad_norm": 1.9141753911972046, "learning_rate": 8.204316737288136e-06, "loss": 0.9962, "mean_token_accuracy": 0.7352351620793343, "num_tokens": 10908270.0, "step": 13562 }, { "epoch": 3.5921610169491527, "grad_norm": 1.762931227684021, "learning_rate": 8.204051906779662e-06, "loss": 1.1795, "mean_token_accuracy": 0.7247641235589981, "num_tokens": 10909693.0, "step": 13564 }, { "epoch": 3.5926906779661016, "grad_norm": 1.7791036367416382, "learning_rate": 8.203787076271187e-06, "loss": 1.4687, "mean_token_accuracy": 0.6625774875283241, "num_tokens": 10911644.0, "step": 13566 }, { "epoch": 3.593220338983051, "grad_norm": 2.378460645675659, "learning_rate": 8.203522245762712e-06, "loss": 1.4758, "mean_token_accuracy": 0.6970539316534996, "num_tokens": 10912980.0, "step": 13568 }, { "epoch": 3.59375, "grad_norm": 2.1230692863464355, "learning_rate": 8.203257415254237e-06, "loss": 1.647, "mean_token_accuracy": 0.6458104066550732, "num_tokens": 10914445.0, "step": 13570 }, { "epoch": 3.594279661016949, "grad_norm": 1.872641682624817, "learning_rate": 8.202992584745764e-06, "loss": 1.0178, "mean_token_accuracy": 0.7540215998888016, "num_tokens": 10916092.0, "step": 13572 }, { "epoch": 3.5948093220338984, "grad_norm": 1.6020623445510864, "learning_rate": 8.202727754237289e-06, "loss": 1.1939, "mean_token_accuracy": 0.7102465406060219, "num_tokens": 10917900.0, "step": 13574 }, { "epoch": 3.5953389830508473, "grad_norm": 1.965492606163025, "learning_rate": 8.202462923728815e-06, "loss": 1.2674, "mean_token_accuracy": 0.710199736058712, "num_tokens": 10919357.0, "step": 13576 }, { "epoch": 3.5958686440677967, "grad_norm": 1.9113754034042358, "learning_rate": 8.20219809322034e-06, "loss": 1.1056, "mean_token_accuracy": 0.7382921949028969, "num_tokens": 10920800.0, "step": 13578 }, { "epoch": 3.5963983050847457, "grad_norm": 2.4602620601654053, "learning_rate": 8.201933262711865e-06, "loss": 1.2819, "mean_token_accuracy": 0.6962758600711823, "num_tokens": 10922177.0, "step": 13580 }, { "epoch": 3.596927966101695, "grad_norm": 2.080608606338501, "learning_rate": 8.20166843220339e-06, "loss": 1.1711, "mean_token_accuracy": 0.7369857579469681, "num_tokens": 10923648.0, "step": 13582 }, { "epoch": 3.597457627118644, "grad_norm": 2.2452330589294434, "learning_rate": 8.201403601694917e-06, "loss": 1.6435, "mean_token_accuracy": 0.6442513950169086, "num_tokens": 10925111.0, "step": 13584 }, { "epoch": 3.5979872881355934, "grad_norm": 2.189761161804199, "learning_rate": 8.201138771186442e-06, "loss": 1.3125, "mean_token_accuracy": 0.7223513796925545, "num_tokens": 10926441.0, "step": 13586 }, { "epoch": 3.5985169491525424, "grad_norm": 1.6451168060302734, "learning_rate": 8.200873940677966e-06, "loss": 1.0005, "mean_token_accuracy": 0.751382552087307, "num_tokens": 10927969.0, "step": 13588 }, { "epoch": 3.5990466101694913, "grad_norm": 2.3146464824676514, "learning_rate": 8.200609110169491e-06, "loss": 1.2626, "mean_token_accuracy": 0.7023506984114647, "num_tokens": 10929550.0, "step": 13590 }, { "epoch": 3.5995762711864407, "grad_norm": 1.9502415657043457, "learning_rate": 8.200344279661018e-06, "loss": 1.3469, "mean_token_accuracy": 0.7091369107365608, "num_tokens": 10930884.0, "step": 13592 }, { "epoch": 3.6001059322033897, "grad_norm": 2.024845838546753, "learning_rate": 8.200079449152543e-06, "loss": 1.4621, "mean_token_accuracy": 0.6775247827172279, "num_tokens": 10932559.0, "step": 13594 }, { "epoch": 3.600635593220339, "grad_norm": 1.8226217031478882, "learning_rate": 8.199814618644068e-06, "loss": 1.1096, "mean_token_accuracy": 0.7271289750933647, "num_tokens": 10934168.0, "step": 13596 }, { "epoch": 3.601165254237288, "grad_norm": 1.6777557134628296, "learning_rate": 8.199549788135593e-06, "loss": 1.0576, "mean_token_accuracy": 0.7411266714334488, "num_tokens": 10935673.0, "step": 13598 }, { "epoch": 3.601694915254237, "grad_norm": 1.9598047733306885, "learning_rate": 8.19928495762712e-06, "loss": 1.189, "mean_token_accuracy": 0.7281683310866356, "num_tokens": 10937184.0, "step": 13600 }, { "epoch": 3.6022245762711864, "grad_norm": 1.8735501766204834, "learning_rate": 8.199020127118644e-06, "loss": 1.1188, "mean_token_accuracy": 0.736297681927681, "num_tokens": 10938724.0, "step": 13602 }, { "epoch": 3.602754237288136, "grad_norm": 1.860703468322754, "learning_rate": 8.198755296610171e-06, "loss": 1.2161, "mean_token_accuracy": 0.723382942378521, "num_tokens": 10940184.0, "step": 13604 }, { "epoch": 3.6032838983050848, "grad_norm": 1.962284803390503, "learning_rate": 8.198490466101696e-06, "loss": 1.4497, "mean_token_accuracy": 0.6901314333081245, "num_tokens": 10941571.0, "step": 13606 }, { "epoch": 3.6038135593220337, "grad_norm": 1.7091296911239624, "learning_rate": 8.198225635593221e-06, "loss": 1.3348, "mean_token_accuracy": 0.6960377991199493, "num_tokens": 10943101.0, "step": 13608 }, { "epoch": 3.604343220338983, "grad_norm": 1.9065715074539185, "learning_rate": 8.197960805084746e-06, "loss": 1.0753, "mean_token_accuracy": 0.7594949528574944, "num_tokens": 10944532.0, "step": 13610 }, { "epoch": 3.604872881355932, "grad_norm": 1.7595491409301758, "learning_rate": 8.197695974576272e-06, "loss": 1.078, "mean_token_accuracy": 0.7423929497599602, "num_tokens": 10945977.0, "step": 13612 }, { "epoch": 3.6054025423728815, "grad_norm": 2.0867483615875244, "learning_rate": 8.197431144067797e-06, "loss": 1.0354, "mean_token_accuracy": 0.7541307806968689, "num_tokens": 10947520.0, "step": 13614 }, { "epoch": 3.6059322033898304, "grad_norm": 1.8815900087356567, "learning_rate": 8.197166313559322e-06, "loss": 1.4129, "mean_token_accuracy": 0.7149464413523674, "num_tokens": 10948972.0, "step": 13616 }, { "epoch": 3.6064618644067794, "grad_norm": 2.0385074615478516, "learning_rate": 8.196901483050849e-06, "loss": 1.457, "mean_token_accuracy": 0.6710258796811104, "num_tokens": 10950711.0, "step": 13618 }, { "epoch": 3.606991525423729, "grad_norm": 1.8913298845291138, "learning_rate": 8.196636652542374e-06, "loss": 1.4361, "mean_token_accuracy": 0.6780392900109291, "num_tokens": 10952321.0, "step": 13620 }, { "epoch": 3.607521186440678, "grad_norm": 1.6911827325820923, "learning_rate": 8.196371822033899e-06, "loss": 1.2997, "mean_token_accuracy": 0.7103714868426323, "num_tokens": 10953966.0, "step": 13622 }, { "epoch": 3.608050847457627, "grad_norm": 2.1771631240844727, "learning_rate": 8.196106991525424e-06, "loss": 1.3224, "mean_token_accuracy": 0.7204759791493416, "num_tokens": 10955402.0, "step": 13624 }, { "epoch": 3.608580508474576, "grad_norm": 1.953784704208374, "learning_rate": 8.19584216101695e-06, "loss": 1.3081, "mean_token_accuracy": 0.7049507908523083, "num_tokens": 10956965.0, "step": 13626 }, { "epoch": 3.6091101694915255, "grad_norm": 1.7674508094787598, "learning_rate": 8.195577330508475e-06, "loss": 1.6163, "mean_token_accuracy": 0.6470558196306229, "num_tokens": 10958912.0, "step": 13628 }, { "epoch": 3.6096398305084745, "grad_norm": 1.9975082874298096, "learning_rate": 8.195312500000002e-06, "loss": 1.2724, "mean_token_accuracy": 0.7060747519135475, "num_tokens": 10960358.0, "step": 13630 }, { "epoch": 3.610169491525424, "grad_norm": 1.7410777807235718, "learning_rate": 8.195047669491527e-06, "loss": 0.9964, "mean_token_accuracy": 0.7396116033196449, "num_tokens": 10961855.0, "step": 13632 }, { "epoch": 3.610699152542373, "grad_norm": 2.0077052116394043, "learning_rate": 8.194782838983052e-06, "loss": 1.3352, "mean_token_accuracy": 0.6925792694091797, "num_tokens": 10963310.0, "step": 13634 }, { "epoch": 3.611228813559322, "grad_norm": 1.8145222663879395, "learning_rate": 8.194518008474577e-06, "loss": 1.3796, "mean_token_accuracy": 0.6723048835992813, "num_tokens": 10965009.0, "step": 13636 }, { "epoch": 3.611758474576271, "grad_norm": 1.5806230306625366, "learning_rate": 8.194253177966103e-06, "loss": 1.3433, "mean_token_accuracy": 0.684741273522377, "num_tokens": 10966730.0, "step": 13638 }, { "epoch": 3.6122881355932206, "grad_norm": 1.880883812904358, "learning_rate": 8.193988347457628e-06, "loss": 1.1077, "mean_token_accuracy": 0.7318913340568542, "num_tokens": 10968502.0, "step": 13640 }, { "epoch": 3.6128177966101696, "grad_norm": 1.9487721920013428, "learning_rate": 8.193723516949153e-06, "loss": 1.1423, "mean_token_accuracy": 0.7450006753206253, "num_tokens": 10970108.0, "step": 13642 }, { "epoch": 3.6133474576271185, "grad_norm": 1.2977265119552612, "learning_rate": 8.193458686440678e-06, "loss": 1.0453, "mean_token_accuracy": 0.7446270808577538, "num_tokens": 10971769.0, "step": 13644 }, { "epoch": 3.613877118644068, "grad_norm": 1.8730051517486572, "learning_rate": 8.193193855932205e-06, "loss": 1.3895, "mean_token_accuracy": 0.698694746941328, "num_tokens": 10973366.0, "step": 13646 }, { "epoch": 3.614406779661017, "grad_norm": 1.6741373538970947, "learning_rate": 8.19292902542373e-06, "loss": 1.2896, "mean_token_accuracy": 0.6919818036258221, "num_tokens": 10975049.0, "step": 13648 }, { "epoch": 3.6149364406779663, "grad_norm": 1.8138381242752075, "learning_rate": 8.192664194915255e-06, "loss": 1.1059, "mean_token_accuracy": 0.723287433385849, "num_tokens": 10976747.0, "step": 13650 }, { "epoch": 3.6154661016949152, "grad_norm": 1.6602516174316406, "learning_rate": 8.19239936440678e-06, "loss": 1.1355, "mean_token_accuracy": 0.7271469756960869, "num_tokens": 10978264.0, "step": 13652 }, { "epoch": 3.615995762711864, "grad_norm": 1.447035312652588, "learning_rate": 8.192134533898306e-06, "loss": 1.0348, "mean_token_accuracy": 0.7514391243457794, "num_tokens": 10979943.0, "step": 13654 }, { "epoch": 3.6165254237288136, "grad_norm": 1.7522653341293335, "learning_rate": 8.191869703389831e-06, "loss": 1.2854, "mean_token_accuracy": 0.6948921978473663, "num_tokens": 10981801.0, "step": 13656 }, { "epoch": 3.617055084745763, "grad_norm": 1.9116969108581543, "learning_rate": 8.191604872881358e-06, "loss": 1.462, "mean_token_accuracy": 0.67005854845047, "num_tokens": 10983344.0, "step": 13658 }, { "epoch": 3.617584745762712, "grad_norm": 1.9319988489151, "learning_rate": 8.191340042372883e-06, "loss": 1.3367, "mean_token_accuracy": 0.6913134306669235, "num_tokens": 10984755.0, "step": 13660 }, { "epoch": 3.618114406779661, "grad_norm": 1.5761289596557617, "learning_rate": 8.191075211864407e-06, "loss": 1.1277, "mean_token_accuracy": 0.7360964119434357, "num_tokens": 10986491.0, "step": 13662 }, { "epoch": 3.6186440677966103, "grad_norm": 1.656030297279358, "learning_rate": 8.190810381355932e-06, "loss": 0.9935, "mean_token_accuracy": 0.7526024803519249, "num_tokens": 10988014.0, "step": 13664 }, { "epoch": 3.6191737288135593, "grad_norm": 2.4054393768310547, "learning_rate": 8.190545550847459e-06, "loss": 1.3472, "mean_token_accuracy": 0.6993527561426163, "num_tokens": 10989487.0, "step": 13666 }, { "epoch": 3.6197033898305087, "grad_norm": 1.749858021736145, "learning_rate": 8.190280720338984e-06, "loss": 0.9207, "mean_token_accuracy": 0.7726613357663155, "num_tokens": 10991039.0, "step": 13668 }, { "epoch": 3.6202330508474576, "grad_norm": 1.797019362449646, "learning_rate": 8.190015889830509e-06, "loss": 1.0243, "mean_token_accuracy": 0.7540926039218903, "num_tokens": 10992636.0, "step": 13670 }, { "epoch": 3.6207627118644066, "grad_norm": 1.7246376276016235, "learning_rate": 8.189751059322034e-06, "loss": 1.024, "mean_token_accuracy": 0.7410334497690201, "num_tokens": 10994492.0, "step": 13672 }, { "epoch": 3.621292372881356, "grad_norm": 1.6159179210662842, "learning_rate": 8.18948622881356e-06, "loss": 1.1815, "mean_token_accuracy": 0.6998305097222328, "num_tokens": 10997165.0, "step": 13674 }, { "epoch": 3.621822033898305, "grad_norm": 2.089601993560791, "learning_rate": 8.189221398305085e-06, "loss": 0.8386, "mean_token_accuracy": 0.7829103171825409, "num_tokens": 10998858.0, "step": 13676 }, { "epoch": 3.6223516949152543, "grad_norm": 2.362447500228882, "learning_rate": 8.18895656779661e-06, "loss": 1.2382, "mean_token_accuracy": 0.7437665462493896, "num_tokens": 11000349.0, "step": 13678 }, { "epoch": 3.6228813559322033, "grad_norm": 2.3257484436035156, "learning_rate": 8.188691737288135e-06, "loss": 1.3947, "mean_token_accuracy": 0.6801592782139778, "num_tokens": 11001660.0, "step": 13680 }, { "epoch": 3.6234110169491527, "grad_norm": 1.7537068128585815, "learning_rate": 8.188426906779662e-06, "loss": 1.1006, "mean_token_accuracy": 0.720342218875885, "num_tokens": 11003045.0, "step": 13682 }, { "epoch": 3.6239406779661016, "grad_norm": 1.9871115684509277, "learning_rate": 8.188162076271187e-06, "loss": 1.4177, "mean_token_accuracy": 0.7224582955241203, "num_tokens": 11004924.0, "step": 13684 }, { "epoch": 3.624470338983051, "grad_norm": 1.605775237083435, "learning_rate": 8.187897245762713e-06, "loss": 1.3284, "mean_token_accuracy": 0.6856389120221138, "num_tokens": 11006723.0, "step": 13686 }, { "epoch": 3.625, "grad_norm": 1.675805687904358, "learning_rate": 8.187632415254237e-06, "loss": 1.3259, "mean_token_accuracy": 0.669069454073906, "num_tokens": 11008418.0, "step": 13688 }, { "epoch": 3.625529661016949, "grad_norm": 2.1704750061035156, "learning_rate": 8.187367584745763e-06, "loss": 1.7709, "mean_token_accuracy": 0.5942362919449806, "num_tokens": 11010265.0, "step": 13690 }, { "epoch": 3.6260593220338984, "grad_norm": 1.6014527082443237, "learning_rate": 8.187102754237288e-06, "loss": 1.036, "mean_token_accuracy": 0.727201297879219, "num_tokens": 11011955.0, "step": 13692 }, { "epoch": 3.6265889830508473, "grad_norm": 1.760330080986023, "learning_rate": 8.186837923728815e-06, "loss": 1.1831, "mean_token_accuracy": 0.7359203919768333, "num_tokens": 11013458.0, "step": 13694 }, { "epoch": 3.6271186440677967, "grad_norm": 1.768495798110962, "learning_rate": 8.18657309322034e-06, "loss": 1.2828, "mean_token_accuracy": 0.6897872984409332, "num_tokens": 11015075.0, "step": 13696 }, { "epoch": 3.6276483050847457, "grad_norm": 1.9120975732803345, "learning_rate": 8.186308262711865e-06, "loss": 1.0566, "mean_token_accuracy": 0.7612593173980713, "num_tokens": 11016765.0, "step": 13698 }, { "epoch": 3.628177966101695, "grad_norm": 1.8507037162780762, "learning_rate": 8.186043432203391e-06, "loss": 1.062, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 11018467.0, "step": 13700 }, { "epoch": 3.628707627118644, "grad_norm": 1.7863354682922363, "learning_rate": 8.185778601694916e-06, "loss": 1.0925, "mean_token_accuracy": 0.7206572964787483, "num_tokens": 11020145.0, "step": 13702 }, { "epoch": 3.6292372881355934, "grad_norm": 2.0463168621063232, "learning_rate": 8.185513771186441e-06, "loss": 1.0475, "mean_token_accuracy": 0.7317378297448158, "num_tokens": 11021637.0, "step": 13704 }, { "epoch": 3.6297669491525424, "grad_norm": 1.7383140325546265, "learning_rate": 8.185248940677966e-06, "loss": 1.112, "mean_token_accuracy": 0.7301643341779709, "num_tokens": 11023403.0, "step": 13706 }, { "epoch": 3.6302966101694913, "grad_norm": 1.5542314052581787, "learning_rate": 8.184984110169493e-06, "loss": 1.232, "mean_token_accuracy": 0.7039540857076645, "num_tokens": 11025014.0, "step": 13708 }, { "epoch": 3.6308262711864407, "grad_norm": 1.9579986333847046, "learning_rate": 8.184719279661018e-06, "loss": 1.3793, "mean_token_accuracy": 0.6886161491274834, "num_tokens": 11026371.0, "step": 13710 }, { "epoch": 3.6313559322033897, "grad_norm": 1.5038131475448608, "learning_rate": 8.184454449152544e-06, "loss": 1.1851, "mean_token_accuracy": 0.7396746054291725, "num_tokens": 11028127.0, "step": 13712 }, { "epoch": 3.631885593220339, "grad_norm": 1.9616931676864624, "learning_rate": 8.18418961864407e-06, "loss": 1.5624, "mean_token_accuracy": 0.6732265129685402, "num_tokens": 11030011.0, "step": 13714 }, { "epoch": 3.632415254237288, "grad_norm": 2.1739590167999268, "learning_rate": 8.183924788135594e-06, "loss": 1.409, "mean_token_accuracy": 0.6803679168224335, "num_tokens": 11031525.0, "step": 13716 }, { "epoch": 3.632944915254237, "grad_norm": 1.9246467351913452, "learning_rate": 8.183659957627119e-06, "loss": 1.4258, "mean_token_accuracy": 0.7024203687906265, "num_tokens": 11033282.0, "step": 13718 }, { "epoch": 3.6334745762711864, "grad_norm": 2.189741849899292, "learning_rate": 8.183395127118646e-06, "loss": 1.5726, "mean_token_accuracy": 0.653867095708847, "num_tokens": 11034974.0, "step": 13720 }, { "epoch": 3.634004237288136, "grad_norm": 2.1299479007720947, "learning_rate": 8.18313029661017e-06, "loss": 1.7105, "mean_token_accuracy": 0.6023406162858009, "num_tokens": 11036719.0, "step": 13722 }, { "epoch": 3.6345338983050848, "grad_norm": 2.0648157596588135, "learning_rate": 8.182865466101696e-06, "loss": 1.6664, "mean_token_accuracy": 0.6396830156445503, "num_tokens": 11038388.0, "step": 13724 }, { "epoch": 3.6350635593220337, "grad_norm": 2.063560962677002, "learning_rate": 8.18260063559322e-06, "loss": 1.3652, "mean_token_accuracy": 0.6836849078536034, "num_tokens": 11039637.0, "step": 13726 }, { "epoch": 3.635593220338983, "grad_norm": 1.805258870124817, "learning_rate": 8.182335805084747e-06, "loss": 1.3103, "mean_token_accuracy": 0.7013310566544533, "num_tokens": 11041341.0, "step": 13728 }, { "epoch": 3.636122881355932, "grad_norm": 1.6497021913528442, "learning_rate": 8.182070974576272e-06, "loss": 1.1571, "mean_token_accuracy": 0.7324099317193031, "num_tokens": 11043172.0, "step": 13730 }, { "epoch": 3.6366525423728815, "grad_norm": 2.051372766494751, "learning_rate": 8.181806144067797e-06, "loss": 1.0053, "mean_token_accuracy": 0.7569925636053085, "num_tokens": 11044616.0, "step": 13732 }, { "epoch": 3.6371822033898304, "grad_norm": 1.625365138053894, "learning_rate": 8.181541313559322e-06, "loss": 1.2245, "mean_token_accuracy": 0.7280922085046768, "num_tokens": 11045999.0, "step": 13734 }, { "epoch": 3.6377118644067794, "grad_norm": 2.0614588260650635, "learning_rate": 8.181276483050849e-06, "loss": 1.435, "mean_token_accuracy": 0.6847097724676132, "num_tokens": 11047568.0, "step": 13736 }, { "epoch": 3.638241525423729, "grad_norm": 1.6273902654647827, "learning_rate": 8.181011652542373e-06, "loss": 1.2104, "mean_token_accuracy": 0.7205957621335983, "num_tokens": 11049277.0, "step": 13738 }, { "epoch": 3.638771186440678, "grad_norm": 1.6806743144989014, "learning_rate": 8.1807468220339e-06, "loss": 1.5632, "mean_token_accuracy": 0.6597434729337692, "num_tokens": 11050925.0, "step": 13740 }, { "epoch": 3.639300847457627, "grad_norm": 1.8568311929702759, "learning_rate": 8.180481991525423e-06, "loss": 1.3441, "mean_token_accuracy": 0.6942349597811699, "num_tokens": 11052828.0, "step": 13742 }, { "epoch": 3.639830508474576, "grad_norm": 1.4737604856491089, "learning_rate": 8.18021716101695e-06, "loss": 1.5145, "mean_token_accuracy": 0.6649545878171921, "num_tokens": 11054497.0, "step": 13744 }, { "epoch": 3.6403601694915255, "grad_norm": 1.890401005744934, "learning_rate": 8.179952330508475e-06, "loss": 0.9466, "mean_token_accuracy": 0.7594121545553207, "num_tokens": 11056002.0, "step": 13746 }, { "epoch": 3.6408898305084745, "grad_norm": 1.7269920110702515, "learning_rate": 8.179687500000001e-06, "loss": 1.3883, "mean_token_accuracy": 0.6665970906615257, "num_tokens": 11057660.0, "step": 13748 }, { "epoch": 3.641419491525424, "grad_norm": 1.8227782249450684, "learning_rate": 8.179422669491526e-06, "loss": 1.4725, "step": 13750 }, { "epoch": 3.641419491525424, "eval_loss": 1.3098381757736206, "eval_mean_token_accuracy": 0.7017376156789916, "eval_num_tokens": 11059229.0, "eval_runtime": 48.1243, "eval_samples_per_second": 6.4, "eval_steps_per_second": 6.4, "step": 13750 }, { "epoch": 3.641949152542373, "grad_norm": 2.068619728088379, "learning_rate": 8.179157838983051e-06, "loss": 1.3727, "mean_token_accuracy": 0.6820976957678795, "num_tokens": 11060832.0, "step": 13752 }, { "epoch": 3.642478813559322, "grad_norm": 1.9676814079284668, "learning_rate": 8.178893008474576e-06, "loss": 1.6016, "mean_token_accuracy": 0.6575134918093681, "num_tokens": 11062350.0, "step": 13754 }, { "epoch": 3.643008474576271, "grad_norm": 1.6907384395599365, "learning_rate": 8.178628177966103e-06, "loss": 0.9862, "mean_token_accuracy": 0.7441493272781372, "num_tokens": 11063792.0, "step": 13756 }, { "epoch": 3.6435381355932206, "grad_norm": 1.6324931383132935, "learning_rate": 8.178363347457628e-06, "loss": 1.1237, "mean_token_accuracy": 0.7401861548423767, "num_tokens": 11065312.0, "step": 13758 }, { "epoch": 3.6440677966101696, "grad_norm": 2.082105875015259, "learning_rate": 8.178098516949153e-06, "loss": 1.4341, "mean_token_accuracy": 0.7004672139883041, "num_tokens": 11066875.0, "step": 13760 }, { "epoch": 3.6445974576271185, "grad_norm": 1.6521376371383667, "learning_rate": 8.177833686440678e-06, "loss": 1.1279, "mean_token_accuracy": 0.7299479618668556, "num_tokens": 11068605.0, "step": 13762 }, { "epoch": 3.645127118644068, "grad_norm": 1.4198577404022217, "learning_rate": 8.177568855932204e-06, "loss": 1.0115, "mean_token_accuracy": 0.739496149122715, "num_tokens": 11070521.0, "step": 13764 }, { "epoch": 3.645656779661017, "grad_norm": 2.0240983963012695, "learning_rate": 8.17730402542373e-06, "loss": 1.8556, "mean_token_accuracy": 0.6053659617900848, "num_tokens": 11072395.0, "step": 13766 }, { "epoch": 3.6461864406779663, "grad_norm": 1.9528043270111084, "learning_rate": 8.177039194915256e-06, "loss": 0.9425, "mean_token_accuracy": 0.7486484795808792, "num_tokens": 11073939.0, "step": 13768 }, { "epoch": 3.6467161016949152, "grad_norm": 1.8730556964874268, "learning_rate": 8.176774364406779e-06, "loss": 1.464, "mean_token_accuracy": 0.6859900057315826, "num_tokens": 11075522.0, "step": 13770 }, { "epoch": 3.647245762711864, "grad_norm": 1.4267034530639648, "learning_rate": 8.176509533898306e-06, "loss": 1.0768, "mean_token_accuracy": 0.761266678571701, "num_tokens": 11077154.0, "step": 13772 }, { "epoch": 3.6477754237288136, "grad_norm": 1.886685848236084, "learning_rate": 8.17624470338983e-06, "loss": 1.2823, "mean_token_accuracy": 0.6936144195497036, "num_tokens": 11078789.0, "step": 13774 }, { "epoch": 3.648305084745763, "grad_norm": 2.0543065071105957, "learning_rate": 8.175979872881357e-06, "loss": 1.2258, "mean_token_accuracy": 0.7280184999108315, "num_tokens": 11080440.0, "step": 13776 }, { "epoch": 3.648834745762712, "grad_norm": 2.1190929412841797, "learning_rate": 8.175715042372882e-06, "loss": 1.6626, "mean_token_accuracy": 0.6232375204563141, "num_tokens": 11082192.0, "step": 13778 }, { "epoch": 3.649364406779661, "grad_norm": 1.1135038137435913, "learning_rate": 8.175450211864407e-06, "loss": 1.3587, "mean_token_accuracy": 0.6655884683132172, "num_tokens": 11084777.0, "step": 13780 }, { "epoch": 3.6498940677966103, "grad_norm": 1.5162196159362793, "learning_rate": 8.175185381355934e-06, "loss": 1.422, "mean_token_accuracy": 0.6684217974543571, "num_tokens": 11086634.0, "step": 13782 }, { "epoch": 3.6504237288135593, "grad_norm": 2.0931754112243652, "learning_rate": 8.174920550847459e-06, "loss": 1.4636, "mean_token_accuracy": 0.7024167701601982, "num_tokens": 11088341.0, "step": 13784 }, { "epoch": 3.6509533898305087, "grad_norm": 1.3885598182678223, "learning_rate": 8.174655720338984e-06, "loss": 1.0301, "mean_token_accuracy": 0.7484382838010788, "num_tokens": 11090067.0, "step": 13786 }, { "epoch": 3.6514830508474576, "grad_norm": 1.8533109426498413, "learning_rate": 8.174390889830509e-06, "loss": 1.0632, "mean_token_accuracy": 0.747232049703598, "num_tokens": 11091713.0, "step": 13788 }, { "epoch": 3.6520127118644066, "grad_norm": 1.9151883125305176, "learning_rate": 8.174126059322035e-06, "loss": 1.3378, "mean_token_accuracy": 0.7050314620137215, "num_tokens": 11093469.0, "step": 13790 }, { "epoch": 3.652542372881356, "grad_norm": 1.6827493906021118, "learning_rate": 8.17386122881356e-06, "loss": 1.3607, "mean_token_accuracy": 0.6915253847837448, "num_tokens": 11095203.0, "step": 13792 }, { "epoch": 3.653072033898305, "grad_norm": 1.6527881622314453, "learning_rate": 8.173596398305087e-06, "loss": 1.2785, "mean_token_accuracy": 0.704025037586689, "num_tokens": 11097050.0, "step": 13794 }, { "epoch": 3.6536016949152543, "grad_norm": 1.627555012702942, "learning_rate": 8.17333156779661e-06, "loss": 1.21, "mean_token_accuracy": 0.718891017138958, "num_tokens": 11098859.0, "step": 13796 }, { "epoch": 3.6541313559322033, "grad_norm": 2.0004725456237793, "learning_rate": 8.173066737288137e-06, "loss": 1.5349, "mean_token_accuracy": 0.6509410552680492, "num_tokens": 11100523.0, "step": 13798 }, { "epoch": 3.6546610169491527, "grad_norm": 1.6115036010742188, "learning_rate": 8.172801906779661e-06, "loss": 1.3455, "mean_token_accuracy": 0.7210076451301575, "num_tokens": 11102080.0, "step": 13800 }, { "epoch": 3.6551906779661016, "grad_norm": 1.8504575490951538, "learning_rate": 8.172537076271188e-06, "loss": 1.5313, "mean_token_accuracy": 0.6667275279760361, "num_tokens": 11103741.0, "step": 13802 }, { "epoch": 3.655720338983051, "grad_norm": 2.236632823944092, "learning_rate": 8.172272245762713e-06, "loss": 1.1881, "mean_token_accuracy": 0.7232255935668945, "num_tokens": 11105143.0, "step": 13804 }, { "epoch": 3.65625, "grad_norm": 1.805862545967102, "learning_rate": 8.172007415254238e-06, "loss": 1.3476, "mean_token_accuracy": 0.6896155998110771, "num_tokens": 11106778.0, "step": 13806 }, { "epoch": 3.656779661016949, "grad_norm": 2.026552438735962, "learning_rate": 8.171742584745763e-06, "loss": 1.3697, "mean_token_accuracy": 0.7018986195325851, "num_tokens": 11108443.0, "step": 13808 }, { "epoch": 3.6573093220338984, "grad_norm": 1.791233777999878, "learning_rate": 8.17147775423729e-06, "loss": 1.2453, "mean_token_accuracy": 0.731248963624239, "num_tokens": 11109966.0, "step": 13810 }, { "epoch": 3.6578389830508473, "grad_norm": 1.5880061388015747, "learning_rate": 8.171212923728814e-06, "loss": 0.8839, "mean_token_accuracy": 0.7695913687348366, "num_tokens": 11112189.0, "step": 13812 }, { "epoch": 3.6583686440677967, "grad_norm": 2.378788471221924, "learning_rate": 8.17094809322034e-06, "loss": 1.1708, "mean_token_accuracy": 0.7244488820433617, "num_tokens": 11113683.0, "step": 13814 }, { "epoch": 3.6588983050847457, "grad_norm": 1.6235404014587402, "learning_rate": 8.170683262711864e-06, "loss": 1.0781, "mean_token_accuracy": 0.7477134615182877, "num_tokens": 11115057.0, "step": 13816 }, { "epoch": 3.659427966101695, "grad_norm": 1.7919317483901978, "learning_rate": 8.170418432203391e-06, "loss": 1.3085, "mean_token_accuracy": 0.7033075392246246, "num_tokens": 11116582.0, "step": 13818 }, { "epoch": 3.659957627118644, "grad_norm": 2.0370190143585205, "learning_rate": 8.170153601694916e-06, "loss": 1.8014, "mean_token_accuracy": 0.6172058545053005, "num_tokens": 11118327.0, "step": 13820 }, { "epoch": 3.6604872881355934, "grad_norm": 1.5193917751312256, "learning_rate": 8.169888771186442e-06, "loss": 0.9843, "mean_token_accuracy": 0.7671715542674065, "num_tokens": 11119936.0, "step": 13822 }, { "epoch": 3.6610169491525424, "grad_norm": 1.2978625297546387, "learning_rate": 8.169623940677966e-06, "loss": 0.8744, "mean_token_accuracy": 0.7817275747656822, "num_tokens": 11122799.0, "step": 13824 }, { "epoch": 3.6615466101694913, "grad_norm": 2.7585654258728027, "learning_rate": 8.169359110169492e-06, "loss": 1.6796, "mean_token_accuracy": 0.6496246755123138, "num_tokens": 11124126.0, "step": 13826 }, { "epoch": 3.6620762711864407, "grad_norm": 2.0307514667510986, "learning_rate": 8.169094279661017e-06, "loss": 1.6995, "mean_token_accuracy": 0.6098645925521851, "num_tokens": 11125732.0, "step": 13828 }, { "epoch": 3.6626059322033897, "grad_norm": 1.9455164670944214, "learning_rate": 8.168829449152544e-06, "loss": 1.3423, "mean_token_accuracy": 0.6657708808779716, "num_tokens": 11127547.0, "step": 13830 }, { "epoch": 3.663135593220339, "grad_norm": 1.790305733680725, "learning_rate": 8.168564618644069e-06, "loss": 0.972, "mean_token_accuracy": 0.7719709500670433, "num_tokens": 11128895.0, "step": 13832 }, { "epoch": 3.663665254237288, "grad_norm": 2.0950489044189453, "learning_rate": 8.168299788135594e-06, "loss": 1.5205, "mean_token_accuracy": 0.6564420014619827, "num_tokens": 11130242.0, "step": 13834 }, { "epoch": 3.664194915254237, "grad_norm": 1.836936354637146, "learning_rate": 8.168034957627119e-06, "loss": 0.7006, "mean_token_accuracy": 0.8160924464464188, "num_tokens": 11131673.0, "step": 13836 }, { "epoch": 3.6647245762711864, "grad_norm": 1.8883843421936035, "learning_rate": 8.167770127118645e-06, "loss": 1.4188, "mean_token_accuracy": 0.6789396926760674, "num_tokens": 11133203.0, "step": 13838 }, { "epoch": 3.665254237288136, "grad_norm": 1.7334777116775513, "learning_rate": 8.16750529661017e-06, "loss": 0.9799, "mean_token_accuracy": 0.7676345631480217, "num_tokens": 11135098.0, "step": 13840 }, { "epoch": 3.6657838983050848, "grad_norm": 1.686671495437622, "learning_rate": 8.167240466101695e-06, "loss": 1.3242, "mean_token_accuracy": 0.6965871900320053, "num_tokens": 11136878.0, "step": 13842 }, { "epoch": 3.6663135593220337, "grad_norm": 2.3422930240631104, "learning_rate": 8.16697563559322e-06, "loss": 1.4359, "mean_token_accuracy": 0.696078784763813, "num_tokens": 11138598.0, "step": 13844 }, { "epoch": 3.666843220338983, "grad_norm": 1.9632130861282349, "learning_rate": 8.166710805084747e-06, "loss": 1.3335, "mean_token_accuracy": 0.6818817481398582, "num_tokens": 11140185.0, "step": 13846 }, { "epoch": 3.667372881355932, "grad_norm": 1.8292231559753418, "learning_rate": 8.166445974576272e-06, "loss": 1.6182, "mean_token_accuracy": 0.6480342783033848, "num_tokens": 11142104.0, "step": 13848 }, { "epoch": 3.6679025423728815, "grad_norm": 1.5126535892486572, "learning_rate": 8.166181144067797e-06, "loss": 1.477, "mean_token_accuracy": 0.6854475513100624, "num_tokens": 11143790.0, "step": 13850 }, { "epoch": 3.6684322033898304, "grad_norm": 1.9743335247039795, "learning_rate": 8.165916313559321e-06, "loss": 1.4976, "mean_token_accuracy": 0.6686548665165901, "num_tokens": 11145238.0, "step": 13852 }, { "epoch": 3.6689618644067794, "grad_norm": 2.0907819271087646, "learning_rate": 8.165651483050848e-06, "loss": 1.1298, "mean_token_accuracy": 0.7224878072738647, "num_tokens": 11146650.0, "step": 13854 }, { "epoch": 3.669491525423729, "grad_norm": 1.9115430116653442, "learning_rate": 8.165386652542373e-06, "loss": 1.3866, "mean_token_accuracy": 0.6732108071446419, "num_tokens": 11148471.0, "step": 13856 }, { "epoch": 3.670021186440678, "grad_norm": 2.042836904525757, "learning_rate": 8.1651218220339e-06, "loss": 1.1516, "mean_token_accuracy": 0.7144251689314842, "num_tokens": 11150125.0, "step": 13858 }, { "epoch": 3.670550847457627, "grad_norm": 2.163351058959961, "learning_rate": 8.164856991525425e-06, "loss": 1.3713, "mean_token_accuracy": 0.6895016208291054, "num_tokens": 11151467.0, "step": 13860 }, { "epoch": 3.671080508474576, "grad_norm": 1.835585117340088, "learning_rate": 8.16459216101695e-06, "loss": 1.119, "mean_token_accuracy": 0.7331705614924431, "num_tokens": 11152872.0, "step": 13862 }, { "epoch": 3.6716101694915255, "grad_norm": 1.6710565090179443, "learning_rate": 8.164327330508474e-06, "loss": 0.9737, "mean_token_accuracy": 0.7653864994645119, "num_tokens": 11154554.0, "step": 13864 }, { "epoch": 3.6721398305084745, "grad_norm": 1.9998624324798584, "learning_rate": 8.164062500000001e-06, "loss": 1.2218, "mean_token_accuracy": 0.713182769715786, "num_tokens": 11156019.0, "step": 13866 }, { "epoch": 3.672669491525424, "grad_norm": 1.8465819358825684, "learning_rate": 8.163797669491526e-06, "loss": 1.3173, "mean_token_accuracy": 0.7099787630140781, "num_tokens": 11157822.0, "step": 13868 }, { "epoch": 3.673199152542373, "grad_norm": 2.150378465652466, "learning_rate": 8.163532838983051e-06, "loss": 1.5434, "mean_token_accuracy": 0.6765934750437737, "num_tokens": 11159351.0, "step": 13870 }, { "epoch": 3.673728813559322, "grad_norm": 2.0232138633728027, "learning_rate": 8.163268008474578e-06, "loss": 1.1192, "mean_token_accuracy": 0.7520630434155464, "num_tokens": 11160844.0, "step": 13872 }, { "epoch": 3.674258474576271, "grad_norm": 2.1192471981048584, "learning_rate": 8.163003177966102e-06, "loss": 1.38, "mean_token_accuracy": 0.691204272210598, "num_tokens": 11162364.0, "step": 13874 }, { "epoch": 3.6747881355932206, "grad_norm": 1.6226966381072998, "learning_rate": 8.162738347457629e-06, "loss": 1.1042, "mean_token_accuracy": 0.7342747300863266, "num_tokens": 11163916.0, "step": 13876 }, { "epoch": 3.6753177966101696, "grad_norm": 1.7251766920089722, "learning_rate": 8.162473516949152e-06, "loss": 0.8655, "mean_token_accuracy": 0.7870251461863518, "num_tokens": 11165385.0, "step": 13878 }, { "epoch": 3.6758474576271185, "grad_norm": 2.0290024280548096, "learning_rate": 8.162208686440679e-06, "loss": 1.6178, "mean_token_accuracy": 0.6479914858937263, "num_tokens": 11167078.0, "step": 13880 }, { "epoch": 3.676377118644068, "grad_norm": 1.9869135618209839, "learning_rate": 8.161943855932204e-06, "loss": 1.2979, "mean_token_accuracy": 0.7122361809015274, "num_tokens": 11168588.0, "step": 13882 }, { "epoch": 3.676906779661017, "grad_norm": 2.0321085453033447, "learning_rate": 8.16167902542373e-06, "loss": 1.6129, "mean_token_accuracy": 0.6559422388672829, "num_tokens": 11170109.0, "step": 13884 }, { "epoch": 3.6774364406779663, "grad_norm": 1.4625455141067505, "learning_rate": 8.161414194915255e-06, "loss": 0.8451, "mean_token_accuracy": 0.7824530526995659, "num_tokens": 11171834.0, "step": 13886 }, { "epoch": 3.6779661016949152, "grad_norm": 2.0041873455047607, "learning_rate": 8.16114936440678e-06, "loss": 1.7852, "mean_token_accuracy": 0.6048108264803886, "num_tokens": 11173677.0, "step": 13888 }, { "epoch": 3.678495762711864, "grad_norm": 2.0780014991760254, "learning_rate": 8.160884533898305e-06, "loss": 1.5605, "mean_token_accuracy": 0.6618482805788517, "num_tokens": 11175315.0, "step": 13890 }, { "epoch": 3.6790254237288136, "grad_norm": 1.8531254529953003, "learning_rate": 8.160619703389832e-06, "loss": 1.3072, "mean_token_accuracy": 0.7155708819627762, "num_tokens": 11176895.0, "step": 13892 }, { "epoch": 3.679555084745763, "grad_norm": 1.9869470596313477, "learning_rate": 8.160354872881357e-06, "loss": 1.1976, "mean_token_accuracy": 0.7317104563117027, "num_tokens": 11178436.0, "step": 13894 }, { "epoch": 3.680084745762712, "grad_norm": 1.71156644821167, "learning_rate": 8.160090042372882e-06, "loss": 1.0393, "mean_token_accuracy": 0.7373459562659264, "num_tokens": 11180056.0, "step": 13896 }, { "epoch": 3.680614406779661, "grad_norm": 1.8299667835235596, "learning_rate": 8.159825211864407e-06, "loss": 1.3529, "mean_token_accuracy": 0.691691629588604, "num_tokens": 11181572.0, "step": 13898 }, { "epoch": 3.6811440677966103, "grad_norm": 1.7574869394302368, "learning_rate": 8.159560381355933e-06, "loss": 1.3781, "mean_token_accuracy": 0.6963674612343311, "num_tokens": 11183375.0, "step": 13900 }, { "epoch": 3.6816737288135593, "grad_norm": 2.0819149017333984, "learning_rate": 8.159295550847458e-06, "loss": 0.9522, "mean_token_accuracy": 0.7759255692362785, "num_tokens": 11184713.0, "step": 13902 }, { "epoch": 3.6822033898305087, "grad_norm": 2.226355791091919, "learning_rate": 8.159030720338983e-06, "loss": 1.5929, "mean_token_accuracy": 0.6339330300688744, "num_tokens": 11186444.0, "step": 13904 }, { "epoch": 3.6827330508474576, "grad_norm": 1.2128651142120361, "learning_rate": 8.158765889830508e-06, "loss": 1.2584, "mean_token_accuracy": 0.6971158683300018, "num_tokens": 11188704.0, "step": 13906 }, { "epoch": 3.6832627118644066, "grad_norm": 1.3955707550048828, "learning_rate": 8.158501059322035e-06, "loss": 0.8652, "mean_token_accuracy": 0.7864973060786724, "num_tokens": 11190319.0, "step": 13908 }, { "epoch": 3.683792372881356, "grad_norm": 1.579203724861145, "learning_rate": 8.15823622881356e-06, "loss": 1.4459, "mean_token_accuracy": 0.624337799847126, "num_tokens": 11192843.0, "step": 13910 }, { "epoch": 3.684322033898305, "grad_norm": 1.7364377975463867, "learning_rate": 8.157971398305086e-06, "loss": 1.2558, "mean_token_accuracy": 0.7199633792042732, "num_tokens": 11194246.0, "step": 13912 }, { "epoch": 3.6848516949152543, "grad_norm": 1.9876636266708374, "learning_rate": 8.157706567796611e-06, "loss": 1.4344, "mean_token_accuracy": 0.6831868551671505, "num_tokens": 11195887.0, "step": 13914 }, { "epoch": 3.6853813559322033, "grad_norm": 1.8754786252975464, "learning_rate": 8.157441737288136e-06, "loss": 1.4194, "mean_token_accuracy": 0.6876415237784386, "num_tokens": 11197611.0, "step": 13916 }, { "epoch": 3.6859110169491527, "grad_norm": 2.0393950939178467, "learning_rate": 8.157176906779661e-06, "loss": 1.003, "mean_token_accuracy": 0.7801347002387047, "num_tokens": 11198950.0, "step": 13918 }, { "epoch": 3.6864406779661016, "grad_norm": 1.6111372709274292, "learning_rate": 8.156912076271188e-06, "loss": 1.1377, "mean_token_accuracy": 0.7400340586900711, "num_tokens": 11200936.0, "step": 13920 }, { "epoch": 3.686970338983051, "grad_norm": 1.6315749883651733, "learning_rate": 8.156647245762713e-06, "loss": 0.8493, "mean_token_accuracy": 0.7885693982243538, "num_tokens": 11202630.0, "step": 13922 }, { "epoch": 3.6875, "grad_norm": 2.0414202213287354, "learning_rate": 8.156382415254238e-06, "loss": 1.4794, "mean_token_accuracy": 0.6556186676025391, "num_tokens": 11204170.0, "step": 13924 }, { "epoch": 3.688029661016949, "grad_norm": 1.8884187936782837, "learning_rate": 8.156117584745763e-06, "loss": 1.6944, "mean_token_accuracy": 0.6223586425185204, "num_tokens": 11205916.0, "step": 13926 }, { "epoch": 3.6885593220338984, "grad_norm": 2.0791537761688232, "learning_rate": 8.155852754237289e-06, "loss": 1.5084, "mean_token_accuracy": 0.6640879139304161, "num_tokens": 11207343.0, "step": 13928 }, { "epoch": 3.6890889830508473, "grad_norm": 1.4317843914031982, "learning_rate": 8.155587923728814e-06, "loss": 1.1448, "mean_token_accuracy": 0.7384700626134872, "num_tokens": 11209003.0, "step": 13930 }, { "epoch": 3.6896186440677967, "grad_norm": 1.8354591131210327, "learning_rate": 8.155323093220339e-06, "loss": 1.534, "mean_token_accuracy": 0.6750087589025497, "num_tokens": 11210595.0, "step": 13932 }, { "epoch": 3.6901483050847457, "grad_norm": 2.0106799602508545, "learning_rate": 8.155058262711864e-06, "loss": 1.1697, "mean_token_accuracy": 0.7359534204006195, "num_tokens": 11212154.0, "step": 13934 }, { "epoch": 3.690677966101695, "grad_norm": 1.6936012506484985, "learning_rate": 8.15479343220339e-06, "loss": 1.0659, "mean_token_accuracy": 0.7625419422984123, "num_tokens": 11213804.0, "step": 13936 }, { "epoch": 3.691207627118644, "grad_norm": 1.4252853393554688, "learning_rate": 8.154528601694915e-06, "loss": 1.2593, "mean_token_accuracy": 0.7267959266901016, "num_tokens": 11215401.0, "step": 13938 }, { "epoch": 3.6917372881355934, "grad_norm": 1.8770248889923096, "learning_rate": 8.154263771186442e-06, "loss": 1.4624, "mean_token_accuracy": 0.6732845157384872, "num_tokens": 11216965.0, "step": 13940 }, { "epoch": 3.6922669491525424, "grad_norm": 1.9275933504104614, "learning_rate": 8.153998940677967e-06, "loss": 1.3065, "mean_token_accuracy": 0.6925115510821342, "num_tokens": 11218679.0, "step": 13942 }, { "epoch": 3.6927966101694913, "grad_norm": 1.766564130783081, "learning_rate": 8.153734110169492e-06, "loss": 1.4475, "mean_token_accuracy": 0.6548849865794182, "num_tokens": 11220335.0, "step": 13944 }, { "epoch": 3.6933262711864407, "grad_norm": 2.0318338871002197, "learning_rate": 8.153469279661017e-06, "loss": 1.0222, "mean_token_accuracy": 0.7633724957704544, "num_tokens": 11221928.0, "step": 13946 }, { "epoch": 3.6938559322033897, "grad_norm": 1.7047374248504639, "learning_rate": 8.153204449152544e-06, "loss": 1.3635, "mean_token_accuracy": 0.6919631808996201, "num_tokens": 11223743.0, "step": 13948 }, { "epoch": 3.694385593220339, "grad_norm": 1.9644509553909302, "learning_rate": 8.152939618644068e-06, "loss": 1.142, "mean_token_accuracy": 0.7351401671767235, "num_tokens": 11225338.0, "step": 13950 }, { "epoch": 3.694915254237288, "grad_norm": 2.0722362995147705, "learning_rate": 8.152674788135593e-06, "loss": 1.3577, "mean_token_accuracy": 0.7122907862067223, "num_tokens": 11226798.0, "step": 13952 }, { "epoch": 3.695444915254237, "grad_norm": 2.146672487258911, "learning_rate": 8.15240995762712e-06, "loss": 1.2395, "mean_token_accuracy": 0.7293705195188522, "num_tokens": 11228426.0, "step": 13954 }, { "epoch": 3.6959745762711864, "grad_norm": 1.780422568321228, "learning_rate": 8.152145127118645e-06, "loss": 1.0527, "mean_token_accuracy": 0.7488594204187393, "num_tokens": 11230111.0, "step": 13956 }, { "epoch": 3.696504237288136, "grad_norm": 1.6769003868103027, "learning_rate": 8.15188029661017e-06, "loss": 0.7815, "mean_token_accuracy": 0.7860396578907967, "num_tokens": 11231560.0, "step": 13958 }, { "epoch": 3.6970338983050848, "grad_norm": 1.690334677696228, "learning_rate": 8.151615466101695e-06, "loss": 1.325, "mean_token_accuracy": 0.7054477035999298, "num_tokens": 11233261.0, "step": 13960 }, { "epoch": 3.6975635593220337, "grad_norm": 1.5338878631591797, "learning_rate": 8.151350635593221e-06, "loss": 1.1771, "mean_token_accuracy": 0.7170118391513824, "num_tokens": 11235303.0, "step": 13962 }, { "epoch": 3.698093220338983, "grad_norm": 2.018544912338257, "learning_rate": 8.151085805084746e-06, "loss": 1.0582, "mean_token_accuracy": 0.746852234005928, "num_tokens": 11236934.0, "step": 13964 }, { "epoch": 3.698622881355932, "grad_norm": 1.6513962745666504, "learning_rate": 8.150820974576273e-06, "loss": 0.8638, "mean_token_accuracy": 0.789766937494278, "num_tokens": 11238338.0, "step": 13966 }, { "epoch": 3.6991525423728815, "grad_norm": 1.7049493789672852, "learning_rate": 8.150556144067798e-06, "loss": 1.062, "mean_token_accuracy": 0.7377990633249283, "num_tokens": 11240029.0, "step": 13968 }, { "epoch": 3.6996822033898304, "grad_norm": 1.9054360389709473, "learning_rate": 8.150291313559323e-06, "loss": 1.2501, "mean_token_accuracy": 0.6988589242100716, "num_tokens": 11241438.0, "step": 13970 }, { "epoch": 3.7002118644067794, "grad_norm": 2.003671407699585, "learning_rate": 8.150026483050848e-06, "loss": 1.8454, "mean_token_accuracy": 0.61483558639884, "num_tokens": 11243732.0, "step": 13972 }, { "epoch": 3.700741525423729, "grad_norm": 1.9104645252227783, "learning_rate": 8.149761652542374e-06, "loss": 1.2813, "mean_token_accuracy": 0.708404503762722, "num_tokens": 11245470.0, "step": 13974 }, { "epoch": 3.701271186440678, "grad_norm": 1.8580683469772339, "learning_rate": 8.1494968220339e-06, "loss": 1.0673, "mean_token_accuracy": 0.7594690024852753, "num_tokens": 11246838.0, "step": 13976 }, { "epoch": 3.701800847457627, "grad_norm": 1.962767243385315, "learning_rate": 8.149231991525424e-06, "loss": 1.388, "mean_token_accuracy": 0.668810062110424, "num_tokens": 11248488.0, "step": 13978 }, { "epoch": 3.702330508474576, "grad_norm": 1.7951799631118774, "learning_rate": 8.148967161016949e-06, "loss": 1.3739, "mean_token_accuracy": 0.690546490252018, "num_tokens": 11250025.0, "step": 13980 }, { "epoch": 3.7028601694915255, "grad_norm": 1.5351340770721436, "learning_rate": 8.148702330508476e-06, "loss": 0.8538, "mean_token_accuracy": 0.7792031466960907, "num_tokens": 11251516.0, "step": 13982 }, { "epoch": 3.7033898305084745, "grad_norm": 1.5454894304275513, "learning_rate": 8.1484375e-06, "loss": 1.2778, "mean_token_accuracy": 0.6766664236783981, "num_tokens": 11253650.0, "step": 13984 }, { "epoch": 3.703919491525424, "grad_norm": 1.5844054222106934, "learning_rate": 8.148172669491526e-06, "loss": 0.8095, "mean_token_accuracy": 0.7886480242013931, "num_tokens": 11255302.0, "step": 13986 }, { "epoch": 3.704449152542373, "grad_norm": 1.6411619186401367, "learning_rate": 8.14790783898305e-06, "loss": 1.2992, "mean_token_accuracy": 0.7343273088335991, "num_tokens": 11256990.0, "step": 13988 }, { "epoch": 3.704978813559322, "grad_norm": 1.5861648321151733, "learning_rate": 8.147643008474577e-06, "loss": 0.8888, "mean_token_accuracy": 0.7596643790602684, "num_tokens": 11258723.0, "step": 13990 }, { "epoch": 3.705508474576271, "grad_norm": 2.2188169956207275, "learning_rate": 8.147378177966102e-06, "loss": 1.6851, "mean_token_accuracy": 0.6439770758152008, "num_tokens": 11260161.0, "step": 13992 }, { "epoch": 3.7060381355932206, "grad_norm": 2.123579263687134, "learning_rate": 8.147113347457629e-06, "loss": 1.2395, "mean_token_accuracy": 0.7348778694868088, "num_tokens": 11261741.0, "step": 13994 }, { "epoch": 3.7065677966101696, "grad_norm": 1.884286642074585, "learning_rate": 8.146848516949154e-06, "loss": 1.501, "mean_token_accuracy": 0.6518909260630608, "num_tokens": 11263581.0, "step": 13996 }, { "epoch": 3.7070974576271185, "grad_norm": 1.637885570526123, "learning_rate": 8.146583686440679e-06, "loss": 0.9653, "mean_token_accuracy": 0.7572622373700142, "num_tokens": 11265080.0, "step": 13998 }, { "epoch": 3.707627118644068, "grad_norm": 2.009322166442871, "learning_rate": 8.146318855932204e-06, "loss": 1.2922, "step": 14000 }, { "epoch": 3.707627118644068, "eval_loss": 1.3091660737991333, "eval_mean_token_accuracy": 0.7016214511611245, "eval_num_tokens": 11266481.0, "eval_runtime": 48.342, "eval_samples_per_second": 6.371, "eval_steps_per_second": 6.371, "step": 14000 }, { "epoch": 3.708156779661017, "grad_norm": 1.7724486589431763, "learning_rate": 8.14605402542373e-06, "loss": 1.1289, "mean_token_accuracy": 0.7047285847365856, "num_tokens": 11268103.0, "step": 14002 }, { "epoch": 3.7086864406779663, "grad_norm": 2.2190778255462646, "learning_rate": 8.145789194915255e-06, "loss": 1.7089, "mean_token_accuracy": 0.6071780249476433, "num_tokens": 11269662.0, "step": 14004 }, { "epoch": 3.7092161016949152, "grad_norm": 2.029953956604004, "learning_rate": 8.14552436440678e-06, "loss": 1.4532, "mean_token_accuracy": 0.67057054489851, "num_tokens": 11271305.0, "step": 14006 }, { "epoch": 3.709745762711864, "grad_norm": 2.0668580532073975, "learning_rate": 8.145259533898305e-06, "loss": 1.458, "mean_token_accuracy": 0.6780184507369995, "num_tokens": 11273007.0, "step": 14008 }, { "epoch": 3.7102754237288136, "grad_norm": 1.8387887477874756, "learning_rate": 8.144994703389832e-06, "loss": 1.4438, "mean_token_accuracy": 0.6581496447324753, "num_tokens": 11274520.0, "step": 14010 }, { "epoch": 3.710805084745763, "grad_norm": 1.9453874826431274, "learning_rate": 8.144729872881356e-06, "loss": 1.0703, "mean_token_accuracy": 0.7572408765554428, "num_tokens": 11275959.0, "step": 14012 }, { "epoch": 3.711334745762712, "grad_norm": 1.7324559688568115, "learning_rate": 8.144465042372881e-06, "loss": 1.168, "mean_token_accuracy": 0.7171546667814255, "num_tokens": 11277463.0, "step": 14014 }, { "epoch": 3.711864406779661, "grad_norm": 1.5695548057556152, "learning_rate": 8.144200211864406e-06, "loss": 1.1571, "mean_token_accuracy": 0.7294327020645142, "num_tokens": 11278942.0, "step": 14016 }, { "epoch": 3.7123940677966103, "grad_norm": 1.526090383529663, "learning_rate": 8.143935381355933e-06, "loss": 1.0286, "mean_token_accuracy": 0.7445804923772812, "num_tokens": 11280748.0, "step": 14018 }, { "epoch": 3.7129237288135593, "grad_norm": 1.7873139381408691, "learning_rate": 8.143670550847458e-06, "loss": 1.085, "mean_token_accuracy": 0.7255065143108368, "num_tokens": 11282474.0, "step": 14020 }, { "epoch": 3.7134533898305087, "grad_norm": 2.1861062049865723, "learning_rate": 8.143405720338985e-06, "loss": 1.2772, "mean_token_accuracy": 0.69849693775177, "num_tokens": 11284060.0, "step": 14022 }, { "epoch": 3.7139830508474576, "grad_norm": 1.5662263631820679, "learning_rate": 8.14314088983051e-06, "loss": 1.6214, "mean_token_accuracy": 0.6287990659475327, "num_tokens": 11285893.0, "step": 14024 }, { "epoch": 3.7145127118644066, "grad_norm": 1.9489837884902954, "learning_rate": 8.142876059322034e-06, "loss": 1.5062, "mean_token_accuracy": 0.652198851108551, "num_tokens": 11287528.0, "step": 14026 }, { "epoch": 3.715042372881356, "grad_norm": 1.9402271509170532, "learning_rate": 8.14261122881356e-06, "loss": 1.0219, "mean_token_accuracy": 0.7576163187623024, "num_tokens": 11288715.0, "step": 14028 }, { "epoch": 3.715572033898305, "grad_norm": 2.186372756958008, "learning_rate": 8.142346398305086e-06, "loss": 1.3516, "mean_token_accuracy": 0.6737668961286545, "num_tokens": 11290238.0, "step": 14030 }, { "epoch": 3.7161016949152543, "grad_norm": 1.5206702947616577, "learning_rate": 8.142081567796611e-06, "loss": 0.8254, "mean_token_accuracy": 0.7943115755915642, "num_tokens": 11291650.0, "step": 14032 }, { "epoch": 3.7166313559322033, "grad_norm": 2.2418651580810547, "learning_rate": 8.141816737288136e-06, "loss": 1.4288, "mean_token_accuracy": 0.6832994148135185, "num_tokens": 11293229.0, "step": 14034 }, { "epoch": 3.7171610169491527, "grad_norm": 2.0385799407958984, "learning_rate": 8.141551906779662e-06, "loss": 1.9284, "mean_token_accuracy": 0.5748868919909, "num_tokens": 11294896.0, "step": 14036 }, { "epoch": 3.7176906779661016, "grad_norm": 1.9680863618850708, "learning_rate": 8.141287076271187e-06, "loss": 1.3824, "mean_token_accuracy": 0.6907712519168854, "num_tokens": 11296656.0, "step": 14038 }, { "epoch": 3.718220338983051, "grad_norm": 2.1281940937042236, "learning_rate": 8.141022245762712e-06, "loss": 1.6887, "mean_token_accuracy": 0.6720674335956573, "num_tokens": 11298303.0, "step": 14040 }, { "epoch": 3.71875, "grad_norm": 1.5648776292800903, "learning_rate": 8.140757415254237e-06, "loss": 0.7875, "mean_token_accuracy": 0.7974027693271637, "num_tokens": 11299867.0, "step": 14042 }, { "epoch": 3.719279661016949, "grad_norm": 1.8475629091262817, "learning_rate": 8.140492584745764e-06, "loss": 1.4896, "mean_token_accuracy": 0.691688559949398, "num_tokens": 11301260.0, "step": 14044 }, { "epoch": 3.7198093220338984, "grad_norm": 1.8972078561782837, "learning_rate": 8.140227754237289e-06, "loss": 1.3111, "mean_token_accuracy": 0.6999168321490288, "num_tokens": 11302754.0, "step": 14046 }, { "epoch": 3.7203389830508473, "grad_norm": 1.7236297130584717, "learning_rate": 8.139962923728815e-06, "loss": 1.1473, "mean_token_accuracy": 0.737649030983448, "num_tokens": 11304360.0, "step": 14048 }, { "epoch": 3.7208686440677967, "grad_norm": 1.8169105052947998, "learning_rate": 8.13969809322034e-06, "loss": 0.9791, "mean_token_accuracy": 0.7518636956810951, "num_tokens": 11305794.0, "step": 14050 }, { "epoch": 3.7213983050847457, "grad_norm": 2.093226194381714, "learning_rate": 8.139433262711865e-06, "loss": 1.3854, "mean_token_accuracy": 0.6813677623867989, "num_tokens": 11307318.0, "step": 14052 }, { "epoch": 3.721927966101695, "grad_norm": 1.9915857315063477, "learning_rate": 8.13916843220339e-06, "loss": 1.5672, "mean_token_accuracy": 0.6482941433787346, "num_tokens": 11308961.0, "step": 14054 }, { "epoch": 3.722457627118644, "grad_norm": 2.041195869445801, "learning_rate": 8.138903601694917e-06, "loss": 1.5097, "mean_token_accuracy": 0.6795567199587822, "num_tokens": 11310550.0, "step": 14056 }, { "epoch": 3.7229872881355934, "grad_norm": 2.541292428970337, "learning_rate": 8.138638771186442e-06, "loss": 1.4112, "mean_token_accuracy": 0.6942180842161179, "num_tokens": 11311820.0, "step": 14058 }, { "epoch": 3.7235169491525424, "grad_norm": 1.798764705657959, "learning_rate": 8.138373940677967e-06, "loss": 1.4787, "mean_token_accuracy": 0.6681883856654167, "num_tokens": 11313432.0, "step": 14060 }, { "epoch": 3.7240466101694913, "grad_norm": 2.013537645339966, "learning_rate": 8.138109110169492e-06, "loss": 1.0672, "mean_token_accuracy": 0.7483191266655922, "num_tokens": 11314910.0, "step": 14062 }, { "epoch": 3.7245762711864407, "grad_norm": 1.811058521270752, "learning_rate": 8.137844279661018e-06, "loss": 1.3967, "mean_token_accuracy": 0.6780898049473763, "num_tokens": 11316741.0, "step": 14064 }, { "epoch": 3.7251059322033897, "grad_norm": 2.1401751041412354, "learning_rate": 8.137579449152543e-06, "loss": 1.4251, "mean_token_accuracy": 0.701018787920475, "num_tokens": 11318267.0, "step": 14066 }, { "epoch": 3.725635593220339, "grad_norm": 1.8157916069030762, "learning_rate": 8.137314618644068e-06, "loss": 1.1398, "mean_token_accuracy": 0.7201304957270622, "num_tokens": 11319810.0, "step": 14068 }, { "epoch": 3.726165254237288, "grad_norm": 1.904928207397461, "learning_rate": 8.137049788135593e-06, "loss": 1.022, "mean_token_accuracy": 0.757286787033081, "num_tokens": 11321246.0, "step": 14070 }, { "epoch": 3.726694915254237, "grad_norm": 1.9030870199203491, "learning_rate": 8.13678495762712e-06, "loss": 1.1049, "mean_token_accuracy": 0.7443543076515198, "num_tokens": 11322889.0, "step": 14072 }, { "epoch": 3.7272245762711864, "grad_norm": 1.7880107164382935, "learning_rate": 8.136520127118645e-06, "loss": 0.91, "mean_token_accuracy": 0.7519731596112251, "num_tokens": 11324640.0, "step": 14074 }, { "epoch": 3.727754237288136, "grad_norm": 1.9408791065216064, "learning_rate": 8.136255296610171e-06, "loss": 1.4775, "mean_token_accuracy": 0.6643204838037491, "num_tokens": 11326347.0, "step": 14076 }, { "epoch": 3.7282838983050848, "grad_norm": 1.9608924388885498, "learning_rate": 8.135990466101696e-06, "loss": 1.8061, "mean_token_accuracy": 0.6321702040731907, "num_tokens": 11327970.0, "step": 14078 }, { "epoch": 3.7288135593220337, "grad_norm": 1.9497644901275635, "learning_rate": 8.135725635593221e-06, "loss": 1.1085, "mean_token_accuracy": 0.7517764791846275, "num_tokens": 11329586.0, "step": 14080 }, { "epoch": 3.729343220338983, "grad_norm": 2.2392807006835938, "learning_rate": 8.135460805084746e-06, "loss": 1.3684, "mean_token_accuracy": 0.6915527209639549, "num_tokens": 11331164.0, "step": 14082 }, { "epoch": 3.729872881355932, "grad_norm": 1.8296200037002563, "learning_rate": 8.135195974576273e-06, "loss": 1.3703, "mean_token_accuracy": 0.6819515898823738, "num_tokens": 11332885.0, "step": 14084 }, { "epoch": 3.7304025423728815, "grad_norm": 1.4978748559951782, "learning_rate": 8.134931144067798e-06, "loss": 0.9626, "mean_token_accuracy": 0.751072108745575, "num_tokens": 11334529.0, "step": 14086 }, { "epoch": 3.7309322033898304, "grad_norm": 2.1640403270721436, "learning_rate": 8.134666313559322e-06, "loss": 1.2719, "mean_token_accuracy": 0.7193351984024048, "num_tokens": 11335910.0, "step": 14088 }, { "epoch": 3.7314618644067794, "grad_norm": 1.300554633140564, "learning_rate": 8.134401483050847e-06, "loss": 1.384, "mean_token_accuracy": 0.7074280306696892, "num_tokens": 11338335.0, "step": 14090 }, { "epoch": 3.731991525423729, "grad_norm": 1.9844154119491577, "learning_rate": 8.134136652542374e-06, "loss": 1.3799, "mean_token_accuracy": 0.7201751470565796, "num_tokens": 11339884.0, "step": 14092 }, { "epoch": 3.732521186440678, "grad_norm": 1.5098754167556763, "learning_rate": 8.133871822033899e-06, "loss": 1.0695, "mean_token_accuracy": 0.7314491495490074, "num_tokens": 11341541.0, "step": 14094 }, { "epoch": 3.733050847457627, "grad_norm": 1.7115569114685059, "learning_rate": 8.133606991525424e-06, "loss": 1.3583, "mean_token_accuracy": 0.7006764709949493, "num_tokens": 11343472.0, "step": 14096 }, { "epoch": 3.733580508474576, "grad_norm": 1.89798104763031, "learning_rate": 8.133342161016949e-06, "loss": 1.1546, "mean_token_accuracy": 0.6989642679691315, "num_tokens": 11344941.0, "step": 14098 }, { "epoch": 3.7341101694915255, "grad_norm": 1.471139669418335, "learning_rate": 8.133077330508475e-06, "loss": 1.4167, "mean_token_accuracy": 0.6731217876076698, "num_tokens": 11346823.0, "step": 14100 }, { "epoch": 3.7346398305084745, "grad_norm": 2.0729126930236816, "learning_rate": 8.1328125e-06, "loss": 1.3315, "mean_token_accuracy": 0.6960354894399643, "num_tokens": 11348372.0, "step": 14102 }, { "epoch": 3.735169491525424, "grad_norm": 2.191523551940918, "learning_rate": 8.132547669491527e-06, "loss": 1.5519, "mean_token_accuracy": 0.6717427968978882, "num_tokens": 11350057.0, "step": 14104 }, { "epoch": 3.735699152542373, "grad_norm": 1.5651048421859741, "learning_rate": 8.132282838983052e-06, "loss": 1.2313, "mean_token_accuracy": 0.713575966656208, "num_tokens": 11351574.0, "step": 14106 }, { "epoch": 3.736228813559322, "grad_norm": 1.6762696504592896, "learning_rate": 8.132018008474577e-06, "loss": 1.3781, "mean_token_accuracy": 0.6748509183526039, "num_tokens": 11353405.0, "step": 14108 }, { "epoch": 3.736758474576271, "grad_norm": 2.0309488773345947, "learning_rate": 8.131753177966102e-06, "loss": 1.6277, "mean_token_accuracy": 0.6555354744195938, "num_tokens": 11354811.0, "step": 14110 }, { "epoch": 3.7372881355932206, "grad_norm": 1.7321951389312744, "learning_rate": 8.131488347457628e-06, "loss": 1.261, "mean_token_accuracy": 0.6994170099496841, "num_tokens": 11356213.0, "step": 14112 }, { "epoch": 3.7378177966101696, "grad_norm": 2.2034876346588135, "learning_rate": 8.131223516949153e-06, "loss": 1.4001, "mean_token_accuracy": 0.6792909651994705, "num_tokens": 11357833.0, "step": 14114 }, { "epoch": 3.7383474576271185, "grad_norm": 1.4031152725219727, "learning_rate": 8.130958686440678e-06, "loss": 1.0153, "mean_token_accuracy": 0.7210099287331104, "num_tokens": 11360223.0, "step": 14116 }, { "epoch": 3.738877118644068, "grad_norm": 1.6975231170654297, "learning_rate": 8.130693855932203e-06, "loss": 1.2182, "mean_token_accuracy": 0.6989886239171028, "num_tokens": 11361888.0, "step": 14118 }, { "epoch": 3.739406779661017, "grad_norm": 1.734390377998352, "learning_rate": 8.13042902542373e-06, "loss": 1.1164, "mean_token_accuracy": 0.7507789358496666, "num_tokens": 11363312.0, "step": 14120 }, { "epoch": 3.7399364406779663, "grad_norm": 1.981972098350525, "learning_rate": 8.130164194915255e-06, "loss": 1.2278, "mean_token_accuracy": 0.726707324385643, "num_tokens": 11364883.0, "step": 14122 }, { "epoch": 3.7404661016949152, "grad_norm": 1.6536930799484253, "learning_rate": 8.12989936440678e-06, "loss": 1.2849, "mean_token_accuracy": 0.7231127470731735, "num_tokens": 11366583.0, "step": 14124 }, { "epoch": 3.740995762711864, "grad_norm": 1.7279757261276245, "learning_rate": 8.129634533898306e-06, "loss": 1.3693, "mean_token_accuracy": 0.6746850088238716, "num_tokens": 11368458.0, "step": 14126 }, { "epoch": 3.7415254237288136, "grad_norm": 1.9177682399749756, "learning_rate": 8.129369703389831e-06, "loss": 1.4989, "mean_token_accuracy": 0.67258270829916, "num_tokens": 11370064.0, "step": 14128 }, { "epoch": 3.742055084745763, "grad_norm": 2.1173958778381348, "learning_rate": 8.129104872881358e-06, "loss": 1.4085, "mean_token_accuracy": 0.6923639327287674, "num_tokens": 11371594.0, "step": 14130 }, { "epoch": 3.742584745762712, "grad_norm": 1.62494957447052, "learning_rate": 8.128840042372883e-06, "loss": 0.9341, "mean_token_accuracy": 0.7633729465305805, "num_tokens": 11373305.0, "step": 14132 }, { "epoch": 3.743114406779661, "grad_norm": 1.563098430633545, "learning_rate": 8.128575211864408e-06, "loss": 1.2302, "mean_token_accuracy": 0.7205655947327614, "num_tokens": 11374883.0, "step": 14134 }, { "epoch": 3.7436440677966103, "grad_norm": 1.8996922969818115, "learning_rate": 8.128310381355933e-06, "loss": 1.6955, "mean_token_accuracy": 0.6361088305711746, "num_tokens": 11376655.0, "step": 14136 }, { "epoch": 3.7441737288135593, "grad_norm": 1.8835726976394653, "learning_rate": 8.12804555084746e-06, "loss": 1.4549, "mean_token_accuracy": 0.6827311888337135, "num_tokens": 11378071.0, "step": 14138 }, { "epoch": 3.7447033898305087, "grad_norm": 1.7707797288894653, "learning_rate": 8.127780720338984e-06, "loss": 1.2992, "mean_token_accuracy": 0.7163614630699158, "num_tokens": 11379899.0, "step": 14140 }, { "epoch": 3.7452330508474576, "grad_norm": 2.04986834526062, "learning_rate": 8.127515889830509e-06, "loss": 1.343, "mean_token_accuracy": 0.7159637995064259, "num_tokens": 11381250.0, "step": 14142 }, { "epoch": 3.7457627118644066, "grad_norm": 2.228243827819824, "learning_rate": 8.127251059322034e-06, "loss": 1.2072, "mean_token_accuracy": 0.7114025205373764, "num_tokens": 11382670.0, "step": 14144 }, { "epoch": 3.746292372881356, "grad_norm": 1.7461634874343872, "learning_rate": 8.12698622881356e-06, "loss": 1.2792, "mean_token_accuracy": 0.7042569369077682, "num_tokens": 11384161.0, "step": 14146 }, { "epoch": 3.746822033898305, "grad_norm": 2.17866587638855, "learning_rate": 8.126721398305086e-06, "loss": 1.6717, "mean_token_accuracy": 0.6360508501529694, "num_tokens": 11385713.0, "step": 14148 }, { "epoch": 3.7473516949152543, "grad_norm": 2.3286938667297363, "learning_rate": 8.12645656779661e-06, "loss": 1.2082, "mean_token_accuracy": 0.7114166468381882, "num_tokens": 11387426.0, "step": 14150 }, { "epoch": 3.7478813559322033, "grad_norm": 2.184028148651123, "learning_rate": 8.126191737288135e-06, "loss": 1.1318, "mean_token_accuracy": 0.7545998319983482, "num_tokens": 11388765.0, "step": 14152 }, { "epoch": 3.7484110169491527, "grad_norm": 1.6689987182617188, "learning_rate": 8.125926906779662e-06, "loss": 1.0766, "mean_token_accuracy": 0.7469243966042995, "num_tokens": 11390208.0, "step": 14154 }, { "epoch": 3.7489406779661016, "grad_norm": 1.9277745485305786, "learning_rate": 8.125662076271187e-06, "loss": 1.3982, "mean_token_accuracy": 0.679681982845068, "num_tokens": 11391847.0, "step": 14156 }, { "epoch": 3.749470338983051, "grad_norm": 1.851378083229065, "learning_rate": 8.125397245762714e-06, "loss": 1.5143, "mean_token_accuracy": 0.6663612723350525, "num_tokens": 11393654.0, "step": 14158 }, { "epoch": 3.75, "grad_norm": 1.5579086542129517, "learning_rate": 8.125132415254239e-06, "loss": 0.8292, "mean_token_accuracy": 0.7838395535945892, "num_tokens": 11395507.0, "step": 14160 }, { "epoch": 3.750529661016949, "grad_norm": 1.6641000509262085, "learning_rate": 8.124867584745763e-06, "loss": 1.2293, "mean_token_accuracy": 0.713446706533432, "num_tokens": 11397522.0, "step": 14162 }, { "epoch": 3.7510593220338984, "grad_norm": 1.5034682750701904, "learning_rate": 8.124602754237288e-06, "loss": 1.2881, "mean_token_accuracy": 0.7101269140839577, "num_tokens": 11399062.0, "step": 14164 }, { "epoch": 3.7515889830508473, "grad_norm": 2.1190237998962402, "learning_rate": 8.124337923728815e-06, "loss": 1.5012, "mean_token_accuracy": 0.6693136841058731, "num_tokens": 11400886.0, "step": 14166 }, { "epoch": 3.7521186440677967, "grad_norm": 1.6177314519882202, "learning_rate": 8.12407309322034e-06, "loss": 1.409, "mean_token_accuracy": 0.6481879949569702, "num_tokens": 11402783.0, "step": 14168 }, { "epoch": 3.7526483050847457, "grad_norm": 1.8096715211868286, "learning_rate": 8.123808262711865e-06, "loss": 1.2257, "mean_token_accuracy": 0.7128329873085022, "num_tokens": 11404519.0, "step": 14170 }, { "epoch": 3.753177966101695, "grad_norm": 1.8189054727554321, "learning_rate": 8.12354343220339e-06, "loss": 1.1662, "mean_token_accuracy": 0.7089103609323502, "num_tokens": 11406032.0, "step": 14172 }, { "epoch": 3.753707627118644, "grad_norm": 1.9056249856948853, "learning_rate": 8.123278601694916e-06, "loss": 1.8019, "mean_token_accuracy": 0.6309697553515434, "num_tokens": 11407757.0, "step": 14174 }, { "epoch": 3.7542372881355934, "grad_norm": 1.856797695159912, "learning_rate": 8.123013771186441e-06, "loss": 1.0044, "mean_token_accuracy": 0.7431226968765259, "num_tokens": 11409299.0, "step": 14176 }, { "epoch": 3.7547669491525424, "grad_norm": 2.0904250144958496, "learning_rate": 8.122748940677966e-06, "loss": 1.5353, "mean_token_accuracy": 0.6729078069329262, "num_tokens": 11410798.0, "step": 14178 }, { "epoch": 3.7552966101694913, "grad_norm": 2.030421733856201, "learning_rate": 8.122484110169491e-06, "loss": 1.8069, "mean_token_accuracy": 0.6170751824975014, "num_tokens": 11412514.0, "step": 14180 }, { "epoch": 3.7558262711864407, "grad_norm": 1.7954955101013184, "learning_rate": 8.122219279661018e-06, "loss": 1.1667, "mean_token_accuracy": 0.7233403027057648, "num_tokens": 11414139.0, "step": 14182 }, { "epoch": 3.7563559322033897, "grad_norm": 1.7533825635910034, "learning_rate": 8.121954449152543e-06, "loss": 0.7344, "mean_token_accuracy": 0.80617655813694, "num_tokens": 11415740.0, "step": 14184 }, { "epoch": 3.756885593220339, "grad_norm": 1.9100757837295532, "learning_rate": 8.12168961864407e-06, "loss": 1.4202, "mean_token_accuracy": 0.6829820945858955, "num_tokens": 11417207.0, "step": 14186 }, { "epoch": 3.757415254237288, "grad_norm": 1.6588075160980225, "learning_rate": 8.121424788135593e-06, "loss": 0.8758, "mean_token_accuracy": 0.7806014269590378, "num_tokens": 11418749.0, "step": 14188 }, { "epoch": 3.757944915254237, "grad_norm": 1.8583019971847534, "learning_rate": 8.12115995762712e-06, "loss": 1.1785, "mean_token_accuracy": 0.7141248136758804, "num_tokens": 11420420.0, "step": 14190 }, { "epoch": 3.7584745762711864, "grad_norm": 2.1539273262023926, "learning_rate": 8.120895127118644e-06, "loss": 1.712, "mean_token_accuracy": 0.6202712878584862, "num_tokens": 11422119.0, "step": 14192 }, { "epoch": 3.759004237288136, "grad_norm": 2.448277235031128, "learning_rate": 8.12063029661017e-06, "loss": 1.0564, "mean_token_accuracy": 0.7427459582686424, "num_tokens": 11423614.0, "step": 14194 }, { "epoch": 3.7595338983050848, "grad_norm": 1.795280933380127, "learning_rate": 8.120365466101696e-06, "loss": 1.5633, "mean_token_accuracy": 0.651916079223156, "num_tokens": 11425279.0, "step": 14196 }, { "epoch": 3.7600635593220337, "grad_norm": 2.2795348167419434, "learning_rate": 8.12010063559322e-06, "loss": 1.5032, "mean_token_accuracy": 0.6709459722042084, "num_tokens": 11426738.0, "step": 14198 }, { "epoch": 3.760593220338983, "grad_norm": 2.002256155014038, "learning_rate": 8.119835805084746e-06, "loss": 1.1901, "mean_token_accuracy": 0.7300201207399368, "num_tokens": 11428203.0, "step": 14200 }, { "epoch": 3.761122881355932, "grad_norm": 1.9031707048416138, "learning_rate": 8.119570974576272e-06, "loss": 1.5257, "mean_token_accuracy": 0.6918544545769691, "num_tokens": 11429883.0, "step": 14202 }, { "epoch": 3.7616525423728815, "grad_norm": 1.4169349670410156, "learning_rate": 8.119306144067797e-06, "loss": 0.9795, "mean_token_accuracy": 0.7538431286811829, "num_tokens": 11431660.0, "step": 14204 }, { "epoch": 3.7621822033898304, "grad_norm": 1.9547044038772583, "learning_rate": 8.119041313559322e-06, "loss": 1.313, "mean_token_accuracy": 0.700551949441433, "num_tokens": 11433266.0, "step": 14206 }, { "epoch": 3.7627118644067794, "grad_norm": 1.523824691772461, "learning_rate": 8.118776483050849e-06, "loss": 0.9765, "mean_token_accuracy": 0.7774168998003006, "num_tokens": 11434734.0, "step": 14208 }, { "epoch": 3.763241525423729, "grad_norm": 1.6889716386795044, "learning_rate": 8.118511652542374e-06, "loss": 1.4745, "mean_token_accuracy": 0.6559757813811302, "num_tokens": 11436371.0, "step": 14210 }, { "epoch": 3.763771186440678, "grad_norm": 1.8191221952438354, "learning_rate": 8.1182468220339e-06, "loss": 1.1921, "mean_token_accuracy": 0.7307455018162727, "num_tokens": 11437993.0, "step": 14212 }, { "epoch": 3.764300847457627, "grad_norm": 1.6502667665481567, "learning_rate": 8.117981991525425e-06, "loss": 1.165, "mean_token_accuracy": 0.7299369424581528, "num_tokens": 11439673.0, "step": 14214 }, { "epoch": 3.764830508474576, "grad_norm": 1.9470274448394775, "learning_rate": 8.11771716101695e-06, "loss": 1.3275, "mean_token_accuracy": 0.7041963934898376, "num_tokens": 11441134.0, "step": 14216 }, { "epoch": 3.7653601694915255, "grad_norm": 1.7049206495285034, "learning_rate": 8.117452330508475e-06, "loss": 1.2016, "mean_token_accuracy": 0.7099290564656258, "num_tokens": 11442895.0, "step": 14218 }, { "epoch": 3.7658898305084745, "grad_norm": 1.5562388896942139, "learning_rate": 8.117187500000002e-06, "loss": 1.1333, "mean_token_accuracy": 0.7369493991136551, "num_tokens": 11444504.0, "step": 14220 }, { "epoch": 3.766419491525424, "grad_norm": 1.999078631401062, "learning_rate": 8.116922669491527e-06, "loss": 1.2654, "mean_token_accuracy": 0.7095938436686993, "num_tokens": 11446014.0, "step": 14222 }, { "epoch": 3.766949152542373, "grad_norm": 2.0585036277770996, "learning_rate": 8.116657838983051e-06, "loss": 1.3152, "mean_token_accuracy": 0.6834045350551605, "num_tokens": 11447477.0, "step": 14224 }, { "epoch": 3.767478813559322, "grad_norm": 1.972971796989441, "learning_rate": 8.116393008474576e-06, "loss": 1.1692, "mean_token_accuracy": 0.7365547567605972, "num_tokens": 11449147.0, "step": 14226 }, { "epoch": 3.768008474576271, "grad_norm": 2.9823360443115234, "learning_rate": 8.116128177966103e-06, "loss": 1.4397, "mean_token_accuracy": 0.6967004984617233, "num_tokens": 11450580.0, "step": 14228 }, { "epoch": 3.7685381355932206, "grad_norm": 1.8027461767196655, "learning_rate": 8.115863347457628e-06, "loss": 1.3257, "mean_token_accuracy": 0.6869713887572289, "num_tokens": 11452044.0, "step": 14230 }, { "epoch": 3.7690677966101696, "grad_norm": 2.137023687362671, "learning_rate": 8.115598516949153e-06, "loss": 1.5715, "mean_token_accuracy": 0.6886082887649536, "num_tokens": 11453498.0, "step": 14232 }, { "epoch": 3.7695974576271185, "grad_norm": 1.9094867706298828, "learning_rate": 8.115333686440678e-06, "loss": 1.3635, "mean_token_accuracy": 0.6891190111637115, "num_tokens": 11454961.0, "step": 14234 }, { "epoch": 3.770127118644068, "grad_norm": 1.9306073188781738, "learning_rate": 8.115068855932204e-06, "loss": 1.0351, "mean_token_accuracy": 0.7408703491091728, "num_tokens": 11456899.0, "step": 14236 }, { "epoch": 3.770656779661017, "grad_norm": 1.8281441926956177, "learning_rate": 8.11480402542373e-06, "loss": 1.1994, "mean_token_accuracy": 0.7128181606531143, "num_tokens": 11458419.0, "step": 14238 }, { "epoch": 3.7711864406779663, "grad_norm": 2.2483019828796387, "learning_rate": 8.114539194915256e-06, "loss": 1.4079, "mean_token_accuracy": 0.6654175892472267, "num_tokens": 11459911.0, "step": 14240 }, { "epoch": 3.7717161016949152, "grad_norm": 1.697189450263977, "learning_rate": 8.11427436440678e-06, "loss": 1.1671, "mean_token_accuracy": 0.7358541935682297, "num_tokens": 11461439.0, "step": 14242 }, { "epoch": 3.772245762711864, "grad_norm": 2.010901689529419, "learning_rate": 8.114009533898306e-06, "loss": 1.5516, "mean_token_accuracy": 0.6593149155378342, "num_tokens": 11462930.0, "step": 14244 }, { "epoch": 3.7727754237288136, "grad_norm": 1.711814045906067, "learning_rate": 8.11374470338983e-06, "loss": 1.6702, "mean_token_accuracy": 0.6383440494537354, "num_tokens": 11464820.0, "step": 14246 }, { "epoch": 3.773305084745763, "grad_norm": 1.638912320137024, "learning_rate": 8.113479872881357e-06, "loss": 1.23, "mean_token_accuracy": 0.7031463012099266, "num_tokens": 11466471.0, "step": 14248 }, { "epoch": 3.773834745762712, "grad_norm": 2.0448315143585205, "learning_rate": 8.113215042372882e-06, "loss": 1.2931, "step": 14250 }, { "epoch": 3.773834745762712, "eval_loss": 1.3087397813796997, "eval_mean_token_accuracy": 0.7009990086609666, "eval_num_tokens": 11468330.0, "eval_runtime": 48.1135, "eval_samples_per_second": 6.402, "eval_steps_per_second": 6.402, "step": 14250 }, { "epoch": 3.774364406779661, "grad_norm": 2.2071967124938965, "learning_rate": 8.112950211864407e-06, "loss": 1.0264, "mean_token_accuracy": 0.7256781980395317, "num_tokens": 11469787.0, "step": 14252 }, { "epoch": 3.7748940677966103, "grad_norm": 1.8514372110366821, "learning_rate": 8.112685381355932e-06, "loss": 1.6321, "mean_token_accuracy": 0.6424567103385925, "num_tokens": 11471500.0, "step": 14254 }, { "epoch": 3.7754237288135593, "grad_norm": 1.564530611038208, "learning_rate": 8.112420550847459e-06, "loss": 1.2965, "mean_token_accuracy": 0.6909999251365662, "num_tokens": 11473239.0, "step": 14256 }, { "epoch": 3.7759533898305087, "grad_norm": 1.7653224468231201, "learning_rate": 8.112155720338984e-06, "loss": 1.3813, "mean_token_accuracy": 0.6761448308825493, "num_tokens": 11474798.0, "step": 14258 }, { "epoch": 3.7764830508474576, "grad_norm": 1.9653390645980835, "learning_rate": 8.111890889830509e-06, "loss": 1.3574, "mean_token_accuracy": 0.6898065060377121, "num_tokens": 11476356.0, "step": 14260 }, { "epoch": 3.7770127118644066, "grad_norm": 1.5457948446273804, "learning_rate": 8.111626059322034e-06, "loss": 1.0734, "mean_token_accuracy": 0.7420172020792961, "num_tokens": 11477882.0, "step": 14262 }, { "epoch": 3.777542372881356, "grad_norm": 1.8891208171844482, "learning_rate": 8.11136122881356e-06, "loss": 1.356, "mean_token_accuracy": 0.7044400051236153, "num_tokens": 11479533.0, "step": 14264 }, { "epoch": 3.778072033898305, "grad_norm": 2.090482234954834, "learning_rate": 8.111096398305085e-06, "loss": 1.3963, "mean_token_accuracy": 0.6893516480922699, "num_tokens": 11481112.0, "step": 14266 }, { "epoch": 3.7786016949152543, "grad_norm": 2.037759304046631, "learning_rate": 8.110831567796612e-06, "loss": 1.1034, "mean_token_accuracy": 0.7511121556162834, "num_tokens": 11482484.0, "step": 14268 }, { "epoch": 3.7791313559322033, "grad_norm": 1.9032472372055054, "learning_rate": 8.110566737288135e-06, "loss": 1.056, "mean_token_accuracy": 0.7453174665570259, "num_tokens": 11483905.0, "step": 14270 }, { "epoch": 3.7796610169491527, "grad_norm": 2.0490951538085938, "learning_rate": 8.110301906779662e-06, "loss": 1.4526, "mean_token_accuracy": 0.6796559393405914, "num_tokens": 11485521.0, "step": 14272 }, { "epoch": 3.7801906779661016, "grad_norm": 2.2315375804901123, "learning_rate": 8.110037076271187e-06, "loss": 1.2307, "mean_token_accuracy": 0.7075728997588158, "num_tokens": 11486805.0, "step": 14274 }, { "epoch": 3.780720338983051, "grad_norm": 1.6775085926055908, "learning_rate": 8.109772245762713e-06, "loss": 0.968, "mean_token_accuracy": 0.7767262309789658, "num_tokens": 11488544.0, "step": 14276 }, { "epoch": 3.78125, "grad_norm": 1.4275093078613281, "learning_rate": 8.109507415254238e-06, "loss": 1.0658, "mean_token_accuracy": 0.7516390830278397, "num_tokens": 11490062.0, "step": 14278 }, { "epoch": 3.781779661016949, "grad_norm": 1.9757311344146729, "learning_rate": 8.109242584745763e-06, "loss": 1.2561, "mean_token_accuracy": 0.7124802768230438, "num_tokens": 11491750.0, "step": 14280 }, { "epoch": 3.7823093220338984, "grad_norm": 1.794775128364563, "learning_rate": 8.108977754237288e-06, "loss": 1.2137, "mean_token_accuracy": 0.7323976680636406, "num_tokens": 11493514.0, "step": 14282 }, { "epoch": 3.7828389830508473, "grad_norm": 2.1767501831054688, "learning_rate": 8.108712923728815e-06, "loss": 1.2447, "mean_token_accuracy": 0.7273597121238708, "num_tokens": 11495097.0, "step": 14284 }, { "epoch": 3.7833686440677967, "grad_norm": 1.6280359029769897, "learning_rate": 8.10844809322034e-06, "loss": 1.2016, "mean_token_accuracy": 0.7256976291537285, "num_tokens": 11496608.0, "step": 14286 }, { "epoch": 3.7838983050847457, "grad_norm": 1.8530772924423218, "learning_rate": 8.108183262711864e-06, "loss": 1.4357, "mean_token_accuracy": 0.6736861318349838, "num_tokens": 11498261.0, "step": 14288 }, { "epoch": 3.784427966101695, "grad_norm": 1.4160029888153076, "learning_rate": 8.107918432203391e-06, "loss": 1.2606, "mean_token_accuracy": 0.6915499903261662, "num_tokens": 11499984.0, "step": 14290 }, { "epoch": 3.784957627118644, "grad_norm": 1.7262314558029175, "learning_rate": 8.107653601694916e-06, "loss": 0.9163, "mean_token_accuracy": 0.7378208264708519, "num_tokens": 11501886.0, "step": 14292 }, { "epoch": 3.7854872881355934, "grad_norm": 1.6434775590896606, "learning_rate": 8.107388771186443e-06, "loss": 1.261, "mean_token_accuracy": 0.6951767802238464, "num_tokens": 11503523.0, "step": 14294 }, { "epoch": 3.7860169491525424, "grad_norm": 2.1522374153137207, "learning_rate": 8.107123940677966e-06, "loss": 0.8592, "mean_token_accuracy": 0.7843135446310043, "num_tokens": 11504582.0, "step": 14296 }, { "epoch": 3.7865466101694913, "grad_norm": 1.701609492301941, "learning_rate": 8.106859110169493e-06, "loss": 1.2877, "mean_token_accuracy": 0.7022714242339134, "num_tokens": 11506446.0, "step": 14298 }, { "epoch": 3.7870762711864407, "grad_norm": 1.8909387588500977, "learning_rate": 8.106594279661017e-06, "loss": 1.3586, "mean_token_accuracy": 0.6862394288182259, "num_tokens": 11507981.0, "step": 14300 }, { "epoch": 3.7876059322033897, "grad_norm": 1.531800627708435, "learning_rate": 8.106329449152544e-06, "loss": 1.257, "mean_token_accuracy": 0.7080120965838432, "num_tokens": 11509542.0, "step": 14302 }, { "epoch": 3.788135593220339, "grad_norm": 1.7491211891174316, "learning_rate": 8.106064618644069e-06, "loss": 1.117, "mean_token_accuracy": 0.7597605958580971, "num_tokens": 11510891.0, "step": 14304 }, { "epoch": 3.788665254237288, "grad_norm": 1.8143688440322876, "learning_rate": 8.105799788135594e-06, "loss": 1.1901, "mean_token_accuracy": 0.7327852621674538, "num_tokens": 11512437.0, "step": 14306 }, { "epoch": 3.789194915254237, "grad_norm": 2.0626485347747803, "learning_rate": 8.105534957627119e-06, "loss": 0.9357, "mean_token_accuracy": 0.7657181173563004, "num_tokens": 11513761.0, "step": 14308 }, { "epoch": 3.7897245762711864, "grad_norm": 2.1753170490264893, "learning_rate": 8.105270127118645e-06, "loss": 1.4508, "mean_token_accuracy": 0.6845780313014984, "num_tokens": 11515636.0, "step": 14310 }, { "epoch": 3.790254237288136, "grad_norm": 1.6937470436096191, "learning_rate": 8.10500529661017e-06, "loss": 0.819, "mean_token_accuracy": 0.7800819724798203, "num_tokens": 11517304.0, "step": 14312 }, { "epoch": 3.7907838983050848, "grad_norm": 2.198758363723755, "learning_rate": 8.104740466101695e-06, "loss": 1.2296, "mean_token_accuracy": 0.7124087661504745, "num_tokens": 11518865.0, "step": 14314 }, { "epoch": 3.7913135593220337, "grad_norm": 1.9174065589904785, "learning_rate": 8.10447563559322e-06, "loss": 1.213, "mean_token_accuracy": 0.7145288661122322, "num_tokens": 11520506.0, "step": 14316 }, { "epoch": 3.791843220338983, "grad_norm": 1.991545557975769, "learning_rate": 8.104210805084747e-06, "loss": 1.2672, "mean_token_accuracy": 0.7177645117044449, "num_tokens": 11522043.0, "step": 14318 }, { "epoch": 3.792372881355932, "grad_norm": 1.9189292192459106, "learning_rate": 8.103945974576272e-06, "loss": 1.2308, "mean_token_accuracy": 0.715839609503746, "num_tokens": 11523430.0, "step": 14320 }, { "epoch": 3.7929025423728815, "grad_norm": 1.7965294122695923, "learning_rate": 8.103681144067798e-06, "loss": 1.6308, "mean_token_accuracy": 0.654453169554472, "num_tokens": 11525081.0, "step": 14322 }, { "epoch": 3.7934322033898304, "grad_norm": 1.819015383720398, "learning_rate": 8.103416313559322e-06, "loss": 1.0439, "mean_token_accuracy": 0.7187881916761398, "num_tokens": 11526581.0, "step": 14324 }, { "epoch": 3.7939618644067794, "grad_norm": 1.163923740386963, "learning_rate": 8.103151483050848e-06, "loss": 0.9816, "mean_token_accuracy": 0.7516388967633247, "num_tokens": 11529007.0, "step": 14326 }, { "epoch": 3.794491525423729, "grad_norm": 1.3320670127868652, "learning_rate": 8.102886652542373e-06, "loss": 0.9227, "mean_token_accuracy": 0.7506529837846756, "num_tokens": 11530879.0, "step": 14328 }, { "epoch": 3.795021186440678, "grad_norm": 1.8240036964416504, "learning_rate": 8.1026218220339e-06, "loss": 1.002, "mean_token_accuracy": 0.7725142166018486, "num_tokens": 11532397.0, "step": 14330 }, { "epoch": 3.795550847457627, "grad_norm": 1.2174023389816284, "learning_rate": 8.102356991525425e-06, "loss": 0.8644, "mean_token_accuracy": 0.7777683958411217, "num_tokens": 11534092.0, "step": 14332 }, { "epoch": 3.796080508474576, "grad_norm": 2.026390790939331, "learning_rate": 8.10209216101695e-06, "loss": 1.0875, "mean_token_accuracy": 0.7574067749083042, "num_tokens": 11535833.0, "step": 14334 }, { "epoch": 3.7966101694915255, "grad_norm": 1.7558088302612305, "learning_rate": 8.101827330508475e-06, "loss": 1.3908, "mean_token_accuracy": 0.6794644445180893, "num_tokens": 11537428.0, "step": 14336 }, { "epoch": 3.7971398305084745, "grad_norm": 1.5616363286972046, "learning_rate": 8.101562500000001e-06, "loss": 1.232, "mean_token_accuracy": 0.6948644518852234, "num_tokens": 11539307.0, "step": 14338 }, { "epoch": 3.797669491525424, "grad_norm": 2.4872756004333496, "learning_rate": 8.101297669491526e-06, "loss": 1.3523, "mean_token_accuracy": 0.6930551007390022, "num_tokens": 11540572.0, "step": 14340 }, { "epoch": 3.798199152542373, "grad_norm": 1.6467671394348145, "learning_rate": 8.101032838983051e-06, "loss": 1.2437, "mean_token_accuracy": 0.690352775156498, "num_tokens": 11542241.0, "step": 14342 }, { "epoch": 3.798728813559322, "grad_norm": 1.810976266860962, "learning_rate": 8.100768008474576e-06, "loss": 0.8408, "mean_token_accuracy": 0.7817525044083595, "num_tokens": 11543962.0, "step": 14344 }, { "epoch": 3.799258474576271, "grad_norm": 2.2139692306518555, "learning_rate": 8.100503177966103e-06, "loss": 1.4919, "mean_token_accuracy": 0.6744289249181747, "num_tokens": 11545605.0, "step": 14346 }, { "epoch": 3.7997881355932206, "grad_norm": 1.934326171875, "learning_rate": 8.100238347457628e-06, "loss": 1.2831, "mean_token_accuracy": 0.7097586393356323, "num_tokens": 11547315.0, "step": 14348 }, { "epoch": 3.8003177966101696, "grad_norm": 1.7269952297210693, "learning_rate": 8.099973516949153e-06, "loss": 1.1066, "mean_token_accuracy": 0.7572467029094696, "num_tokens": 11548751.0, "step": 14350 }, { "epoch": 3.8008474576271185, "grad_norm": 1.8601313829421997, "learning_rate": 8.099708686440677e-06, "loss": 0.944, "mean_token_accuracy": 0.7687856554985046, "num_tokens": 11550210.0, "step": 14352 }, { "epoch": 3.801377118644068, "grad_norm": 1.6174521446228027, "learning_rate": 8.099443855932204e-06, "loss": 0.9491, "mean_token_accuracy": 0.786673441529274, "num_tokens": 11551562.0, "step": 14354 }, { "epoch": 3.801906779661017, "grad_norm": 2.4982540607452393, "learning_rate": 8.099179025423729e-06, "loss": 1.4023, "mean_token_accuracy": 0.6575939729809761, "num_tokens": 11552986.0, "step": 14356 }, { "epoch": 3.8024364406779663, "grad_norm": 1.948249101638794, "learning_rate": 8.098914194915256e-06, "loss": 1.3529, "mean_token_accuracy": 0.6934029832482338, "num_tokens": 11554507.0, "step": 14358 }, { "epoch": 3.8029661016949152, "grad_norm": 2.161109685897827, "learning_rate": 8.09864936440678e-06, "loss": 1.6478, "mean_token_accuracy": 0.6487627699971199, "num_tokens": 11556023.0, "step": 14360 }, { "epoch": 3.803495762711864, "grad_norm": 1.8884594440460205, "learning_rate": 8.098384533898305e-06, "loss": 1.3423, "mean_token_accuracy": 0.6888958886265755, "num_tokens": 11557562.0, "step": 14362 }, { "epoch": 3.8040254237288136, "grad_norm": 1.882076621055603, "learning_rate": 8.09811970338983e-06, "loss": 1.0401, "mean_token_accuracy": 0.7605510577559471, "num_tokens": 11559080.0, "step": 14364 }, { "epoch": 3.804555084745763, "grad_norm": 2.0664827823638916, "learning_rate": 8.097854872881357e-06, "loss": 1.312, "mean_token_accuracy": 0.7126627787947655, "num_tokens": 11560513.0, "step": 14366 }, { "epoch": 3.805084745762712, "grad_norm": 1.7099701166152954, "learning_rate": 8.097590042372882e-06, "loss": 1.2088, "mean_token_accuracy": 0.7011084705591202, "num_tokens": 11562431.0, "step": 14368 }, { "epoch": 3.805614406779661, "grad_norm": 2.0968472957611084, "learning_rate": 8.097325211864407e-06, "loss": 1.304, "mean_token_accuracy": 0.7240604534745216, "num_tokens": 11563818.0, "step": 14370 }, { "epoch": 3.8061440677966103, "grad_norm": 1.9474023580551147, "learning_rate": 8.097060381355934e-06, "loss": 1.1085, "mean_token_accuracy": 0.7384044677019119, "num_tokens": 11565417.0, "step": 14372 }, { "epoch": 3.8066737288135593, "grad_norm": 1.7346844673156738, "learning_rate": 8.096795550847458e-06, "loss": 1.2759, "mean_token_accuracy": 0.7458688095211983, "num_tokens": 11567058.0, "step": 14374 }, { "epoch": 3.8072033898305087, "grad_norm": 1.539452314376831, "learning_rate": 8.096530720338985e-06, "loss": 1.2466, "mean_token_accuracy": 0.696690671145916, "num_tokens": 11568715.0, "step": 14376 }, { "epoch": 3.8077330508474576, "grad_norm": 1.823258399963379, "learning_rate": 8.096265889830508e-06, "loss": 1.21, "mean_token_accuracy": 0.7200365141034126, "num_tokens": 11570382.0, "step": 14378 }, { "epoch": 3.8082627118644066, "grad_norm": 2.047290802001953, "learning_rate": 8.096001059322035e-06, "loss": 1.2601, "mean_token_accuracy": 0.7093233019113541, "num_tokens": 11571885.0, "step": 14380 }, { "epoch": 3.808792372881356, "grad_norm": 1.7108044624328613, "learning_rate": 8.09573622881356e-06, "loss": 1.054, "mean_token_accuracy": 0.7427065819501877, "num_tokens": 11573494.0, "step": 14382 }, { "epoch": 3.809322033898305, "grad_norm": 2.0398330688476562, "learning_rate": 8.095471398305086e-06, "loss": 1.5282, "mean_token_accuracy": 0.6680561937391758, "num_tokens": 11574984.0, "step": 14384 }, { "epoch": 3.8098516949152543, "grad_norm": 1.9360311031341553, "learning_rate": 8.095206567796611e-06, "loss": 1.0399, "mean_token_accuracy": 0.7487727329134941, "num_tokens": 11576531.0, "step": 14386 }, { "epoch": 3.8103813559322033, "grad_norm": 2.394202947616577, "learning_rate": 8.094941737288136e-06, "loss": 1.4462, "mean_token_accuracy": 0.7039087563753128, "num_tokens": 11577760.0, "step": 14388 }, { "epoch": 3.8109110169491527, "grad_norm": 1.9565662145614624, "learning_rate": 8.094676906779661e-06, "loss": 1.2237, "mean_token_accuracy": 0.7152392566204071, "num_tokens": 11579265.0, "step": 14390 }, { "epoch": 3.8114406779661016, "grad_norm": 1.6145408153533936, "learning_rate": 8.094412076271188e-06, "loss": 1.242, "mean_token_accuracy": 0.7159179300069809, "num_tokens": 11581118.0, "step": 14392 }, { "epoch": 3.811970338983051, "grad_norm": 1.8247607946395874, "learning_rate": 8.094147245762713e-06, "loss": 1.2136, "mean_token_accuracy": 0.7195085138082504, "num_tokens": 11582627.0, "step": 14394 }, { "epoch": 3.8125, "grad_norm": 2.2066140174865723, "learning_rate": 8.093882415254238e-06, "loss": 1.438, "mean_token_accuracy": 0.6711310558021069, "num_tokens": 11584072.0, "step": 14396 }, { "epoch": 3.813029661016949, "grad_norm": 1.9118809700012207, "learning_rate": 8.093617584745763e-06, "loss": 1.5309, "mean_token_accuracy": 0.6645286679267883, "num_tokens": 11585704.0, "step": 14398 }, { "epoch": 3.8135593220338984, "grad_norm": 2.0044631958007812, "learning_rate": 8.09335275423729e-06, "loss": 1.4918, "mean_token_accuracy": 0.669601134955883, "num_tokens": 11587263.0, "step": 14400 }, { "epoch": 3.8140889830508473, "grad_norm": 2.3433353900909424, "learning_rate": 8.093087923728814e-06, "loss": 1.309, "mean_token_accuracy": 0.7158899381756783, "num_tokens": 11588519.0, "step": 14402 }, { "epoch": 3.8146186440677967, "grad_norm": 1.7171714305877686, "learning_rate": 8.09282309322034e-06, "loss": 1.3479, "mean_token_accuracy": 0.6849892027676105, "num_tokens": 11590214.0, "step": 14404 }, { "epoch": 3.8151483050847457, "grad_norm": 1.501839518547058, "learning_rate": 8.092558262711864e-06, "loss": 1.0478, "mean_token_accuracy": 0.7403182908892632, "num_tokens": 11591688.0, "step": 14406 }, { "epoch": 3.815677966101695, "grad_norm": 1.8103116750717163, "learning_rate": 8.09229343220339e-06, "loss": 1.0609, "mean_token_accuracy": 0.7688665017485619, "num_tokens": 11593127.0, "step": 14408 }, { "epoch": 3.816207627118644, "grad_norm": 1.641318917274475, "learning_rate": 8.092028601694916e-06, "loss": 1.2098, "mean_token_accuracy": 0.7031385004520416, "num_tokens": 11594804.0, "step": 14410 }, { "epoch": 3.8167372881355934, "grad_norm": 1.8258250951766968, "learning_rate": 8.091763771186442e-06, "loss": 1.119, "mean_token_accuracy": 0.7265696972608566, "num_tokens": 11596326.0, "step": 14412 }, { "epoch": 3.8172669491525424, "grad_norm": 2.2007153034210205, "learning_rate": 8.091498940677967e-06, "loss": 1.5323, "mean_token_accuracy": 0.6551006063818932, "num_tokens": 11598159.0, "step": 14414 }, { "epoch": 3.8177966101694913, "grad_norm": 2.18500018119812, "learning_rate": 8.091234110169492e-06, "loss": 1.4493, "mean_token_accuracy": 0.6580800637602806, "num_tokens": 11599588.0, "step": 14416 }, { "epoch": 3.8183262711864407, "grad_norm": 2.2682714462280273, "learning_rate": 8.090969279661017e-06, "loss": 1.1344, "mean_token_accuracy": 0.7514955773949623, "num_tokens": 11601012.0, "step": 14418 }, { "epoch": 3.8188559322033897, "grad_norm": 1.755657434463501, "learning_rate": 8.090704449152544e-06, "loss": 1.3744, "mean_token_accuracy": 0.6825566440820694, "num_tokens": 11602467.0, "step": 14420 }, { "epoch": 3.819385593220339, "grad_norm": 1.9175763130187988, "learning_rate": 8.090439618644069e-06, "loss": 1.3297, "mean_token_accuracy": 0.6790129542350769, "num_tokens": 11603826.0, "step": 14422 }, { "epoch": 3.819915254237288, "grad_norm": 1.5410741567611694, "learning_rate": 8.090174788135594e-06, "loss": 1.1749, "mean_token_accuracy": 0.7303770184516907, "num_tokens": 11605451.0, "step": 14424 }, { "epoch": 3.820444915254237, "grad_norm": 1.542555332183838, "learning_rate": 8.089909957627118e-06, "loss": 1.4215, "mean_token_accuracy": 0.6833653748035431, "num_tokens": 11607158.0, "step": 14426 }, { "epoch": 3.8209745762711864, "grad_norm": 1.947097897529602, "learning_rate": 8.089645127118645e-06, "loss": 1.305, "mean_token_accuracy": 0.6828466728329659, "num_tokens": 11608759.0, "step": 14428 }, { "epoch": 3.821504237288136, "grad_norm": 1.8834148645401, "learning_rate": 8.08938029661017e-06, "loss": 1.3232, "mean_token_accuracy": 0.6762760430574417, "num_tokens": 11610366.0, "step": 14430 }, { "epoch": 3.8220338983050848, "grad_norm": 1.4608923196792603, "learning_rate": 8.089115466101695e-06, "loss": 0.8566, "mean_token_accuracy": 0.777878426015377, "num_tokens": 11611784.0, "step": 14432 }, { "epoch": 3.8225635593220337, "grad_norm": 1.5974652767181396, "learning_rate": 8.08885063559322e-06, "loss": 1.2901, "mean_token_accuracy": 0.6977185308933258, "num_tokens": 11613521.0, "step": 14434 }, { "epoch": 3.823093220338983, "grad_norm": 1.853611946105957, "learning_rate": 8.088585805084747e-06, "loss": 1.0756, "mean_token_accuracy": 0.7410148903727531, "num_tokens": 11614974.0, "step": 14436 }, { "epoch": 3.823622881355932, "grad_norm": 1.8206278085708618, "learning_rate": 8.088320974576271e-06, "loss": 1.0542, "mean_token_accuracy": 0.7544820457696915, "num_tokens": 11616311.0, "step": 14438 }, { "epoch": 3.8241525423728815, "grad_norm": 1.6195472478866577, "learning_rate": 8.088056144067798e-06, "loss": 0.7733, "mean_token_accuracy": 0.8096080496907234, "num_tokens": 11617831.0, "step": 14440 }, { "epoch": 3.8246822033898304, "grad_norm": 1.8377611637115479, "learning_rate": 8.087791313559323e-06, "loss": 1.2412, "mean_token_accuracy": 0.718765377998352, "num_tokens": 11619300.0, "step": 14442 }, { "epoch": 3.8252118644067794, "grad_norm": 1.7971347570419312, "learning_rate": 8.087526483050848e-06, "loss": 1.1132, "mean_token_accuracy": 0.7272996157407761, "num_tokens": 11621325.0, "step": 14444 }, { "epoch": 3.825741525423729, "grad_norm": 1.6393014192581177, "learning_rate": 8.087261652542373e-06, "loss": 1.1884, "mean_token_accuracy": 0.733983650803566, "num_tokens": 11623084.0, "step": 14446 }, { "epoch": 3.826271186440678, "grad_norm": 1.9955793619155884, "learning_rate": 8.0869968220339e-06, "loss": 1.5553, "mean_token_accuracy": 0.6493769362568855, "num_tokens": 11624772.0, "step": 14448 }, { "epoch": 3.826800847457627, "grad_norm": 1.7906712293624878, "learning_rate": 8.086731991525424e-06, "loss": 1.1443, "mean_token_accuracy": 0.7204931974411011, "num_tokens": 11626486.0, "step": 14450 }, { "epoch": 3.827330508474576, "grad_norm": 1.884734034538269, "learning_rate": 8.08646716101695e-06, "loss": 1.3879, "mean_token_accuracy": 0.6869712993502617, "num_tokens": 11627957.0, "step": 14452 }, { "epoch": 3.8278601694915255, "grad_norm": 2.189744472503662, "learning_rate": 8.086202330508474e-06, "loss": 1.1948, "mean_token_accuracy": 0.7273742109537125, "num_tokens": 11629428.0, "step": 14454 }, { "epoch": 3.8283898305084745, "grad_norm": 2.1427135467529297, "learning_rate": 8.085937500000001e-06, "loss": 1.7996, "mean_token_accuracy": 0.6367266923189163, "num_tokens": 11631040.0, "step": 14456 }, { "epoch": 3.828919491525424, "grad_norm": 1.8866287469863892, "learning_rate": 8.085672669491526e-06, "loss": 1.0542, "mean_token_accuracy": 0.7769000753760338, "num_tokens": 11632831.0, "step": 14458 }, { "epoch": 3.829449152542373, "grad_norm": 1.685846209526062, "learning_rate": 8.08540783898305e-06, "loss": 1.1989, "mean_token_accuracy": 0.7143569886684418, "num_tokens": 11634497.0, "step": 14460 }, { "epoch": 3.829978813559322, "grad_norm": 1.5645791292190552, "learning_rate": 8.085143008474577e-06, "loss": 1.1083, "mean_token_accuracy": 0.7426902204751968, "num_tokens": 11635987.0, "step": 14462 }, { "epoch": 3.830508474576271, "grad_norm": 2.395331621170044, "learning_rate": 8.084878177966102e-06, "loss": 1.0432, "mean_token_accuracy": 0.751257985830307, "num_tokens": 11637108.0, "step": 14464 }, { "epoch": 3.8310381355932206, "grad_norm": 1.8433841466903687, "learning_rate": 8.084613347457629e-06, "loss": 1.0887, "mean_token_accuracy": 0.75836481153965, "num_tokens": 11639205.0, "step": 14466 }, { "epoch": 3.8315677966101696, "grad_norm": 2.1269962787628174, "learning_rate": 8.084348516949154e-06, "loss": 1.1337, "mean_token_accuracy": 0.7493929713964462, "num_tokens": 11640633.0, "step": 14468 }, { "epoch": 3.8320974576271185, "grad_norm": 1.875454306602478, "learning_rate": 8.084083686440679e-06, "loss": 0.7953, "mean_token_accuracy": 0.7987955957651138, "num_tokens": 11642023.0, "step": 14470 }, { "epoch": 3.832627118644068, "grad_norm": 2.2396183013916016, "learning_rate": 8.083818855932204e-06, "loss": 1.2915, "mean_token_accuracy": 0.7016556710004807, "num_tokens": 11643699.0, "step": 14472 }, { "epoch": 3.833156779661017, "grad_norm": 2.018181800842285, "learning_rate": 8.08355402542373e-06, "loss": 1.1596, "mean_token_accuracy": 0.7086058557033539, "num_tokens": 11645161.0, "step": 14474 }, { "epoch": 3.8336864406779663, "grad_norm": 1.8236647844314575, "learning_rate": 8.083289194915255e-06, "loss": 1.2238, "mean_token_accuracy": 0.7311951890587807, "num_tokens": 11646577.0, "step": 14476 }, { "epoch": 3.8342161016949152, "grad_norm": 2.1375679969787598, "learning_rate": 8.08302436440678e-06, "loss": 1.1761, "mean_token_accuracy": 0.725781761109829, "num_tokens": 11648293.0, "step": 14478 }, { "epoch": 3.834745762711864, "grad_norm": 1.7246884107589722, "learning_rate": 8.082759533898305e-06, "loss": 0.8975, "mean_token_accuracy": 0.7626274302601814, "num_tokens": 11650042.0, "step": 14480 }, { "epoch": 3.8352754237288136, "grad_norm": 1.8970797061920166, "learning_rate": 8.082494703389832e-06, "loss": 1.5366, "mean_token_accuracy": 0.6550568714737892, "num_tokens": 11651646.0, "step": 14482 }, { "epoch": 3.835805084745763, "grad_norm": 1.7562940120697021, "learning_rate": 8.082229872881357e-06, "loss": 1.014, "mean_token_accuracy": 0.7652723342180252, "num_tokens": 11653255.0, "step": 14484 }, { "epoch": 3.836334745762712, "grad_norm": 1.7338250875473022, "learning_rate": 8.081965042372882e-06, "loss": 1.2073, "mean_token_accuracy": 0.7226342558860779, "num_tokens": 11655117.0, "step": 14486 }, { "epoch": 3.836864406779661, "grad_norm": 1.701563835144043, "learning_rate": 8.081700211864407e-06, "loss": 1.2428, "mean_token_accuracy": 0.7145445682108402, "num_tokens": 11656764.0, "step": 14488 }, { "epoch": 3.8373940677966103, "grad_norm": 1.9009289741516113, "learning_rate": 8.081435381355933e-06, "loss": 1.1337, "mean_token_accuracy": 0.7339655011892319, "num_tokens": 11658448.0, "step": 14490 }, { "epoch": 3.8379237288135593, "grad_norm": 1.905333399772644, "learning_rate": 8.081170550847458e-06, "loss": 1.3502, "mean_token_accuracy": 0.6934616230428219, "num_tokens": 11659982.0, "step": 14492 }, { "epoch": 3.8384533898305087, "grad_norm": 2.116748571395874, "learning_rate": 8.080905720338985e-06, "loss": 1.3782, "mean_token_accuracy": 0.6915445551276207, "num_tokens": 11661579.0, "step": 14494 }, { "epoch": 3.8389830508474576, "grad_norm": 1.7577009201049805, "learning_rate": 8.08064088983051e-06, "loss": 1.0576, "mean_token_accuracy": 0.7282232046127319, "num_tokens": 11663258.0, "step": 14496 }, { "epoch": 3.8395127118644066, "grad_norm": 1.9798071384429932, "learning_rate": 8.080376059322035e-06, "loss": 1.5275, "mean_token_accuracy": 0.6665540859103203, "num_tokens": 11664913.0, "step": 14498 }, { "epoch": 3.840042372881356, "grad_norm": 1.5064749717712402, "learning_rate": 8.08011122881356e-06, "loss": 1.1563, "step": 14500 }, { "epoch": 3.840042372881356, "eval_loss": 1.3074105978012085, "eval_mean_token_accuracy": 0.7017254569120221, "eval_num_tokens": 11666442.0, "eval_runtime": 48.272, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 14500 }, { "epoch": 3.840572033898305, "grad_norm": 1.8041859865188599, "learning_rate": 8.079846398305086e-06, "loss": 1.2169, "mean_token_accuracy": 0.7266123704612255, "num_tokens": 11668313.0, "step": 14502 }, { "epoch": 3.8411016949152543, "grad_norm": 1.8858792781829834, "learning_rate": 8.079581567796611e-06, "loss": 1.2857, "mean_token_accuracy": 0.7054412513971329, "num_tokens": 11669678.0, "step": 14504 }, { "epoch": 3.8416313559322033, "grad_norm": 1.9757530689239502, "learning_rate": 8.079316737288136e-06, "loss": 1.4977, "mean_token_accuracy": 0.6645909771323204, "num_tokens": 11671058.0, "step": 14506 }, { "epoch": 3.8421610169491527, "grad_norm": 2.009613037109375, "learning_rate": 8.079051906779661e-06, "loss": 1.1666, "mean_token_accuracy": 0.717512883245945, "num_tokens": 11672632.0, "step": 14508 }, { "epoch": 3.8426906779661016, "grad_norm": 1.8797513246536255, "learning_rate": 8.078787076271188e-06, "loss": 1.4758, "mean_token_accuracy": 0.6640937849879265, "num_tokens": 11674258.0, "step": 14510 }, { "epoch": 3.843220338983051, "grad_norm": 1.5884705781936646, "learning_rate": 8.078522245762712e-06, "loss": 1.275, "mean_token_accuracy": 0.7115217298269272, "num_tokens": 11676406.0, "step": 14512 }, { "epoch": 3.84375, "grad_norm": 1.5473451614379883, "learning_rate": 8.078257415254237e-06, "loss": 1.0158, "mean_token_accuracy": 0.747566245496273, "num_tokens": 11677936.0, "step": 14514 }, { "epoch": 3.844279661016949, "grad_norm": 1.5574692487716675, "learning_rate": 8.077992584745762e-06, "loss": 1.1035, "mean_token_accuracy": 0.7249561622738838, "num_tokens": 11679619.0, "step": 14516 }, { "epoch": 3.8448093220338984, "grad_norm": 2.0514142513275146, "learning_rate": 8.077727754237289e-06, "loss": 1.1839, "mean_token_accuracy": 0.6898913979530334, "num_tokens": 11681774.0, "step": 14518 }, { "epoch": 3.8453389830508473, "grad_norm": 2.0690176486968994, "learning_rate": 8.077462923728814e-06, "loss": 1.3048, "mean_token_accuracy": 0.7029917240142822, "num_tokens": 11683484.0, "step": 14520 }, { "epoch": 3.8458686440677967, "grad_norm": 1.5683715343475342, "learning_rate": 8.07719809322034e-06, "loss": 1.2065, "mean_token_accuracy": 0.7071066796779633, "num_tokens": 11685108.0, "step": 14522 }, { "epoch": 3.8463983050847457, "grad_norm": 2.1064422130584717, "learning_rate": 8.076933262711865e-06, "loss": 1.3149, "mean_token_accuracy": 0.7028148099780083, "num_tokens": 11686745.0, "step": 14524 }, { "epoch": 3.846927966101695, "grad_norm": 1.5207055807113647, "learning_rate": 8.07666843220339e-06, "loss": 0.8428, "mean_token_accuracy": 0.7781143933534622, "num_tokens": 11688328.0, "step": 14526 }, { "epoch": 3.847457627118644, "grad_norm": 1.908981204032898, "learning_rate": 8.076403601694915e-06, "loss": 1.8174, "mean_token_accuracy": 0.5992592200636864, "num_tokens": 11690025.0, "step": 14528 }, { "epoch": 3.8479872881355934, "grad_norm": 1.2867319583892822, "learning_rate": 8.076138771186442e-06, "loss": 1.2396, "mean_token_accuracy": 0.7202837243676186, "num_tokens": 11691779.0, "step": 14530 }, { "epoch": 3.8485169491525424, "grad_norm": 1.6023987531661987, "learning_rate": 8.075873940677967e-06, "loss": 0.9962, "mean_token_accuracy": 0.753239631652832, "num_tokens": 11693364.0, "step": 14532 }, { "epoch": 3.8490466101694913, "grad_norm": 1.6321187019348145, "learning_rate": 8.075609110169492e-06, "loss": 1.3464, "mean_token_accuracy": 0.6770287677645683, "num_tokens": 11695196.0, "step": 14534 }, { "epoch": 3.8495762711864407, "grad_norm": 1.9580930471420288, "learning_rate": 8.075344279661017e-06, "loss": 1.8693, "mean_token_accuracy": 0.6034772545099258, "num_tokens": 11696977.0, "step": 14536 }, { "epoch": 3.8501059322033897, "grad_norm": 1.8318936824798584, "learning_rate": 8.075079449152543e-06, "loss": 1.4344, "mean_token_accuracy": 0.6833558902144432, "num_tokens": 11698675.0, "step": 14538 }, { "epoch": 3.850635593220339, "grad_norm": 2.1878621578216553, "learning_rate": 8.074814618644068e-06, "loss": 1.2119, "mean_token_accuracy": 0.7088640853762627, "num_tokens": 11700167.0, "step": 14540 }, { "epoch": 3.851165254237288, "grad_norm": 2.425969123840332, "learning_rate": 8.074549788135593e-06, "loss": 1.7431, "mean_token_accuracy": 0.6296267434954643, "num_tokens": 11701587.0, "step": 14542 }, { "epoch": 3.851694915254237, "grad_norm": 2.1208629608154297, "learning_rate": 8.07428495762712e-06, "loss": 1.5561, "mean_token_accuracy": 0.6493295952677727, "num_tokens": 11703089.0, "step": 14544 }, { "epoch": 3.8522245762711864, "grad_norm": 1.6466319561004639, "learning_rate": 8.074020127118645e-06, "loss": 1.3846, "mean_token_accuracy": 0.6652074009180069, "num_tokens": 11704766.0, "step": 14546 }, { "epoch": 3.852754237288136, "grad_norm": 1.878574252128601, "learning_rate": 8.073755296610171e-06, "loss": 1.5016, "mean_token_accuracy": 0.6513636782765388, "num_tokens": 11706395.0, "step": 14548 }, { "epoch": 3.8532838983050848, "grad_norm": 1.866363286972046, "learning_rate": 8.073490466101696e-06, "loss": 1.0405, "mean_token_accuracy": 0.7383545637130737, "num_tokens": 11708104.0, "step": 14550 }, { "epoch": 3.8538135593220337, "grad_norm": 2.0338752269744873, "learning_rate": 8.073225635593221e-06, "loss": 1.3997, "mean_token_accuracy": 0.6966344714164734, "num_tokens": 11709394.0, "step": 14552 }, { "epoch": 3.854343220338983, "grad_norm": 2.359819173812866, "learning_rate": 8.072960805084746e-06, "loss": 1.4295, "mean_token_accuracy": 0.6915626972913742, "num_tokens": 11710982.0, "step": 14554 }, { "epoch": 3.854872881355932, "grad_norm": 1.9587560892105103, "learning_rate": 8.072695974576273e-06, "loss": 1.2478, "mean_token_accuracy": 0.7174385562539101, "num_tokens": 11712490.0, "step": 14556 }, { "epoch": 3.8554025423728815, "grad_norm": 1.6506659984588623, "learning_rate": 8.072431144067798e-06, "loss": 1.3945, "mean_token_accuracy": 0.6910764053463936, "num_tokens": 11714080.0, "step": 14558 }, { "epoch": 3.8559322033898304, "grad_norm": 1.5646378993988037, "learning_rate": 8.072166313559323e-06, "loss": 1.1106, "mean_token_accuracy": 0.745215505361557, "num_tokens": 11715620.0, "step": 14560 }, { "epoch": 3.8564618644067794, "grad_norm": 1.707478404045105, "learning_rate": 8.071901483050848e-06, "loss": 1.511, "mean_token_accuracy": 0.666110061109066, "num_tokens": 11717252.0, "step": 14562 }, { "epoch": 3.856991525423729, "grad_norm": 1.7271939516067505, "learning_rate": 8.071636652542374e-06, "loss": 0.9826, "mean_token_accuracy": 0.744729533791542, "num_tokens": 11718719.0, "step": 14564 }, { "epoch": 3.857521186440678, "grad_norm": 2.3496854305267334, "learning_rate": 8.071371822033899e-06, "loss": 1.1791, "mean_token_accuracy": 0.7452078051865101, "num_tokens": 11720044.0, "step": 14566 }, { "epoch": 3.858050847457627, "grad_norm": 1.9622468948364258, "learning_rate": 8.071106991525424e-06, "loss": 1.0621, "mean_token_accuracy": 0.7306482195854187, "num_tokens": 11721477.0, "step": 14568 }, { "epoch": 3.858580508474576, "grad_norm": 1.7401844263076782, "learning_rate": 8.070842161016949e-06, "loss": 1.2578, "mean_token_accuracy": 0.6996949389576912, "num_tokens": 11723162.0, "step": 14570 }, { "epoch": 3.8591101694915255, "grad_norm": 1.8027719259262085, "learning_rate": 8.070577330508476e-06, "loss": 1.1188, "mean_token_accuracy": 0.7518734410405159, "num_tokens": 11724509.0, "step": 14572 }, { "epoch": 3.8596398305084745, "grad_norm": 1.638664722442627, "learning_rate": 8.0703125e-06, "loss": 1.2873, "mean_token_accuracy": 0.7138003706932068, "num_tokens": 11726123.0, "step": 14574 }, { "epoch": 3.860169491525424, "grad_norm": 1.7200733423233032, "learning_rate": 8.070047669491527e-06, "loss": 1.2976, "mean_token_accuracy": 0.6929611042141914, "num_tokens": 11727807.0, "step": 14576 }, { "epoch": 3.860699152542373, "grad_norm": 1.7808018922805786, "learning_rate": 8.069782838983052e-06, "loss": 1.2021, "mean_token_accuracy": 0.7311769127845764, "num_tokens": 11729239.0, "step": 14578 }, { "epoch": 3.861228813559322, "grad_norm": 1.880159616470337, "learning_rate": 8.069518008474577e-06, "loss": 0.9374, "mean_token_accuracy": 0.7621514275670052, "num_tokens": 11731080.0, "step": 14580 }, { "epoch": 3.861758474576271, "grad_norm": 1.73761785030365, "learning_rate": 8.069253177966102e-06, "loss": 1.1064, "mean_token_accuracy": 0.7180157154798508, "num_tokens": 11732661.0, "step": 14582 }, { "epoch": 3.8622881355932206, "grad_norm": 1.6605827808380127, "learning_rate": 8.068988347457629e-06, "loss": 1.2349, "mean_token_accuracy": 0.7359341904520988, "num_tokens": 11734313.0, "step": 14584 }, { "epoch": 3.8628177966101696, "grad_norm": 1.7198184728622437, "learning_rate": 8.068723516949153e-06, "loss": 0.9953, "mean_token_accuracy": 0.7685771957039833, "num_tokens": 11735932.0, "step": 14586 }, { "epoch": 3.8633474576271185, "grad_norm": 1.7408608198165894, "learning_rate": 8.068458686440678e-06, "loss": 1.4237, "mean_token_accuracy": 0.6993340477347374, "num_tokens": 11737707.0, "step": 14588 }, { "epoch": 3.863877118644068, "grad_norm": 1.6947026252746582, "learning_rate": 8.068193855932203e-06, "loss": 1.1799, "mean_token_accuracy": 0.7075650319457054, "num_tokens": 11739259.0, "step": 14590 }, { "epoch": 3.864406779661017, "grad_norm": 2.0401558876037598, "learning_rate": 8.06792902542373e-06, "loss": 1.2173, "mean_token_accuracy": 0.701483704149723, "num_tokens": 11740638.0, "step": 14592 }, { "epoch": 3.8649364406779663, "grad_norm": 1.6940890550613403, "learning_rate": 8.067664194915255e-06, "loss": 1.2206, "mean_token_accuracy": 0.7055273428559303, "num_tokens": 11742396.0, "step": 14594 }, { "epoch": 3.8654661016949152, "grad_norm": 1.8592909574508667, "learning_rate": 8.06739936440678e-06, "loss": 0.9695, "mean_token_accuracy": 0.7736452296376228, "num_tokens": 11743891.0, "step": 14596 }, { "epoch": 3.865995762711864, "grad_norm": 1.869032382965088, "learning_rate": 8.067134533898305e-06, "loss": 1.4301, "mean_token_accuracy": 0.6513858288526535, "num_tokens": 11745720.0, "step": 14598 }, { "epoch": 3.8665254237288136, "grad_norm": 1.606009840965271, "learning_rate": 8.066869703389831e-06, "loss": 1.3791, "mean_token_accuracy": 0.6858134567737579, "num_tokens": 11747564.0, "step": 14600 }, { "epoch": 3.867055084745763, "grad_norm": 1.8112139701843262, "learning_rate": 8.066604872881356e-06, "loss": 1.3349, "mean_token_accuracy": 0.6919937506318092, "num_tokens": 11749127.0, "step": 14602 }, { "epoch": 3.867584745762712, "grad_norm": 1.7729027271270752, "learning_rate": 8.066340042372883e-06, "loss": 1.5008, "mean_token_accuracy": 0.6761053502559662, "num_tokens": 11751560.0, "step": 14604 }, { "epoch": 3.868114406779661, "grad_norm": 1.6923997402191162, "learning_rate": 8.066075211864408e-06, "loss": 1.2835, "mean_token_accuracy": 0.7248055785894394, "num_tokens": 11753130.0, "step": 14606 }, { "epoch": 3.8686440677966103, "grad_norm": 1.6998893022537231, "learning_rate": 8.065810381355933e-06, "loss": 1.2062, "mean_token_accuracy": 0.7283193320035934, "num_tokens": 11754873.0, "step": 14608 }, { "epoch": 3.8691737288135593, "grad_norm": 1.732139229774475, "learning_rate": 8.065545550847458e-06, "loss": 0.8415, "mean_token_accuracy": 0.7813151702284813, "num_tokens": 11756507.0, "step": 14610 }, { "epoch": 3.8697033898305087, "grad_norm": 2.2923431396484375, "learning_rate": 8.065280720338984e-06, "loss": 1.6541, "mean_token_accuracy": 0.6542421355843544, "num_tokens": 11757989.0, "step": 14612 }, { "epoch": 3.8702330508474576, "grad_norm": 2.0420985221862793, "learning_rate": 8.06501588983051e-06, "loss": 1.5021, "mean_token_accuracy": 0.6387082859873772, "num_tokens": 11759825.0, "step": 14614 }, { "epoch": 3.8707627118644066, "grad_norm": 1.6904926300048828, "learning_rate": 8.064751059322034e-06, "loss": 1.0669, "mean_token_accuracy": 0.7418371364474297, "num_tokens": 11761621.0, "step": 14616 }, { "epoch": 3.871292372881356, "grad_norm": 1.3988699913024902, "learning_rate": 8.064486228813559e-06, "loss": 0.9865, "mean_token_accuracy": 0.7528758943080902, "num_tokens": 11763324.0, "step": 14618 }, { "epoch": 3.871822033898305, "grad_norm": 2.202671527862549, "learning_rate": 8.064221398305086e-06, "loss": 1.3802, "mean_token_accuracy": 0.7042782753705978, "num_tokens": 11764924.0, "step": 14620 }, { "epoch": 3.8723516949152543, "grad_norm": 2.163761615753174, "learning_rate": 8.06395656779661e-06, "loss": 1.1143, "mean_token_accuracy": 0.7068464457988739, "num_tokens": 11766381.0, "step": 14622 }, { "epoch": 3.8728813559322033, "grad_norm": 1.7158390283584595, "learning_rate": 8.063691737288136e-06, "loss": 1.5057, "mean_token_accuracy": 0.6928294748067856, "num_tokens": 11768250.0, "step": 14624 }, { "epoch": 3.8734110169491527, "grad_norm": 1.421143889427185, "learning_rate": 8.063426906779662e-06, "loss": 0.9997, "mean_token_accuracy": 0.7266603857278824, "num_tokens": 11770258.0, "step": 14626 }, { "epoch": 3.8739406779661016, "grad_norm": 2.063019037246704, "learning_rate": 8.063162076271187e-06, "loss": 1.3375, "mean_token_accuracy": 0.6967805176973343, "num_tokens": 11771844.0, "step": 14628 }, { "epoch": 3.874470338983051, "grad_norm": 1.8721665143966675, "learning_rate": 8.062897245762714e-06, "loss": 1.4779, "mean_token_accuracy": 0.6766244769096375, "num_tokens": 11773443.0, "step": 14630 }, { "epoch": 3.875, "grad_norm": 1.5185858011245728, "learning_rate": 8.062632415254239e-06, "loss": 1.4787, "mean_token_accuracy": 0.6970582604408264, "num_tokens": 11774924.0, "step": 14632 }, { "epoch": 3.875529661016949, "grad_norm": 1.9872031211853027, "learning_rate": 8.062367584745764e-06, "loss": 1.5289, "mean_token_accuracy": 0.6590202525258064, "num_tokens": 11776541.0, "step": 14634 }, { "epoch": 3.8760593220338984, "grad_norm": 2.083894729614258, "learning_rate": 8.062102754237289e-06, "loss": 1.5611, "mean_token_accuracy": 0.6593252345919609, "num_tokens": 11778001.0, "step": 14636 }, { "epoch": 3.8765889830508473, "grad_norm": 1.6188780069351196, "learning_rate": 8.061837923728815e-06, "loss": 0.6947, "mean_token_accuracy": 0.8254254534840584, "num_tokens": 11779393.0, "step": 14638 }, { "epoch": 3.8771186440677967, "grad_norm": 1.7774848937988281, "learning_rate": 8.06157309322034e-06, "loss": 0.7083, "mean_token_accuracy": 0.8200381696224213, "num_tokens": 11780804.0, "step": 14640 }, { "epoch": 3.8776483050847457, "grad_norm": 1.9154688119888306, "learning_rate": 8.061308262711865e-06, "loss": 1.1813, "mean_token_accuracy": 0.7113856598734856, "num_tokens": 11782370.0, "step": 14642 }, { "epoch": 3.878177966101695, "grad_norm": 1.9206500053405762, "learning_rate": 8.06104343220339e-06, "loss": 0.9377, "mean_token_accuracy": 0.7830475866794586, "num_tokens": 11783946.0, "step": 14644 }, { "epoch": 3.878707627118644, "grad_norm": 1.881273865699768, "learning_rate": 8.060778601694917e-06, "loss": 0.9286, "mean_token_accuracy": 0.7772565707564354, "num_tokens": 11785351.0, "step": 14646 }, { "epoch": 3.8792372881355934, "grad_norm": 1.286926507949829, "learning_rate": 8.060513771186442e-06, "loss": 0.8388, "mean_token_accuracy": 0.8119307681918144, "num_tokens": 11786829.0, "step": 14648 }, { "epoch": 3.8797669491525424, "grad_norm": 1.923288345336914, "learning_rate": 8.060248940677966e-06, "loss": 1.5148, "mean_token_accuracy": 0.6680354848504066, "num_tokens": 11788332.0, "step": 14650 }, { "epoch": 3.8802966101694913, "grad_norm": 2.416841506958008, "learning_rate": 8.059984110169491e-06, "loss": 0.846, "mean_token_accuracy": 0.7799981981515884, "num_tokens": 11789738.0, "step": 14652 }, { "epoch": 3.8808262711864407, "grad_norm": 1.963811993598938, "learning_rate": 8.059719279661018e-06, "loss": 1.164, "mean_token_accuracy": 0.7368872463703156, "num_tokens": 11791160.0, "step": 14654 }, { "epoch": 3.8813559322033897, "grad_norm": 1.9975299835205078, "learning_rate": 8.059454449152543e-06, "loss": 1.5957, "mean_token_accuracy": 0.6649907380342484, "num_tokens": 11792729.0, "step": 14656 }, { "epoch": 3.881885593220339, "grad_norm": 1.8727281093597412, "learning_rate": 8.05918961864407e-06, "loss": 1.2516, "mean_token_accuracy": 0.7180386632680893, "num_tokens": 11794226.0, "step": 14658 }, { "epoch": 3.882415254237288, "grad_norm": 1.7651642560958862, "learning_rate": 8.058924788135594e-06, "loss": 1.1351, "mean_token_accuracy": 0.7256909236311913, "num_tokens": 11796017.0, "step": 14660 }, { "epoch": 3.882944915254237, "grad_norm": 1.8208754062652588, "learning_rate": 8.05865995762712e-06, "loss": 1.5142, "mean_token_accuracy": 0.6751720644533634, "num_tokens": 11797564.0, "step": 14662 }, { "epoch": 3.8834745762711864, "grad_norm": 1.9395042657852173, "learning_rate": 8.058395127118644e-06, "loss": 1.3931, "mean_token_accuracy": 0.6778926327824593, "num_tokens": 11799312.0, "step": 14664 }, { "epoch": 3.884004237288136, "grad_norm": 2.1161949634552, "learning_rate": 8.058130296610171e-06, "loss": 1.3654, "mean_token_accuracy": 0.6867187172174454, "num_tokens": 11800506.0, "step": 14666 }, { "epoch": 3.8845338983050848, "grad_norm": 1.8921244144439697, "learning_rate": 8.057865466101696e-06, "loss": 1.2833, "mean_token_accuracy": 0.7105318382382393, "num_tokens": 11801973.0, "step": 14668 }, { "epoch": 3.8850635593220337, "grad_norm": 1.6781420707702637, "learning_rate": 8.05760063559322e-06, "loss": 1.0123, "mean_token_accuracy": 0.7895970121026039, "num_tokens": 11803573.0, "step": 14670 }, { "epoch": 3.885593220338983, "grad_norm": 1.8560343980789185, "learning_rate": 8.057335805084746e-06, "loss": 1.6487, "mean_token_accuracy": 0.6538251712918282, "num_tokens": 11805161.0, "step": 14672 }, { "epoch": 3.886122881355932, "grad_norm": 1.6626250743865967, "learning_rate": 8.057070974576272e-06, "loss": 1.3861, "mean_token_accuracy": 0.684864416718483, "num_tokens": 11806982.0, "step": 14674 }, { "epoch": 3.8866525423728815, "grad_norm": 1.8385947942733765, "learning_rate": 8.056806144067797e-06, "loss": 0.9606, "mean_token_accuracy": 0.7760709822177887, "num_tokens": 11808367.0, "step": 14676 }, { "epoch": 3.8871822033898304, "grad_norm": 1.746321439743042, "learning_rate": 8.056541313559322e-06, "loss": 1.4957, "mean_token_accuracy": 0.6688363030552864, "num_tokens": 11809958.0, "step": 14678 }, { "epoch": 3.8877118644067794, "grad_norm": 1.925994873046875, "learning_rate": 8.056276483050847e-06, "loss": 1.2699, "mean_token_accuracy": 0.7072084918618202, "num_tokens": 11811555.0, "step": 14680 }, { "epoch": 3.888241525423729, "grad_norm": 1.772169828414917, "learning_rate": 8.056011652542374e-06, "loss": 1.1851, "mean_token_accuracy": 0.7111475914716721, "num_tokens": 11813321.0, "step": 14682 }, { "epoch": 3.888771186440678, "grad_norm": 1.7050186395645142, "learning_rate": 8.055746822033899e-06, "loss": 1.1641, "mean_token_accuracy": 0.738524541258812, "num_tokens": 11814982.0, "step": 14684 }, { "epoch": 3.889300847457627, "grad_norm": 1.759049892425537, "learning_rate": 8.055481991525425e-06, "loss": 0.9318, "mean_token_accuracy": 0.7757560908794403, "num_tokens": 11816570.0, "step": 14686 }, { "epoch": 3.889830508474576, "grad_norm": 1.9017566442489624, "learning_rate": 8.055217161016949e-06, "loss": 1.2626, "mean_token_accuracy": 0.6963801831007004, "num_tokens": 11818094.0, "step": 14688 }, { "epoch": 3.8903601694915255, "grad_norm": 1.7716166973114014, "learning_rate": 8.054952330508475e-06, "loss": 1.4535, "mean_token_accuracy": 0.6535191908478737, "num_tokens": 11820001.0, "step": 14690 }, { "epoch": 3.8908898305084745, "grad_norm": 1.5820672512054443, "learning_rate": 8.0546875e-06, "loss": 0.6975, "mean_token_accuracy": 0.8091547042131424, "num_tokens": 11821697.0, "step": 14692 }, { "epoch": 3.891419491525424, "grad_norm": 1.9720869064331055, "learning_rate": 8.054422669491527e-06, "loss": 1.5604, "mean_token_accuracy": 0.6570478901267052, "num_tokens": 11823249.0, "step": 14694 }, { "epoch": 3.891949152542373, "grad_norm": 2.0398783683776855, "learning_rate": 8.054157838983052e-06, "loss": 1.5482, "mean_token_accuracy": 0.662038080394268, "num_tokens": 11824943.0, "step": 14696 }, { "epoch": 3.892478813559322, "grad_norm": 1.776835322380066, "learning_rate": 8.053893008474577e-06, "loss": 1.4321, "mean_token_accuracy": 0.6730943098664284, "num_tokens": 11826364.0, "step": 14698 }, { "epoch": 3.893008474576271, "grad_norm": 1.756757378578186, "learning_rate": 8.053628177966102e-06, "loss": 1.2362, "mean_token_accuracy": 0.7078399881720543, "num_tokens": 11828012.0, "step": 14700 }, { "epoch": 3.8935381355932206, "grad_norm": 1.9627293348312378, "learning_rate": 8.053363347457628e-06, "loss": 1.6267, "mean_token_accuracy": 0.6631076745688915, "num_tokens": 11829675.0, "step": 14702 }, { "epoch": 3.8940677966101696, "grad_norm": 1.8407320976257324, "learning_rate": 8.053098516949153e-06, "loss": 1.1163, "mean_token_accuracy": 0.7485067695379257, "num_tokens": 11831396.0, "step": 14704 }, { "epoch": 3.8945974576271185, "grad_norm": 1.83803391456604, "learning_rate": 8.052833686440678e-06, "loss": 1.2128, "mean_token_accuracy": 0.7136514335870743, "num_tokens": 11832894.0, "step": 14706 }, { "epoch": 3.895127118644068, "grad_norm": 2.9450876712799072, "learning_rate": 8.052568855932203e-06, "loss": 1.3059, "mean_token_accuracy": 0.7216510102152824, "num_tokens": 11834373.0, "step": 14708 }, { "epoch": 3.895656779661017, "grad_norm": 2.02951979637146, "learning_rate": 8.05230402542373e-06, "loss": 1.3702, "mean_token_accuracy": 0.6751428693532944, "num_tokens": 11835806.0, "step": 14710 }, { "epoch": 3.8961864406779663, "grad_norm": 1.6369248628616333, "learning_rate": 8.052039194915256e-06, "loss": 1.2495, "mean_token_accuracy": 0.7011199966073036, "num_tokens": 11837588.0, "step": 14712 }, { "epoch": 3.8967161016949152, "grad_norm": 2.0190320014953613, "learning_rate": 8.051774364406781e-06, "loss": 1.6385, "mean_token_accuracy": 0.6544519290328026, "num_tokens": 11839116.0, "step": 14714 }, { "epoch": 3.897245762711864, "grad_norm": 1.4389885663986206, "learning_rate": 8.051509533898306e-06, "loss": 1.2101, "mean_token_accuracy": 0.739042691886425, "num_tokens": 11840595.0, "step": 14716 }, { "epoch": 3.8977754237288136, "grad_norm": 1.7190748453140259, "learning_rate": 8.051244703389831e-06, "loss": 0.9753, "mean_token_accuracy": 0.7380600795149803, "num_tokens": 11842181.0, "step": 14718 }, { "epoch": 3.898305084745763, "grad_norm": 1.6304128170013428, "learning_rate": 8.050979872881358e-06, "loss": 1.2362, "mean_token_accuracy": 0.7004173658788204, "num_tokens": 11844056.0, "step": 14720 }, { "epoch": 3.898834745762712, "grad_norm": 1.8579574823379517, "learning_rate": 8.050715042372883e-06, "loss": 1.214, "mean_token_accuracy": 0.702853187918663, "num_tokens": 11845825.0, "step": 14722 }, { "epoch": 3.899364406779661, "grad_norm": 1.6350873708724976, "learning_rate": 8.050450211864407e-06, "loss": 0.9205, "mean_token_accuracy": 0.7726168632507324, "num_tokens": 11847653.0, "step": 14724 }, { "epoch": 3.8998940677966103, "grad_norm": 1.4952929019927979, "learning_rate": 8.050185381355932e-06, "loss": 0.9276, "mean_token_accuracy": 0.7727417647838593, "num_tokens": 11849123.0, "step": 14726 }, { "epoch": 3.9004237288135593, "grad_norm": 1.9297842979431152, "learning_rate": 8.049920550847459e-06, "loss": 1.5315, "mean_token_accuracy": 0.6705113276839256, "num_tokens": 11850630.0, "step": 14728 }, { "epoch": 3.9009533898305087, "grad_norm": 1.8061193227767944, "learning_rate": 8.049655720338984e-06, "loss": 1.2517, "mean_token_accuracy": 0.7196347266435623, "num_tokens": 11852251.0, "step": 14730 }, { "epoch": 3.9014830508474576, "grad_norm": 1.7100770473480225, "learning_rate": 8.049390889830509e-06, "loss": 1.2498, "mean_token_accuracy": 0.7154485285282135, "num_tokens": 11853965.0, "step": 14732 }, { "epoch": 3.9020127118644066, "grad_norm": 1.657764196395874, "learning_rate": 8.049126059322034e-06, "loss": 0.9233, "mean_token_accuracy": 0.7766077965497971, "num_tokens": 11855423.0, "step": 14734 }, { "epoch": 3.902542372881356, "grad_norm": 1.9677298069000244, "learning_rate": 8.04886122881356e-06, "loss": 0.9661, "mean_token_accuracy": 0.7934888638556004, "num_tokens": 11857006.0, "step": 14736 }, { "epoch": 3.903072033898305, "grad_norm": 2.1912941932678223, "learning_rate": 8.048596398305085e-06, "loss": 1.3394, "mean_token_accuracy": 0.693060141056776, "num_tokens": 11858509.0, "step": 14738 }, { "epoch": 3.9036016949152543, "grad_norm": 1.663820743560791, "learning_rate": 8.048331567796612e-06, "loss": 1.7196, "mean_token_accuracy": 0.6173374615609646, "num_tokens": 11860291.0, "step": 14740 }, { "epoch": 3.9041313559322033, "grad_norm": 1.983612060546875, "learning_rate": 8.048066737288135e-06, "loss": 1.3459, "mean_token_accuracy": 0.6964935436844826, "num_tokens": 11861746.0, "step": 14742 }, { "epoch": 3.9046610169491527, "grad_norm": 1.802841067314148, "learning_rate": 8.047801906779662e-06, "loss": 0.9324, "mean_token_accuracy": 0.7697140276432037, "num_tokens": 11863158.0, "step": 14744 }, { "epoch": 3.9051906779661016, "grad_norm": 1.899660348892212, "learning_rate": 8.047537076271187e-06, "loss": 1.6346, "mean_token_accuracy": 0.6407927200198174, "num_tokens": 11864682.0, "step": 14746 }, { "epoch": 3.905720338983051, "grad_norm": 1.6427158117294312, "learning_rate": 8.047272245762713e-06, "loss": 1.0349, "mean_token_accuracy": 0.7584388554096222, "num_tokens": 11866308.0, "step": 14748 }, { "epoch": 3.90625, "grad_norm": 1.4755927324295044, "learning_rate": 8.047007415254238e-06, "loss": 0.9909, "step": 14750 }, { "epoch": 3.90625, "eval_loss": 1.307822823524475, "eval_mean_token_accuracy": 0.7015087530016899, "eval_num_tokens": 11867816.0, "eval_runtime": 48.3046, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 14750 }, { "epoch": 3.906779661016949, "grad_norm": 1.6243045330047607, "learning_rate": 8.046742584745763e-06, "loss": 1.1464, "mean_token_accuracy": 0.7389382999390364, "num_tokens": 11869790.0, "step": 14752 }, { "epoch": 3.9073093220338984, "grad_norm": 1.8556804656982422, "learning_rate": 8.046477754237288e-06, "loss": 1.7098, "mean_token_accuracy": 0.6349508315324783, "num_tokens": 11871469.0, "step": 14754 }, { "epoch": 3.9078389830508473, "grad_norm": 1.762716293334961, "learning_rate": 8.046212923728815e-06, "loss": 1.5757, "mean_token_accuracy": 0.6646422147750854, "num_tokens": 11873047.0, "step": 14756 }, { "epoch": 3.9083686440677967, "grad_norm": 2.333489418029785, "learning_rate": 8.04594809322034e-06, "loss": 1.4072, "mean_token_accuracy": 0.6919889077544212, "num_tokens": 11874483.0, "step": 14758 }, { "epoch": 3.9088983050847457, "grad_norm": 2.059938669204712, "learning_rate": 8.045683262711865e-06, "loss": 1.08, "mean_token_accuracy": 0.7484862729907036, "num_tokens": 11876042.0, "step": 14760 }, { "epoch": 3.909427966101695, "grad_norm": 1.6683385372161865, "learning_rate": 8.04541843220339e-06, "loss": 1.322, "mean_token_accuracy": 0.693805530667305, "num_tokens": 11877848.0, "step": 14762 }, { "epoch": 3.909957627118644, "grad_norm": 1.9343277215957642, "learning_rate": 8.045153601694916e-06, "loss": 1.4738, "mean_token_accuracy": 0.6701404750347137, "num_tokens": 11879372.0, "step": 14764 }, { "epoch": 3.9104872881355934, "grad_norm": 1.918540358543396, "learning_rate": 8.044888771186441e-06, "loss": 1.0052, "mean_token_accuracy": 0.7546378970146179, "num_tokens": 11880849.0, "step": 14766 }, { "epoch": 3.9110169491525424, "grad_norm": 1.858860969543457, "learning_rate": 8.044623940677968e-06, "loss": 1.6116, "mean_token_accuracy": 0.6357264742255211, "num_tokens": 11882581.0, "step": 14768 }, { "epoch": 3.9115466101694913, "grad_norm": 1.420074224472046, "learning_rate": 8.044359110169491e-06, "loss": 1.1983, "mean_token_accuracy": 0.7176769822835922, "num_tokens": 11884230.0, "step": 14770 }, { "epoch": 3.9120762711864407, "grad_norm": 1.8313844203948975, "learning_rate": 8.044094279661018e-06, "loss": 1.0967, "mean_token_accuracy": 0.7198429554700851, "num_tokens": 11886078.0, "step": 14772 }, { "epoch": 3.9126059322033897, "grad_norm": 1.7806227207183838, "learning_rate": 8.043829449152543e-06, "loss": 1.3365, "mean_token_accuracy": 0.6983462423086166, "num_tokens": 11887736.0, "step": 14774 }, { "epoch": 3.913135593220339, "grad_norm": 1.6698856353759766, "learning_rate": 8.04356461864407e-06, "loss": 1.2514, "mean_token_accuracy": 0.7209508717060089, "num_tokens": 11889348.0, "step": 14776 }, { "epoch": 3.913665254237288, "grad_norm": 2.2726614475250244, "learning_rate": 8.043299788135594e-06, "loss": 1.4726, "mean_token_accuracy": 0.660976231098175, "num_tokens": 11890730.0, "step": 14778 }, { "epoch": 3.914194915254237, "grad_norm": 2.012897491455078, "learning_rate": 8.043034957627119e-06, "loss": 1.1339, "mean_token_accuracy": 0.7419813722372055, "num_tokens": 11892481.0, "step": 14780 }, { "epoch": 3.9147245762711864, "grad_norm": 1.7405046224594116, "learning_rate": 8.042770127118644e-06, "loss": 1.2803, "mean_token_accuracy": 0.7348727881908417, "num_tokens": 11894182.0, "step": 14782 }, { "epoch": 3.915254237288136, "grad_norm": 2.1289238929748535, "learning_rate": 8.04250529661017e-06, "loss": 1.5287, "mean_token_accuracy": 0.6825912892818451, "num_tokens": 11895769.0, "step": 14784 }, { "epoch": 3.9157838983050848, "grad_norm": 1.8680188655853271, "learning_rate": 8.042240466101696e-06, "loss": 1.4467, "mean_token_accuracy": 0.6704320833086967, "num_tokens": 11897541.0, "step": 14786 }, { "epoch": 3.9163135593220337, "grad_norm": 1.8370705842971802, "learning_rate": 8.04197563559322e-06, "loss": 1.0927, "mean_token_accuracy": 0.7411945387721062, "num_tokens": 11899115.0, "step": 14788 }, { "epoch": 3.916843220338983, "grad_norm": 1.7615950107574463, "learning_rate": 8.041710805084745e-06, "loss": 1.3205, "mean_token_accuracy": 0.6962320134043694, "num_tokens": 11900600.0, "step": 14790 }, { "epoch": 3.917372881355932, "grad_norm": 1.8396233320236206, "learning_rate": 8.041445974576272e-06, "loss": 1.4479, "mean_token_accuracy": 0.6797163859009743, "num_tokens": 11902178.0, "step": 14792 }, { "epoch": 3.9179025423728815, "grad_norm": 2.0366039276123047, "learning_rate": 8.041181144067799e-06, "loss": 1.5071, "mean_token_accuracy": 0.6545687839388847, "num_tokens": 11903838.0, "step": 14794 }, { "epoch": 3.9184322033898304, "grad_norm": 1.9250575304031372, "learning_rate": 8.040916313559322e-06, "loss": 1.3771, "mean_token_accuracy": 0.6867585405707359, "num_tokens": 11905249.0, "step": 14796 }, { "epoch": 3.9189618644067794, "grad_norm": 2.4033682346343994, "learning_rate": 8.040651483050848e-06, "loss": 1.5003, "mean_token_accuracy": 0.6689351052045822, "num_tokens": 11906708.0, "step": 14798 }, { "epoch": 3.919491525423729, "grad_norm": 2.043031692504883, "learning_rate": 8.040386652542373e-06, "loss": 0.8048, "mean_token_accuracy": 0.7930917963385582, "num_tokens": 11907916.0, "step": 14800 }, { "epoch": 3.920021186440678, "grad_norm": 1.6006532907485962, "learning_rate": 8.0401218220339e-06, "loss": 0.8281, "mean_token_accuracy": 0.7550666406750679, "num_tokens": 11909640.0, "step": 14802 }, { "epoch": 3.920550847457627, "grad_norm": 2.070420980453491, "learning_rate": 8.039856991525425e-06, "loss": 1.1755, "mean_token_accuracy": 0.7206970155239105, "num_tokens": 11911323.0, "step": 14804 }, { "epoch": 3.921080508474576, "grad_norm": 2.310899019241333, "learning_rate": 8.03959216101695e-06, "loss": 1.0601, "mean_token_accuracy": 0.7498067170381546, "num_tokens": 11912588.0, "step": 14806 }, { "epoch": 3.9216101694915255, "grad_norm": 2.555037260055542, "learning_rate": 8.039327330508475e-06, "loss": 1.1521, "mean_token_accuracy": 0.7343949303030968, "num_tokens": 11913892.0, "step": 14808 }, { "epoch": 3.9221398305084745, "grad_norm": 2.2651798725128174, "learning_rate": 8.039062500000001e-06, "loss": 1.3423, "mean_token_accuracy": 0.7133696302771568, "num_tokens": 11915276.0, "step": 14810 }, { "epoch": 3.922669491525424, "grad_norm": 2.1625735759735107, "learning_rate": 8.038797669491526e-06, "loss": 1.7367, "mean_token_accuracy": 0.6124474927783012, "num_tokens": 11917070.0, "step": 14812 }, { "epoch": 3.923199152542373, "grad_norm": 1.3372530937194824, "learning_rate": 8.038532838983051e-06, "loss": 1.2244, "mean_token_accuracy": 0.7330975756049156, "num_tokens": 11918840.0, "step": 14814 }, { "epoch": 3.923728813559322, "grad_norm": 1.7031654119491577, "learning_rate": 8.038268008474576e-06, "loss": 1.1428, "mean_token_accuracy": 0.7285554632544518, "num_tokens": 11920523.0, "step": 14816 }, { "epoch": 3.924258474576271, "grad_norm": 1.7033371925354004, "learning_rate": 8.038003177966103e-06, "loss": 1.2968, "mean_token_accuracy": 0.7010623887181282, "num_tokens": 11922362.0, "step": 14818 }, { "epoch": 3.9247881355932206, "grad_norm": 1.6062474250793457, "learning_rate": 8.037738347457628e-06, "loss": 1.6638, "mean_token_accuracy": 0.6503125205636024, "num_tokens": 11924312.0, "step": 14820 }, { "epoch": 3.9253177966101696, "grad_norm": 1.658438801765442, "learning_rate": 8.037473516949154e-06, "loss": 1.3661, "mean_token_accuracy": 0.6987698525190353, "num_tokens": 11925802.0, "step": 14822 }, { "epoch": 3.9258474576271185, "grad_norm": 1.9746873378753662, "learning_rate": 8.037208686440678e-06, "loss": 1.3974, "mean_token_accuracy": 0.6839188635349274, "num_tokens": 11927315.0, "step": 14824 }, { "epoch": 3.926377118644068, "grad_norm": 1.982020378112793, "learning_rate": 8.036943855932204e-06, "loss": 1.7761, "mean_token_accuracy": 0.5947142988443375, "num_tokens": 11929101.0, "step": 14826 }, { "epoch": 3.926906779661017, "grad_norm": 1.4704846143722534, "learning_rate": 8.03667902542373e-06, "loss": 0.6542, "mean_token_accuracy": 0.8290082514286041, "num_tokens": 11930236.0, "step": 14828 }, { "epoch": 3.9274364406779663, "grad_norm": 2.0252554416656494, "learning_rate": 8.036414194915256e-06, "loss": 1.0994, "mean_token_accuracy": 0.7628293707966805, "num_tokens": 11931990.0, "step": 14830 }, { "epoch": 3.9279661016949152, "grad_norm": 1.9135563373565674, "learning_rate": 8.03614936440678e-06, "loss": 1.735, "mean_token_accuracy": 0.6354353874921799, "num_tokens": 11933747.0, "step": 14832 }, { "epoch": 3.928495762711864, "grad_norm": 2.09914493560791, "learning_rate": 8.035884533898306e-06, "loss": 1.4183, "mean_token_accuracy": 0.679060272872448, "num_tokens": 11935020.0, "step": 14834 }, { "epoch": 3.9290254237288136, "grad_norm": 1.862586498260498, "learning_rate": 8.03561970338983e-06, "loss": 1.3188, "mean_token_accuracy": 0.7216976061463356, "num_tokens": 11936711.0, "step": 14836 }, { "epoch": 3.929555084745763, "grad_norm": 1.845523715019226, "learning_rate": 8.035354872881357e-06, "loss": 1.3788, "mean_token_accuracy": 0.681520976126194, "num_tokens": 11938433.0, "step": 14838 }, { "epoch": 3.930084745762712, "grad_norm": 1.7792093753814697, "learning_rate": 8.035090042372882e-06, "loss": 1.053, "mean_token_accuracy": 0.7421940639615059, "num_tokens": 11939816.0, "step": 14840 }, { "epoch": 3.930614406779661, "grad_norm": 2.117292881011963, "learning_rate": 8.034825211864407e-06, "loss": 1.2637, "mean_token_accuracy": 0.6973060443997383, "num_tokens": 11941539.0, "step": 14842 }, { "epoch": 3.9311440677966103, "grad_norm": 2.0260400772094727, "learning_rate": 8.034560381355932e-06, "loss": 1.6784, "mean_token_accuracy": 0.6402366384863853, "num_tokens": 11943014.0, "step": 14844 }, { "epoch": 3.9316737288135593, "grad_norm": 1.7092005014419556, "learning_rate": 8.034295550847459e-06, "loss": 0.969, "mean_token_accuracy": 0.7599676102399826, "num_tokens": 11944684.0, "step": 14846 }, { "epoch": 3.9322033898305087, "grad_norm": 1.9977937936782837, "learning_rate": 8.034030720338984e-06, "loss": 1.5034, "mean_token_accuracy": 0.6678348071873188, "num_tokens": 11946273.0, "step": 14848 }, { "epoch": 3.9327330508474576, "grad_norm": 1.970757007598877, "learning_rate": 8.033765889830508e-06, "loss": 1.3861, "mean_token_accuracy": 0.6690724045038223, "num_tokens": 11947728.0, "step": 14850 }, { "epoch": 3.9332627118644066, "grad_norm": 1.892602801322937, "learning_rate": 8.033501059322033e-06, "loss": 0.9545, "mean_token_accuracy": 0.755906343460083, "num_tokens": 11949284.0, "step": 14852 }, { "epoch": 3.933792372881356, "grad_norm": 2.167341709136963, "learning_rate": 8.03323622881356e-06, "loss": 1.4854, "mean_token_accuracy": 0.6626948490738869, "num_tokens": 11950806.0, "step": 14854 }, { "epoch": 3.934322033898305, "grad_norm": 1.904688835144043, "learning_rate": 8.032971398305085e-06, "loss": 1.1637, "mean_token_accuracy": 0.7142409011721611, "num_tokens": 11952349.0, "step": 14856 }, { "epoch": 3.9348516949152543, "grad_norm": 1.856252908706665, "learning_rate": 8.032706567796612e-06, "loss": 1.4236, "mean_token_accuracy": 0.6982451975345612, "num_tokens": 11953998.0, "step": 14858 }, { "epoch": 3.9353813559322033, "grad_norm": 1.9744360446929932, "learning_rate": 8.032441737288137e-06, "loss": 1.3771, "mean_token_accuracy": 0.7012998312711716, "num_tokens": 11955426.0, "step": 14860 }, { "epoch": 3.9359110169491527, "grad_norm": 1.9254021644592285, "learning_rate": 8.032176906779661e-06, "loss": 1.5486, "mean_token_accuracy": 0.6718461588025093, "num_tokens": 11956909.0, "step": 14862 }, { "epoch": 3.9364406779661016, "grad_norm": 1.9623967409133911, "learning_rate": 8.031912076271186e-06, "loss": 1.4325, "mean_token_accuracy": 0.6753390729427338, "num_tokens": 11958427.0, "step": 14864 }, { "epoch": 3.936970338983051, "grad_norm": 1.4826223850250244, "learning_rate": 8.031647245762713e-06, "loss": 1.2796, "mean_token_accuracy": 0.7057975605130196, "num_tokens": 11960276.0, "step": 14866 }, { "epoch": 3.9375, "grad_norm": 1.6733372211456299, "learning_rate": 8.031382415254238e-06, "loss": 1.2796, "mean_token_accuracy": 0.6717271506786346, "num_tokens": 11962779.0, "step": 14868 }, { "epoch": 3.938029661016949, "grad_norm": 1.7370200157165527, "learning_rate": 8.031117584745763e-06, "loss": 1.1315, "mean_token_accuracy": 0.7318877205252647, "num_tokens": 11964281.0, "step": 14870 }, { "epoch": 3.9385593220338984, "grad_norm": 1.7553786039352417, "learning_rate": 8.030852754237288e-06, "loss": 1.4046, "mean_token_accuracy": 0.6881823688745499, "num_tokens": 11965736.0, "step": 14872 }, { "epoch": 3.9390889830508473, "grad_norm": 2.0735909938812256, "learning_rate": 8.030587923728814e-06, "loss": 1.2801, "mean_token_accuracy": 0.7055754438042641, "num_tokens": 11967217.0, "step": 14874 }, { "epoch": 3.9396186440677967, "grad_norm": 1.86685049533844, "learning_rate": 8.03032309322034e-06, "loss": 1.5564, "mean_token_accuracy": 0.6594517976045609, "num_tokens": 11969056.0, "step": 14876 }, { "epoch": 3.9401483050847457, "grad_norm": 1.7459012269973755, "learning_rate": 8.030058262711864e-06, "loss": 1.5517, "mean_token_accuracy": 0.6622037962079048, "num_tokens": 11970878.0, "step": 14878 }, { "epoch": 3.940677966101695, "grad_norm": 2.3622822761535645, "learning_rate": 8.029793432203391e-06, "loss": 1.2908, "mean_token_accuracy": 0.7094196230173111, "num_tokens": 11972150.0, "step": 14880 }, { "epoch": 3.941207627118644, "grad_norm": 1.9370567798614502, "learning_rate": 8.029528601694916e-06, "loss": 0.9842, "mean_token_accuracy": 0.7444009482860565, "num_tokens": 11973695.0, "step": 14882 }, { "epoch": 3.9417372881355934, "grad_norm": 2.270583152770996, "learning_rate": 8.029263771186442e-06, "loss": 1.3699, "mean_token_accuracy": 0.6857456862926483, "num_tokens": 11975159.0, "step": 14884 }, { "epoch": 3.9422669491525424, "grad_norm": 1.7548742294311523, "learning_rate": 8.028998940677967e-06, "loss": 0.9748, "mean_token_accuracy": 0.742028258740902, "num_tokens": 11976996.0, "step": 14886 }, { "epoch": 3.9427966101694913, "grad_norm": 1.8702610731124878, "learning_rate": 8.028734110169492e-06, "loss": 1.5045, "mean_token_accuracy": 0.6605231240391731, "num_tokens": 11978654.0, "step": 14888 }, { "epoch": 3.9433262711864407, "grad_norm": 1.3935396671295166, "learning_rate": 8.028469279661017e-06, "loss": 1.2346, "mean_token_accuracy": 0.7117013335227966, "num_tokens": 11980496.0, "step": 14890 }, { "epoch": 3.9438559322033897, "grad_norm": 1.980631947517395, "learning_rate": 8.028204449152544e-06, "loss": 1.3705, "mean_token_accuracy": 0.6785658374428749, "num_tokens": 11982137.0, "step": 14892 }, { "epoch": 3.944385593220339, "grad_norm": 1.8278844356536865, "learning_rate": 8.027939618644069e-06, "loss": 1.2769, "mean_token_accuracy": 0.7173982262611389, "num_tokens": 11983819.0, "step": 14894 }, { "epoch": 3.944915254237288, "grad_norm": 1.9661471843719482, "learning_rate": 8.027674788135594e-06, "loss": 1.3744, "mean_token_accuracy": 0.6806420087814331, "num_tokens": 11985453.0, "step": 14896 }, { "epoch": 3.945444915254237, "grad_norm": 1.8589674234390259, "learning_rate": 8.027409957627119e-06, "loss": 1.6397, "mean_token_accuracy": 0.6290698945522308, "num_tokens": 11987192.0, "step": 14898 }, { "epoch": 3.9459745762711864, "grad_norm": 1.8677929639816284, "learning_rate": 8.027145127118645e-06, "loss": 1.2557, "mean_token_accuracy": 0.7262867987155914, "num_tokens": 11989061.0, "step": 14900 }, { "epoch": 3.946504237288136, "grad_norm": 2.206812858581543, "learning_rate": 8.02688029661017e-06, "loss": 1.5628, "mean_token_accuracy": 0.6683987528085709, "num_tokens": 11990693.0, "step": 14902 }, { "epoch": 3.9470338983050848, "grad_norm": 2.1804511547088623, "learning_rate": 8.026615466101695e-06, "loss": 1.5226, "mean_token_accuracy": 0.6539265662431717, "num_tokens": 11992147.0, "step": 14904 }, { "epoch": 3.9475635593220337, "grad_norm": 2.017890691757202, "learning_rate": 8.02635063559322e-06, "loss": 1.4807, "mean_token_accuracy": 0.6791545376181602, "num_tokens": 11993600.0, "step": 14906 }, { "epoch": 3.948093220338983, "grad_norm": 1.4544599056243896, "learning_rate": 8.026085805084747e-06, "loss": 1.2524, "mean_token_accuracy": 0.6790826693177223, "num_tokens": 11995596.0, "step": 14908 }, { "epoch": 3.948622881355932, "grad_norm": 2.140888214111328, "learning_rate": 8.025820974576272e-06, "loss": 1.2058, "mean_token_accuracy": 0.713333822786808, "num_tokens": 11996892.0, "step": 14910 }, { "epoch": 3.9491525423728815, "grad_norm": 1.500363826751709, "learning_rate": 8.025556144067798e-06, "loss": 1.0937, "mean_token_accuracy": 0.7580773755908012, "num_tokens": 11998433.0, "step": 14912 }, { "epoch": 3.9496822033898304, "grad_norm": 1.704494833946228, "learning_rate": 8.025291313559323e-06, "loss": 1.0876, "mean_token_accuracy": 0.7308604791760445, "num_tokens": 11999879.0, "step": 14914 }, { "epoch": 3.9502118644067794, "grad_norm": 2.6628198623657227, "learning_rate": 8.025026483050848e-06, "loss": 1.3145, "mean_token_accuracy": 0.7101369351148605, "num_tokens": 12001256.0, "step": 14916 }, { "epoch": 3.950741525423729, "grad_norm": 1.675222635269165, "learning_rate": 8.024761652542373e-06, "loss": 1.3585, "mean_token_accuracy": 0.7068056687712669, "num_tokens": 12002904.0, "step": 14918 }, { "epoch": 3.951271186440678, "grad_norm": 1.8190422058105469, "learning_rate": 8.0244968220339e-06, "loss": 1.1986, "mean_token_accuracy": 0.7061188369989395, "num_tokens": 12004399.0, "step": 14920 }, { "epoch": 3.951800847457627, "grad_norm": 1.7057909965515137, "learning_rate": 8.024231991525425e-06, "loss": 1.0483, "mean_token_accuracy": 0.7438122779130936, "num_tokens": 12005891.0, "step": 14922 }, { "epoch": 3.952330508474576, "grad_norm": 1.7184728384017944, "learning_rate": 8.02396716101695e-06, "loss": 1.5001, "mean_token_accuracy": 0.66335079818964, "num_tokens": 12007866.0, "step": 14924 }, { "epoch": 3.9528601694915255, "grad_norm": 2.2095134258270264, "learning_rate": 8.023702330508474e-06, "loss": 1.6315, "mean_token_accuracy": 0.6717564538121223, "num_tokens": 12009478.0, "step": 14926 }, { "epoch": 3.9533898305084745, "grad_norm": 1.7899832725524902, "learning_rate": 8.023437500000001e-06, "loss": 1.176, "mean_token_accuracy": 0.7385480999946594, "num_tokens": 12010897.0, "step": 14928 }, { "epoch": 3.953919491525424, "grad_norm": 1.9097641706466675, "learning_rate": 8.023172669491526e-06, "loss": 1.2519, "mean_token_accuracy": 0.7021163031458855, "num_tokens": 12012459.0, "step": 14930 }, { "epoch": 3.954449152542373, "grad_norm": 1.7585713863372803, "learning_rate": 8.022907838983051e-06, "loss": 1.2133, "mean_token_accuracy": 0.7180660888552666, "num_tokens": 12013882.0, "step": 14932 }, { "epoch": 3.954978813559322, "grad_norm": 1.81403386592865, "learning_rate": 8.022643008474576e-06, "loss": 1.495, "mean_token_accuracy": 0.6609267145395279, "num_tokens": 12015577.0, "step": 14934 }, { "epoch": 3.955508474576271, "grad_norm": 2.019850015640259, "learning_rate": 8.022378177966102e-06, "loss": 1.4554, "mean_token_accuracy": 0.7004449293017387, "num_tokens": 12017029.0, "step": 14936 }, { "epoch": 3.9560381355932206, "grad_norm": 2.0283682346343994, "learning_rate": 8.022113347457627e-06, "loss": 1.1753, "mean_token_accuracy": 0.7174528762698174, "num_tokens": 12018607.0, "step": 14938 }, { "epoch": 3.9565677966101696, "grad_norm": 1.8129445314407349, "learning_rate": 8.021848516949154e-06, "loss": 1.0531, "mean_token_accuracy": 0.7761840149760246, "num_tokens": 12020164.0, "step": 14940 }, { "epoch": 3.9570974576271185, "grad_norm": 1.944146990776062, "learning_rate": 8.021583686440679e-06, "loss": 1.278, "mean_token_accuracy": 0.738555159419775, "num_tokens": 12021991.0, "step": 14942 }, { "epoch": 3.957627118644068, "grad_norm": 1.9484320878982544, "learning_rate": 8.021318855932204e-06, "loss": 1.3777, "mean_token_accuracy": 0.7129555605351925, "num_tokens": 12023356.0, "step": 14944 }, { "epoch": 3.958156779661017, "grad_norm": 1.7711176872253418, "learning_rate": 8.021054025423729e-06, "loss": 0.9622, "mean_token_accuracy": 0.7570440918207169, "num_tokens": 12025158.0, "step": 14946 }, { "epoch": 3.9586864406779663, "grad_norm": 1.7712247371673584, "learning_rate": 8.020789194915255e-06, "loss": 1.2821, "mean_token_accuracy": 0.7007188349962234, "num_tokens": 12026852.0, "step": 14948 }, { "epoch": 3.9592161016949152, "grad_norm": 1.8957393169403076, "learning_rate": 8.02052436440678e-06, "loss": 1.2212, "mean_token_accuracy": 0.7162303924560547, "num_tokens": 12028487.0, "step": 14950 }, { "epoch": 3.959745762711864, "grad_norm": 1.9017434120178223, "learning_rate": 8.020259533898305e-06, "loss": 1.7031, "mean_token_accuracy": 0.6452986896038055, "num_tokens": 12030112.0, "step": 14952 }, { "epoch": 3.9602754237288136, "grad_norm": 1.7896071672439575, "learning_rate": 8.01999470338983e-06, "loss": 1.0645, "mean_token_accuracy": 0.7365733608603477, "num_tokens": 12031753.0, "step": 14954 }, { "epoch": 3.960805084745763, "grad_norm": 2.154566526412964, "learning_rate": 8.019729872881357e-06, "loss": 0.9636, "mean_token_accuracy": 0.7583287954330444, "num_tokens": 12033317.0, "step": 14956 }, { "epoch": 3.961334745762712, "grad_norm": 1.965600848197937, "learning_rate": 8.019465042372882e-06, "loss": 0.9553, "mean_token_accuracy": 0.7970998771488667, "num_tokens": 12034854.0, "step": 14958 }, { "epoch": 3.961864406779661, "grad_norm": 1.3834283351898193, "learning_rate": 8.019200211864407e-06, "loss": 0.9339, "mean_token_accuracy": 0.7609597966074944, "num_tokens": 12036403.0, "step": 14960 }, { "epoch": 3.9623940677966103, "grad_norm": 1.8332595825195312, "learning_rate": 8.018935381355933e-06, "loss": 0.8378, "mean_token_accuracy": 0.786383181810379, "num_tokens": 12037776.0, "step": 14962 }, { "epoch": 3.9629237288135593, "grad_norm": 1.755593180656433, "learning_rate": 8.018670550847458e-06, "loss": 1.1742, "mean_token_accuracy": 0.7516996189951897, "num_tokens": 12039442.0, "step": 14964 }, { "epoch": 3.9634533898305087, "grad_norm": 2.2084097862243652, "learning_rate": 8.018405720338985e-06, "loss": 1.4019, "mean_token_accuracy": 0.6668859571218491, "num_tokens": 12040869.0, "step": 14966 }, { "epoch": 3.9639830508474576, "grad_norm": 1.8645539283752441, "learning_rate": 8.01814088983051e-06, "loss": 1.6128, "mean_token_accuracy": 0.6515542343258858, "num_tokens": 12042428.0, "step": 14968 }, { "epoch": 3.9645127118644066, "grad_norm": 1.986628532409668, "learning_rate": 8.017876059322035e-06, "loss": 1.0938, "mean_token_accuracy": 0.7443865314126015, "num_tokens": 12044110.0, "step": 14970 }, { "epoch": 3.965042372881356, "grad_norm": 1.8638867139816284, "learning_rate": 8.01761122881356e-06, "loss": 1.5218, "mean_token_accuracy": 0.6656753495335579, "num_tokens": 12045645.0, "step": 14972 }, { "epoch": 3.965572033898305, "grad_norm": 1.8786914348602295, "learning_rate": 8.017346398305086e-06, "loss": 1.1409, "mean_token_accuracy": 0.7109318450093269, "num_tokens": 12048048.0, "step": 14974 }, { "epoch": 3.9661016949152543, "grad_norm": 1.6143158674240112, "learning_rate": 8.017081567796611e-06, "loss": 1.1871, "mean_token_accuracy": 0.705032654106617, "num_tokens": 12050236.0, "step": 14976 }, { "epoch": 3.9666313559322033, "grad_norm": 1.8988720178604126, "learning_rate": 8.016816737288136e-06, "loss": 1.1649, "mean_token_accuracy": 0.7506815269589424, "num_tokens": 12051872.0, "step": 14978 }, { "epoch": 3.9671610169491527, "grad_norm": 1.8900854587554932, "learning_rate": 8.016551906779661e-06, "loss": 1.2513, "mean_token_accuracy": 0.7291438803076744, "num_tokens": 12053428.0, "step": 14980 }, { "epoch": 3.9676906779661016, "grad_norm": 1.9344141483306885, "learning_rate": 8.016287076271188e-06, "loss": 1.514, "mean_token_accuracy": 0.6527272164821625, "num_tokens": 12055032.0, "step": 14982 }, { "epoch": 3.968220338983051, "grad_norm": 2.3025174140930176, "learning_rate": 8.016022245762713e-06, "loss": 1.6867, "mean_token_accuracy": 0.6422966420650482, "num_tokens": 12056765.0, "step": 14984 }, { "epoch": 3.96875, "grad_norm": 2.0268394947052, "learning_rate": 8.015757415254238e-06, "loss": 1.4823, "mean_token_accuracy": 0.6849265545606613, "num_tokens": 12058398.0, "step": 14986 }, { "epoch": 3.969279661016949, "grad_norm": 1.4457546472549438, "learning_rate": 8.015492584745762e-06, "loss": 1.1244, "mean_token_accuracy": 0.7259707674384117, "num_tokens": 12060307.0, "step": 14988 }, { "epoch": 3.9698093220338984, "grad_norm": 1.389622449874878, "learning_rate": 8.015227754237289e-06, "loss": 0.7833, "mean_token_accuracy": 0.8101689219474792, "num_tokens": 12062368.0, "step": 14990 }, { "epoch": 3.9703389830508473, "grad_norm": 1.6149784326553345, "learning_rate": 8.014962923728814e-06, "loss": 1.0146, "mean_token_accuracy": 0.7670185081660748, "num_tokens": 12063820.0, "step": 14992 }, { "epoch": 3.9708686440677967, "grad_norm": 1.543934941291809, "learning_rate": 8.01469809322034e-06, "loss": 0.964, "mean_token_accuracy": 0.7476478964090347, "num_tokens": 12065641.0, "step": 14994 }, { "epoch": 3.9713983050847457, "grad_norm": 1.6857272386550903, "learning_rate": 8.014433262711866e-06, "loss": 1.0417, "mean_token_accuracy": 0.7326041385531425, "num_tokens": 12067103.0, "step": 14996 }, { "epoch": 3.971927966101695, "grad_norm": 1.652502179145813, "learning_rate": 8.01416843220339e-06, "loss": 1.0398, "mean_token_accuracy": 0.7499602735042572, "num_tokens": 12068778.0, "step": 14998 }, { "epoch": 3.972457627118644, "grad_norm": 1.7722963094711304, "learning_rate": 8.013903601694915e-06, "loss": 0.945, "step": 15000 }, { "epoch": 3.972457627118644, "eval_loss": 1.3075815439224243, "eval_mean_token_accuracy": 0.7019086767520223, "eval_num_tokens": 12070272.0, "eval_runtime": 48.2414, "eval_samples_per_second": 6.385, "eval_steps_per_second": 6.385, "step": 15000 }, { "epoch": 3.9729872881355934, "grad_norm": 1.7737162113189697, "learning_rate": 8.013638771186442e-06, "loss": 1.3258, "mean_token_accuracy": 0.7425017841160297, "num_tokens": 12072046.0, "step": 15002 }, { "epoch": 3.9735169491525424, "grad_norm": 1.7911251783370972, "learning_rate": 8.013373940677967e-06, "loss": 1.2099, "mean_token_accuracy": 0.692983590066433, "num_tokens": 12073674.0, "step": 15004 }, { "epoch": 3.9740466101694913, "grad_norm": 1.7340625524520874, "learning_rate": 8.013109110169492e-06, "loss": 1.1614, "mean_token_accuracy": 0.7202673703432083, "num_tokens": 12075172.0, "step": 15006 }, { "epoch": 3.9745762711864407, "grad_norm": 1.52547025680542, "learning_rate": 8.012844279661017e-06, "loss": 0.947, "mean_token_accuracy": 0.7805057689547539, "num_tokens": 12076566.0, "step": 15008 }, { "epoch": 3.9751059322033897, "grad_norm": 1.5032920837402344, "learning_rate": 8.012579449152543e-06, "loss": 0.9533, "mean_token_accuracy": 0.7730900198221207, "num_tokens": 12078136.0, "step": 15010 }, { "epoch": 3.975635593220339, "grad_norm": 1.6340646743774414, "learning_rate": 8.012314618644068e-06, "loss": 1.1007, "mean_token_accuracy": 0.7553076222538948, "num_tokens": 12079889.0, "step": 15012 }, { "epoch": 3.976165254237288, "grad_norm": 1.9273767471313477, "learning_rate": 8.012049788135593e-06, "loss": 1.4615, "mean_token_accuracy": 0.6566740423440933, "num_tokens": 12081524.0, "step": 15014 }, { "epoch": 3.976694915254237, "grad_norm": 2.0927679538726807, "learning_rate": 8.011784957627118e-06, "loss": 1.2458, "mean_token_accuracy": 0.7313569262623787, "num_tokens": 12082780.0, "step": 15016 }, { "epoch": 3.9772245762711864, "grad_norm": 1.942940592765808, "learning_rate": 8.011520127118645e-06, "loss": 1.0967, "mean_token_accuracy": 0.7063293606042862, "num_tokens": 12085017.0, "step": 15018 }, { "epoch": 3.977754237288136, "grad_norm": 2.023385524749756, "learning_rate": 8.01125529661017e-06, "loss": 1.4718, "mean_token_accuracy": 0.692021831870079, "num_tokens": 12086682.0, "step": 15020 }, { "epoch": 3.9782838983050848, "grad_norm": 1.9669301509857178, "learning_rate": 8.010990466101696e-06, "loss": 1.1899, "mean_token_accuracy": 0.7353275045752525, "num_tokens": 12088119.0, "step": 15022 }, { "epoch": 3.9788135593220337, "grad_norm": 2.2008442878723145, "learning_rate": 8.010725635593221e-06, "loss": 1.294, "mean_token_accuracy": 0.7002424523234367, "num_tokens": 12089507.0, "step": 15024 }, { "epoch": 3.979343220338983, "grad_norm": 1.8784087896347046, "learning_rate": 8.010460805084746e-06, "loss": 1.1855, "mean_token_accuracy": 0.724195584654808, "num_tokens": 12090955.0, "step": 15026 }, { "epoch": 3.979872881355932, "grad_norm": 1.6130962371826172, "learning_rate": 8.010195974576271e-06, "loss": 1.0371, "mean_token_accuracy": 0.754771888256073, "num_tokens": 12092530.0, "step": 15028 }, { "epoch": 3.9804025423728815, "grad_norm": 2.1423325538635254, "learning_rate": 8.009931144067798e-06, "loss": 1.4874, "mean_token_accuracy": 0.6735110878944397, "num_tokens": 12093726.0, "step": 15030 }, { "epoch": 3.9809322033898304, "grad_norm": 2.119168996810913, "learning_rate": 8.009666313559323e-06, "loss": 1.4175, "mean_token_accuracy": 0.6943483576178551, "num_tokens": 12095433.0, "step": 15032 }, { "epoch": 3.9814618644067794, "grad_norm": 1.861586332321167, "learning_rate": 8.009401483050848e-06, "loss": 1.3521, "mean_token_accuracy": 0.7037074714899063, "num_tokens": 12097019.0, "step": 15034 }, { "epoch": 3.981991525423729, "grad_norm": 1.8261034488677979, "learning_rate": 8.009136652542373e-06, "loss": 0.9607, "mean_token_accuracy": 0.7615076303482056, "num_tokens": 12098498.0, "step": 15036 }, { "epoch": 3.982521186440678, "grad_norm": 2.1221396923065186, "learning_rate": 8.0088718220339e-06, "loss": 1.0079, "mean_token_accuracy": 0.7472770363092422, "num_tokens": 12100123.0, "step": 15038 }, { "epoch": 3.983050847457627, "grad_norm": 2.092275381088257, "learning_rate": 8.008606991525424e-06, "loss": 1.462, "mean_token_accuracy": 0.6711657121777534, "num_tokens": 12101598.0, "step": 15040 }, { "epoch": 3.983580508474576, "grad_norm": 1.5178847312927246, "learning_rate": 8.008342161016949e-06, "loss": 1.1769, "mean_token_accuracy": 0.7459277287125587, "num_tokens": 12103111.0, "step": 15042 }, { "epoch": 3.9841101694915255, "grad_norm": 1.9485044479370117, "learning_rate": 8.008077330508474e-06, "loss": 1.3726, "mean_token_accuracy": 0.7106454521417618, "num_tokens": 12104429.0, "step": 15044 }, { "epoch": 3.9846398305084745, "grad_norm": 1.9063223600387573, "learning_rate": 8.0078125e-06, "loss": 1.7473, "mean_token_accuracy": 0.6144200265407562, "num_tokens": 12106249.0, "step": 15046 }, { "epoch": 3.985169491525424, "grad_norm": 1.9165008068084717, "learning_rate": 8.007547669491527e-06, "loss": 1.0425, "mean_token_accuracy": 0.7471847608685493, "num_tokens": 12107717.0, "step": 15048 }, { "epoch": 3.985699152542373, "grad_norm": 1.9558130502700806, "learning_rate": 8.007282838983052e-06, "loss": 1.3332, "mean_token_accuracy": 0.7123166620731354, "num_tokens": 12109431.0, "step": 15050 }, { "epoch": 3.986228813559322, "grad_norm": 2.0038046836853027, "learning_rate": 8.007018008474577e-06, "loss": 1.1627, "mean_token_accuracy": 0.7292734980583191, "num_tokens": 12110778.0, "step": 15052 }, { "epoch": 3.986758474576271, "grad_norm": 1.716590404510498, "learning_rate": 8.006753177966102e-06, "loss": 1.0552, "mean_token_accuracy": 0.7324491441249847, "num_tokens": 12112422.0, "step": 15054 }, { "epoch": 3.9872881355932206, "grad_norm": 1.8866732120513916, "learning_rate": 8.006488347457629e-06, "loss": 1.3679, "mean_token_accuracy": 0.6846789196133614, "num_tokens": 12113864.0, "step": 15056 }, { "epoch": 3.9878177966101696, "grad_norm": 1.7653965950012207, "learning_rate": 8.006223516949154e-06, "loss": 1.4152, "mean_token_accuracy": 0.6815445423126221, "num_tokens": 12115477.0, "step": 15058 }, { "epoch": 3.9883474576271185, "grad_norm": 1.9407511949539185, "learning_rate": 8.005958686440679e-06, "loss": 1.2334, "mean_token_accuracy": 0.7065108940005302, "num_tokens": 12117430.0, "step": 15060 }, { "epoch": 3.988877118644068, "grad_norm": 1.7843680381774902, "learning_rate": 8.005693855932203e-06, "loss": 1.2683, "mean_token_accuracy": 0.6944917589426041, "num_tokens": 12119239.0, "step": 15062 }, { "epoch": 3.989406779661017, "grad_norm": 1.7655917406082153, "learning_rate": 8.00542902542373e-06, "loss": 1.4006, "mean_token_accuracy": 0.6934065371751785, "num_tokens": 12120640.0, "step": 15064 }, { "epoch": 3.9899364406779663, "grad_norm": 2.2365548610687256, "learning_rate": 8.005164194915255e-06, "loss": 1.1593, "mean_token_accuracy": 0.7342672944068909, "num_tokens": 12122002.0, "step": 15066 }, { "epoch": 3.9904661016949152, "grad_norm": 1.924369215965271, "learning_rate": 8.00489936440678e-06, "loss": 1.1338, "mean_token_accuracy": 0.7498621344566345, "num_tokens": 12123361.0, "step": 15068 }, { "epoch": 3.990995762711864, "grad_norm": 1.6311020851135254, "learning_rate": 8.004634533898305e-06, "loss": 1.0097, "mean_token_accuracy": 0.7531079277396202, "num_tokens": 12125355.0, "step": 15070 }, { "epoch": 3.9915254237288136, "grad_norm": 2.1408164501190186, "learning_rate": 8.004369703389832e-06, "loss": 1.7256, "mean_token_accuracy": 0.62970120459795, "num_tokens": 12127168.0, "step": 15072 }, { "epoch": 3.992055084745763, "grad_norm": 2.1466450691223145, "learning_rate": 8.004104872881356e-06, "loss": 1.2266, "mean_token_accuracy": 0.7070472538471222, "num_tokens": 12128830.0, "step": 15074 }, { "epoch": 3.992584745762712, "grad_norm": 1.751929759979248, "learning_rate": 8.003840042372883e-06, "loss": 0.9998, "mean_token_accuracy": 0.763231910765171, "num_tokens": 12130251.0, "step": 15076 }, { "epoch": 3.993114406779661, "grad_norm": 1.7258398532867432, "learning_rate": 8.003575211864408e-06, "loss": 1.4035, "mean_token_accuracy": 0.6765083223581314, "num_tokens": 12131862.0, "step": 15078 }, { "epoch": 3.9936440677966103, "grad_norm": 2.1885592937469482, "learning_rate": 8.003310381355933e-06, "loss": 1.687, "mean_token_accuracy": 0.6764085665345192, "num_tokens": 12133517.0, "step": 15080 }, { "epoch": 3.9941737288135593, "grad_norm": 1.6738234758377075, "learning_rate": 8.003045550847458e-06, "loss": 1.2732, "mean_token_accuracy": 0.7044965885579586, "num_tokens": 12135068.0, "step": 15082 }, { "epoch": 3.9947033898305087, "grad_norm": 1.8682340383529663, "learning_rate": 8.002780720338984e-06, "loss": 1.0371, "mean_token_accuracy": 0.7717448994517326, "num_tokens": 12136597.0, "step": 15084 }, { "epoch": 3.9952330508474576, "grad_norm": 1.7402540445327759, "learning_rate": 8.00251588983051e-06, "loss": 1.226, "mean_token_accuracy": 0.7107035368680954, "num_tokens": 12138365.0, "step": 15086 }, { "epoch": 3.9957627118644066, "grad_norm": 1.6158335208892822, "learning_rate": 8.002251059322034e-06, "loss": 1.2162, "mean_token_accuracy": 0.7256046012043953, "num_tokens": 12139935.0, "step": 15088 }, { "epoch": 3.996292372881356, "grad_norm": 1.6271365880966187, "learning_rate": 8.00198622881356e-06, "loss": 0.9795, "mean_token_accuracy": 0.7426039129495621, "num_tokens": 12141575.0, "step": 15090 }, { "epoch": 3.996822033898305, "grad_norm": 1.9604543447494507, "learning_rate": 8.001721398305086e-06, "loss": 1.233, "mean_token_accuracy": 0.6997645497322083, "num_tokens": 12143312.0, "step": 15092 }, { "epoch": 3.9973516949152543, "grad_norm": 1.52470064163208, "learning_rate": 8.00145656779661e-06, "loss": 1.4466, "mean_token_accuracy": 0.6671043857932091, "num_tokens": 12144875.0, "step": 15094 }, { "epoch": 3.9978813559322033, "grad_norm": 1.9046770334243774, "learning_rate": 8.001191737288136e-06, "loss": 1.3489, "mean_token_accuracy": 0.7096444666385651, "num_tokens": 12146759.0, "step": 15096 }, { "epoch": 3.9984110169491527, "grad_norm": 1.947816014289856, "learning_rate": 8.00092690677966e-06, "loss": 1.4556, "mean_token_accuracy": 0.6735513359308243, "num_tokens": 12148240.0, "step": 15098 }, { "epoch": 3.9989406779661016, "grad_norm": 1.4395807981491089, "learning_rate": 8.000662076271187e-06, "loss": 1.0801, "mean_token_accuracy": 0.7313874214887619, "num_tokens": 12150265.0, "step": 15100 }, { "epoch": 3.999470338983051, "grad_norm": 1.4257948398590088, "learning_rate": 8.000397245762712e-06, "loss": 0.9706, "mean_token_accuracy": 0.7805569022893906, "num_tokens": 12151749.0, "step": 15102 }, { "epoch": 4.0, "grad_norm": 1.9866288900375366, "learning_rate": 8.000132415254239e-06, "loss": 0.8991, "mean_token_accuracy": 0.7912695333361626, "num_tokens": 12153072.0, "step": 15104 }, { "epoch": 4.000529661016949, "grad_norm": 1.6656097173690796, "learning_rate": 7.999867584745764e-06, "loss": 1.5777, "mean_token_accuracy": 0.6483635753393173, "num_tokens": 12154683.0, "step": 15106 }, { "epoch": 4.001059322033898, "grad_norm": 2.2320308685302734, "learning_rate": 7.999602754237289e-06, "loss": 1.5866, "mean_token_accuracy": 0.676315926015377, "num_tokens": 12156371.0, "step": 15108 }, { "epoch": 4.001588983050848, "grad_norm": 1.4047085046768188, "learning_rate": 7.999337923728814e-06, "loss": 0.838, "mean_token_accuracy": 0.7876498401165009, "num_tokens": 12158087.0, "step": 15110 }, { "epoch": 4.002118644067797, "grad_norm": 1.912828803062439, "learning_rate": 7.99907309322034e-06, "loss": 1.5284, "mean_token_accuracy": 0.6611652448773384, "num_tokens": 12159495.0, "step": 15112 }, { "epoch": 4.002648305084746, "grad_norm": 1.9352995157241821, "learning_rate": 7.998808262711865e-06, "loss": 0.913, "mean_token_accuracy": 0.7577668353915215, "num_tokens": 12161048.0, "step": 15114 }, { "epoch": 4.003177966101695, "grad_norm": 1.996639370918274, "learning_rate": 7.99854343220339e-06, "loss": 1.5327, "mean_token_accuracy": 0.6466988027095795, "num_tokens": 12162651.0, "step": 15116 }, { "epoch": 4.0037076271186445, "grad_norm": 1.8781960010528564, "learning_rate": 7.998278601694915e-06, "loss": 1.7322, "mean_token_accuracy": 0.6032566577196121, "num_tokens": 12164216.0, "step": 15118 }, { "epoch": 4.004237288135593, "grad_norm": 1.8550958633422852, "learning_rate": 7.998013771186442e-06, "loss": 1.3921, "mean_token_accuracy": 0.6781604886054993, "num_tokens": 12166105.0, "step": 15120 }, { "epoch": 4.004766949152542, "grad_norm": 1.896256446838379, "learning_rate": 7.997748940677967e-06, "loss": 1.4887, "mean_token_accuracy": 0.671863503754139, "num_tokens": 12167756.0, "step": 15122 }, { "epoch": 4.005296610169491, "grad_norm": 1.7423080205917358, "learning_rate": 7.997484110169492e-06, "loss": 1.3815, "mean_token_accuracy": 0.6689557731151581, "num_tokens": 12169587.0, "step": 15124 }, { "epoch": 4.00582627118644, "grad_norm": 1.7422071695327759, "learning_rate": 7.997219279661016e-06, "loss": 1.2812, "mean_token_accuracy": 0.7075339332222939, "num_tokens": 12170871.0, "step": 15126 }, { "epoch": 4.00635593220339, "grad_norm": 1.951916217803955, "learning_rate": 7.996954449152543e-06, "loss": 1.1086, "mean_token_accuracy": 0.7368421033024788, "num_tokens": 12172180.0, "step": 15128 }, { "epoch": 4.006885593220339, "grad_norm": 1.6175137758255005, "learning_rate": 7.996689618644068e-06, "loss": 0.9003, "mean_token_accuracy": 0.7696218267083168, "num_tokens": 12174069.0, "step": 15130 }, { "epoch": 4.007415254237288, "grad_norm": 1.8554985523223877, "learning_rate": 7.996424788135595e-06, "loss": 1.495, "mean_token_accuracy": 0.6527461409568787, "num_tokens": 12175740.0, "step": 15132 }, { "epoch": 4.007944915254237, "grad_norm": 2.0795044898986816, "learning_rate": 7.99615995762712e-06, "loss": 1.3513, "mean_token_accuracy": 0.693188950419426, "num_tokens": 12177253.0, "step": 15134 }, { "epoch": 4.008474576271187, "grad_norm": 2.1550419330596924, "learning_rate": 7.995895127118645e-06, "loss": 1.3929, "mean_token_accuracy": 0.6831222325563431, "num_tokens": 12178504.0, "step": 15136 }, { "epoch": 4.009004237288136, "grad_norm": 1.2401223182678223, "learning_rate": 7.995630296610171e-06, "loss": 0.9961, "mean_token_accuracy": 0.7684364169836044, "num_tokens": 12180138.0, "step": 15138 }, { "epoch": 4.009533898305085, "grad_norm": 2.2672762870788574, "learning_rate": 7.995365466101696e-06, "loss": 1.4052, "mean_token_accuracy": 0.6846196800470352, "num_tokens": 12181688.0, "step": 15140 }, { "epoch": 4.010063559322034, "grad_norm": 1.4877229928970337, "learning_rate": 7.995100635593221e-06, "loss": 1.1684, "mean_token_accuracy": 0.7247875183820724, "num_tokens": 12183433.0, "step": 15142 }, { "epoch": 4.010593220338983, "grad_norm": 1.5715831518173218, "learning_rate": 7.994835805084746e-06, "loss": 0.9145, "mean_token_accuracy": 0.77119380235672, "num_tokens": 12185179.0, "step": 15144 }, { "epoch": 4.0111228813559325, "grad_norm": 1.9047893285751343, "learning_rate": 7.994570974576273e-06, "loss": 0.9328, "mean_token_accuracy": 0.7700111120939255, "num_tokens": 12186589.0, "step": 15146 }, { "epoch": 4.0116525423728815, "grad_norm": 1.976176142692566, "learning_rate": 7.994306144067797e-06, "loss": 1.1725, "mean_token_accuracy": 0.7223108410835266, "num_tokens": 12188300.0, "step": 15148 }, { "epoch": 4.0121822033898304, "grad_norm": 2.2705538272857666, "learning_rate": 7.994041313559322e-06, "loss": 1.3825, "mean_token_accuracy": 0.7069604620337486, "num_tokens": 12190236.0, "step": 15150 }, { "epoch": 4.012711864406779, "grad_norm": 1.8922030925750732, "learning_rate": 7.993776483050847e-06, "loss": 1.2376, "mean_token_accuracy": 0.6986160427331924, "num_tokens": 12191950.0, "step": 15152 }, { "epoch": 4.013241525423729, "grad_norm": 1.4457151889801025, "learning_rate": 7.993511652542374e-06, "loss": 0.7615, "mean_token_accuracy": 0.7923343181610107, "num_tokens": 12193380.0, "step": 15154 }, { "epoch": 4.013771186440678, "grad_norm": 1.8286561965942383, "learning_rate": 7.993246822033899e-06, "loss": 1.414, "mean_token_accuracy": 0.6971585154533386, "num_tokens": 12195061.0, "step": 15156 }, { "epoch": 4.014300847457627, "grad_norm": 1.9692113399505615, "learning_rate": 7.992981991525426e-06, "loss": 1.3526, "mean_token_accuracy": 0.6896887347102165, "num_tokens": 12196635.0, "step": 15158 }, { "epoch": 4.014830508474576, "grad_norm": 1.3820737600326538, "learning_rate": 7.99271716101695e-06, "loss": 0.9646, "mean_token_accuracy": 0.7549552991986275, "num_tokens": 12199193.0, "step": 15160 }, { "epoch": 4.015360169491525, "grad_norm": 2.105125665664673, "learning_rate": 7.992452330508475e-06, "loss": 0.9276, "mean_token_accuracy": 0.7674803510308266, "num_tokens": 12200673.0, "step": 15162 }, { "epoch": 4.015889830508475, "grad_norm": 1.3935550451278687, "learning_rate": 7.9921875e-06, "loss": 0.9376, "mean_token_accuracy": 0.7736928090453148, "num_tokens": 12202451.0, "step": 15164 }, { "epoch": 4.016419491525424, "grad_norm": 1.8718785047531128, "learning_rate": 7.991922669491527e-06, "loss": 1.1278, "mean_token_accuracy": 0.7374526560306549, "num_tokens": 12203976.0, "step": 15166 }, { "epoch": 4.016949152542373, "grad_norm": 2.071751832962036, "learning_rate": 7.991657838983052e-06, "loss": 1.4781, "mean_token_accuracy": 0.6701912507414818, "num_tokens": 12205396.0, "step": 15168 }, { "epoch": 4.017478813559322, "grad_norm": 2.197713613510132, "learning_rate": 7.991393008474577e-06, "loss": 1.587, "mean_token_accuracy": 0.6808894649147987, "num_tokens": 12206957.0, "step": 15170 }, { "epoch": 4.018008474576271, "grad_norm": 1.376275897026062, "learning_rate": 7.991128177966102e-06, "loss": 0.9298, "mean_token_accuracy": 0.7821977064013481, "num_tokens": 12209264.0, "step": 15172 }, { "epoch": 4.018538135593221, "grad_norm": 1.5355304479599, "learning_rate": 7.990863347457628e-06, "loss": 0.8448, "mean_token_accuracy": 0.7987356409430504, "num_tokens": 12210927.0, "step": 15174 }, { "epoch": 4.0190677966101696, "grad_norm": 1.6968493461608887, "learning_rate": 7.990598516949153e-06, "loss": 1.128, "mean_token_accuracy": 0.7274770736694336, "num_tokens": 12212534.0, "step": 15176 }, { "epoch": 4.0195974576271185, "grad_norm": 1.9109407663345337, "learning_rate": 7.990333686440678e-06, "loss": 1.0021, "mean_token_accuracy": 0.7558713033795357, "num_tokens": 12214067.0, "step": 15178 }, { "epoch": 4.0201271186440675, "grad_norm": 1.9465500116348267, "learning_rate": 7.990068855932203e-06, "loss": 1.5316, "mean_token_accuracy": 0.6645179092884064, "num_tokens": 12215667.0, "step": 15180 }, { "epoch": 4.020656779661017, "grad_norm": 2.195513963699341, "learning_rate": 7.98980402542373e-06, "loss": 1.6004, "mean_token_accuracy": 0.6286685019731522, "num_tokens": 12217169.0, "step": 15182 }, { "epoch": 4.021186440677966, "grad_norm": 2.0871572494506836, "learning_rate": 7.989539194915255e-06, "loss": 1.2197, "mean_token_accuracy": 0.7214291915297508, "num_tokens": 12218577.0, "step": 15184 }, { "epoch": 4.021716101694915, "grad_norm": 1.7861007452011108, "learning_rate": 7.989274364406781e-06, "loss": 1.5439, "mean_token_accuracy": 0.6639059484004974, "num_tokens": 12220138.0, "step": 15186 }, { "epoch": 4.022245762711864, "grad_norm": 1.9915016889572144, "learning_rate": 7.989009533898305e-06, "loss": 1.1037, "mean_token_accuracy": 0.7359582632780075, "num_tokens": 12221884.0, "step": 15188 }, { "epoch": 4.022775423728813, "grad_norm": 1.7973393201828003, "learning_rate": 7.988744703389831e-06, "loss": 1.4402, "mean_token_accuracy": 0.655731312930584, "num_tokens": 12223945.0, "step": 15190 }, { "epoch": 4.023305084745763, "grad_norm": 1.569579005241394, "learning_rate": 7.988479872881356e-06, "loss": 0.991, "mean_token_accuracy": 0.765790730714798, "num_tokens": 12225508.0, "step": 15192 }, { "epoch": 4.023834745762712, "grad_norm": 1.6070754528045654, "learning_rate": 7.988215042372883e-06, "loss": 1.2598, "mean_token_accuracy": 0.7058896273374557, "num_tokens": 12227230.0, "step": 15194 }, { "epoch": 4.024364406779661, "grad_norm": 1.8995403051376343, "learning_rate": 7.987950211864408e-06, "loss": 1.1835, "mean_token_accuracy": 0.7338443584740162, "num_tokens": 12228890.0, "step": 15196 }, { "epoch": 4.02489406779661, "grad_norm": 1.8959819078445435, "learning_rate": 7.987685381355933e-06, "loss": 1.5701, "mean_token_accuracy": 0.6784035265445709, "num_tokens": 12230308.0, "step": 15198 }, { "epoch": 4.02542372881356, "grad_norm": 1.9390392303466797, "learning_rate": 7.987420550847457e-06, "loss": 1.1707, "mean_token_accuracy": 0.7481083422899246, "num_tokens": 12231506.0, "step": 15200 }, { "epoch": 4.025953389830509, "grad_norm": 2.0707554817199707, "learning_rate": 7.987155720338984e-06, "loss": 1.2238, "mean_token_accuracy": 0.7349425218999386, "num_tokens": 12233110.0, "step": 15202 }, { "epoch": 4.026483050847458, "grad_norm": 2.7047348022460938, "learning_rate": 7.986890889830509e-06, "loss": 1.2668, "mean_token_accuracy": 0.6891000345349312, "num_tokens": 12234603.0, "step": 15204 }, { "epoch": 4.027012711864407, "grad_norm": 1.7571415901184082, "learning_rate": 7.986626059322034e-06, "loss": 1.2686, "mean_token_accuracy": 0.7136504128575325, "num_tokens": 12236332.0, "step": 15206 }, { "epoch": 4.0275423728813555, "grad_norm": 1.905120611190796, "learning_rate": 7.986361228813559e-06, "loss": 1.379, "mean_token_accuracy": 0.682972826063633, "num_tokens": 12238080.0, "step": 15208 }, { "epoch": 4.028072033898305, "grad_norm": 1.6121673583984375, "learning_rate": 7.986096398305086e-06, "loss": 0.977, "mean_token_accuracy": 0.75330550968647, "num_tokens": 12239666.0, "step": 15210 }, { "epoch": 4.028601694915254, "grad_norm": 1.7332963943481445, "learning_rate": 7.98583156779661e-06, "loss": 1.2989, "mean_token_accuracy": 0.6884420216083527, "num_tokens": 12241306.0, "step": 15212 }, { "epoch": 4.029131355932203, "grad_norm": 2.248868703842163, "learning_rate": 7.985566737288137e-06, "loss": 1.2173, "mean_token_accuracy": 0.7255138233304024, "num_tokens": 12242730.0, "step": 15214 }, { "epoch": 4.029661016949152, "grad_norm": 2.018054485321045, "learning_rate": 7.985301906779662e-06, "loss": 1.5038, "mean_token_accuracy": 0.6693429201841354, "num_tokens": 12244271.0, "step": 15216 }, { "epoch": 4.030190677966102, "grad_norm": 1.9962453842163086, "learning_rate": 7.985037076271187e-06, "loss": 1.0271, "mean_token_accuracy": 0.7461788579821587, "num_tokens": 12245766.0, "step": 15218 }, { "epoch": 4.030720338983051, "grad_norm": 1.8122825622558594, "learning_rate": 7.984772245762714e-06, "loss": 1.1485, "mean_token_accuracy": 0.7534533217549324, "num_tokens": 12247093.0, "step": 15220 }, { "epoch": 4.03125, "grad_norm": 2.2495458126068115, "learning_rate": 7.984507415254238e-06, "loss": 1.2482, "mean_token_accuracy": 0.7371094524860382, "num_tokens": 12248500.0, "step": 15222 }, { "epoch": 4.031779661016949, "grad_norm": 1.6986030340194702, "learning_rate": 7.984242584745763e-06, "loss": 1.1399, "mean_token_accuracy": 0.7066855803132057, "num_tokens": 12250329.0, "step": 15224 }, { "epoch": 4.032309322033898, "grad_norm": 1.7474116086959839, "learning_rate": 7.983977754237288e-06, "loss": 1.5087, "mean_token_accuracy": 0.6693164333701134, "num_tokens": 12252035.0, "step": 15226 }, { "epoch": 4.032838983050848, "grad_norm": 2.3908987045288086, "learning_rate": 7.983712923728815e-06, "loss": 1.7477, "mean_token_accuracy": 0.6212185695767403, "num_tokens": 12253571.0, "step": 15228 }, { "epoch": 4.033368644067797, "grad_norm": 1.7636114358901978, "learning_rate": 7.98344809322034e-06, "loss": 1.3231, "mean_token_accuracy": 0.6996048539876938, "num_tokens": 12255318.0, "step": 15230 }, { "epoch": 4.033898305084746, "grad_norm": 1.5480420589447021, "learning_rate": 7.983183262711865e-06, "loss": 1.1806, "mean_token_accuracy": 0.7292919903993607, "num_tokens": 12257115.0, "step": 15232 }, { "epoch": 4.034427966101695, "grad_norm": 2.0924410820007324, "learning_rate": 7.98291843220339e-06, "loss": 1.4548, "mean_token_accuracy": 0.6888410821557045, "num_tokens": 12258652.0, "step": 15234 }, { "epoch": 4.0349576271186445, "grad_norm": 2.120079517364502, "learning_rate": 7.982653601694916e-06, "loss": 1.5515, "mean_token_accuracy": 0.6676912605762482, "num_tokens": 12260289.0, "step": 15236 }, { "epoch": 4.035487288135593, "grad_norm": 1.6783064603805542, "learning_rate": 7.982388771186441e-06, "loss": 1.0566, "mean_token_accuracy": 0.7286493182182312, "num_tokens": 12261854.0, "step": 15238 }, { "epoch": 4.036016949152542, "grad_norm": 1.6179912090301514, "learning_rate": 7.982123940677968e-06, "loss": 0.9272, "mean_token_accuracy": 0.7755047455430031, "num_tokens": 12263268.0, "step": 15240 }, { "epoch": 4.036546610169491, "grad_norm": 1.9782520532608032, "learning_rate": 7.981859110169491e-06, "loss": 1.0089, "mean_token_accuracy": 0.7283122017979622, "num_tokens": 12264896.0, "step": 15242 }, { "epoch": 4.03707627118644, "grad_norm": 2.4746100902557373, "learning_rate": 7.981594279661018e-06, "loss": 1.5931, "mean_token_accuracy": 0.6348579376935959, "num_tokens": 12266473.0, "step": 15244 }, { "epoch": 4.03760593220339, "grad_norm": 1.9464387893676758, "learning_rate": 7.981329449152543e-06, "loss": 1.2607, "mean_token_accuracy": 0.7388557568192482, "num_tokens": 12268398.0, "step": 15246 }, { "epoch": 4.038135593220339, "grad_norm": 1.9467350244522095, "learning_rate": 7.98106461864407e-06, "loss": 1.4802, "mean_token_accuracy": 0.6508570238947868, "num_tokens": 12270133.0, "step": 15248 }, { "epoch": 4.038665254237288, "grad_norm": 1.7334595918655396, "learning_rate": 7.980799788135594e-06, "loss": 1.5338, "step": 15250 }, { "epoch": 4.038665254237288, "eval_loss": 1.3120927810668945, "eval_mean_token_accuracy": 0.7015063984246998, "eval_num_tokens": 12271984.0, "eval_runtime": 48.2702, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 15250 }, { "epoch": 4.039194915254237, "grad_norm": 1.7407464981079102, "learning_rate": 7.98053495762712e-06, "loss": 1.1027, "mean_token_accuracy": 0.6931486837565899, "num_tokens": 12273589.0, "step": 15252 }, { "epoch": 4.039724576271187, "grad_norm": 1.6355605125427246, "learning_rate": 7.980270127118644e-06, "loss": 1.0586, "mean_token_accuracy": 0.7634303346276283, "num_tokens": 12275061.0, "step": 15254 }, { "epoch": 4.040254237288136, "grad_norm": 1.9984294176101685, "learning_rate": 7.98000529661017e-06, "loss": 1.3013, "mean_token_accuracy": 0.7209960222244263, "num_tokens": 12276639.0, "step": 15256 }, { "epoch": 4.040783898305085, "grad_norm": 1.8196715116500854, "learning_rate": 7.979740466101696e-06, "loss": 0.9771, "mean_token_accuracy": 0.7600238472223282, "num_tokens": 12277993.0, "step": 15258 }, { "epoch": 4.041313559322034, "grad_norm": 2.015225648880005, "learning_rate": 7.97947563559322e-06, "loss": 1.7106, "mean_token_accuracy": 0.6189523600041866, "num_tokens": 12279493.0, "step": 15260 }, { "epoch": 4.041843220338983, "grad_norm": 1.5824006795883179, "learning_rate": 7.979210805084746e-06, "loss": 1.2604, "mean_token_accuracy": 0.7053031697869301, "num_tokens": 12281149.0, "step": 15262 }, { "epoch": 4.0423728813559325, "grad_norm": 1.7342164516448975, "learning_rate": 7.978945974576272e-06, "loss": 1.4552, "mean_token_accuracy": 0.7005336806178093, "num_tokens": 12283115.0, "step": 15264 }, { "epoch": 4.0429025423728815, "grad_norm": 2.0315310955047607, "learning_rate": 7.978681144067797e-06, "loss": 1.522, "mean_token_accuracy": 0.6640025973320007, "num_tokens": 12284726.0, "step": 15266 }, { "epoch": 4.0434322033898304, "grad_norm": 2.08585524559021, "learning_rate": 7.978416313559324e-06, "loss": 1.2455, "mean_token_accuracy": 0.7291444316506386, "num_tokens": 12286181.0, "step": 15268 }, { "epoch": 4.043961864406779, "grad_norm": 1.9308451414108276, "learning_rate": 7.978151483050847e-06, "loss": 1.6349, "mean_token_accuracy": 0.6099893413484097, "num_tokens": 12287739.0, "step": 15270 }, { "epoch": 4.044491525423729, "grad_norm": 1.9447091817855835, "learning_rate": 7.977886652542374e-06, "loss": 1.7757, "mean_token_accuracy": 0.612856924533844, "num_tokens": 12289295.0, "step": 15272 }, { "epoch": 4.045021186440678, "grad_norm": 2.1534831523895264, "learning_rate": 7.977621822033899e-06, "loss": 1.269, "mean_token_accuracy": 0.7075618579983711, "num_tokens": 12290825.0, "step": 15274 }, { "epoch": 4.045550847457627, "grad_norm": 2.020587682723999, "learning_rate": 7.977356991525425e-06, "loss": 1.2778, "mean_token_accuracy": 0.7044769525527954, "num_tokens": 12292360.0, "step": 15276 }, { "epoch": 4.046080508474576, "grad_norm": 1.6792595386505127, "learning_rate": 7.97709216101695e-06, "loss": 1.2845, "mean_token_accuracy": 0.7115601003170013, "num_tokens": 12293991.0, "step": 15278 }, { "epoch": 4.046610169491525, "grad_norm": 2.097670793533325, "learning_rate": 7.976827330508475e-06, "loss": 1.697, "mean_token_accuracy": 0.6336831599473953, "num_tokens": 12295563.0, "step": 15280 }, { "epoch": 4.047139830508475, "grad_norm": 1.8042360544204712, "learning_rate": 7.9765625e-06, "loss": 1.1604, "mean_token_accuracy": 0.7321402356028557, "num_tokens": 12297296.0, "step": 15282 }, { "epoch": 4.047669491525424, "grad_norm": 1.9453321695327759, "learning_rate": 7.976297669491527e-06, "loss": 0.8931, "mean_token_accuracy": 0.781398817896843, "num_tokens": 12299036.0, "step": 15284 }, { "epoch": 4.048199152542373, "grad_norm": 1.9232118129730225, "learning_rate": 7.976032838983051e-06, "loss": 1.6938, "mean_token_accuracy": 0.6378956511616707, "num_tokens": 12300869.0, "step": 15286 }, { "epoch": 4.048728813559322, "grad_norm": 1.9692260026931763, "learning_rate": 7.975768008474576e-06, "loss": 1.0481, "mean_token_accuracy": 0.7520036175847054, "num_tokens": 12302403.0, "step": 15288 }, { "epoch": 4.049258474576271, "grad_norm": 1.942051649093628, "learning_rate": 7.975503177966101e-06, "loss": 1.048, "mean_token_accuracy": 0.7371163591742516, "num_tokens": 12304187.0, "step": 15290 }, { "epoch": 4.049788135593221, "grad_norm": 2.1599671840667725, "learning_rate": 7.975238347457628e-06, "loss": 1.285, "mean_token_accuracy": 0.7232713624835014, "num_tokens": 12305877.0, "step": 15292 }, { "epoch": 4.0503177966101696, "grad_norm": 1.9195924997329712, "learning_rate": 7.974973516949153e-06, "loss": 1.466, "mean_token_accuracy": 0.6575097367167473, "num_tokens": 12307612.0, "step": 15294 }, { "epoch": 4.0508474576271185, "grad_norm": 1.8709698915481567, "learning_rate": 7.974708686440678e-06, "loss": 1.5225, "mean_token_accuracy": 0.6682492569088936, "num_tokens": 12309104.0, "step": 15296 }, { "epoch": 4.0513771186440675, "grad_norm": 2.0596675872802734, "learning_rate": 7.974443855932203e-06, "loss": 0.9587, "mean_token_accuracy": 0.7530660182237625, "num_tokens": 12310573.0, "step": 15298 }, { "epoch": 4.051906779661017, "grad_norm": 2.1332173347473145, "learning_rate": 7.97417902542373e-06, "loss": 1.9343, "mean_token_accuracy": 0.5874402187764645, "num_tokens": 12312222.0, "step": 15300 }, { "epoch": 4.052436440677966, "grad_norm": 2.1640772819519043, "learning_rate": 7.973914194915256e-06, "loss": 1.541, "mean_token_accuracy": 0.6490663960576057, "num_tokens": 12313943.0, "step": 15302 }, { "epoch": 4.052966101694915, "grad_norm": 2.158599376678467, "learning_rate": 7.973649364406781e-06, "loss": 1.2286, "mean_token_accuracy": 0.7136587202548981, "num_tokens": 12315531.0, "step": 15304 }, { "epoch": 4.053495762711864, "grad_norm": 2.1878204345703125, "learning_rate": 7.973384533898306e-06, "loss": 1.309, "mean_token_accuracy": 0.7068038582801819, "num_tokens": 12317155.0, "step": 15306 }, { "epoch": 4.054025423728813, "grad_norm": 1.654868483543396, "learning_rate": 7.97311970338983e-06, "loss": 1.1056, "mean_token_accuracy": 0.7408239990472794, "num_tokens": 12318958.0, "step": 15308 }, { "epoch": 4.054555084745763, "grad_norm": 1.9290175437927246, "learning_rate": 7.972854872881357e-06, "loss": 1.4883, "mean_token_accuracy": 0.6605124548077583, "num_tokens": 12320435.0, "step": 15310 }, { "epoch": 4.055084745762712, "grad_norm": 1.6700751781463623, "learning_rate": 7.972590042372882e-06, "loss": 1.1398, "mean_token_accuracy": 0.71800147742033, "num_tokens": 12322001.0, "step": 15312 }, { "epoch": 4.055614406779661, "grad_norm": 2.214123010635376, "learning_rate": 7.972325211864407e-06, "loss": 1.5915, "mean_token_accuracy": 0.6685840636491776, "num_tokens": 12323410.0, "step": 15314 }, { "epoch": 4.05614406779661, "grad_norm": 1.6033326387405396, "learning_rate": 7.972060381355932e-06, "loss": 0.7232, "mean_token_accuracy": 0.8126573786139488, "num_tokens": 12324621.0, "step": 15316 }, { "epoch": 4.05667372881356, "grad_norm": 1.8629242181777954, "learning_rate": 7.971795550847459e-06, "loss": 1.1734, "mean_token_accuracy": 0.7361813113093376, "num_tokens": 12326454.0, "step": 15318 }, { "epoch": 4.057203389830509, "grad_norm": 1.7750657796859741, "learning_rate": 7.971530720338984e-06, "loss": 1.0026, "mean_token_accuracy": 0.7742109447717667, "num_tokens": 12327849.0, "step": 15320 }, { "epoch": 4.057733050847458, "grad_norm": 1.6916232109069824, "learning_rate": 7.97126588983051e-06, "loss": 1.1899, "mean_token_accuracy": 0.7113954573869705, "num_tokens": 12329695.0, "step": 15322 }, { "epoch": 4.058262711864407, "grad_norm": 1.8065884113311768, "learning_rate": 7.971001059322034e-06, "loss": 1.3458, "mean_token_accuracy": 0.7065009623765945, "num_tokens": 12331290.0, "step": 15324 }, { "epoch": 4.0587923728813555, "grad_norm": 2.2065482139587402, "learning_rate": 7.97073622881356e-06, "loss": 1.6, "mean_token_accuracy": 0.6631977930665016, "num_tokens": 12333160.0, "step": 15326 }, { "epoch": 4.059322033898305, "grad_norm": 1.9816529750823975, "learning_rate": 7.970471398305085e-06, "loss": 0.6658, "mean_token_accuracy": 0.8352462947368622, "num_tokens": 12334426.0, "step": 15328 }, { "epoch": 4.059851694915254, "grad_norm": 2.0762240886688232, "learning_rate": 7.970206567796612e-06, "loss": 1.166, "mean_token_accuracy": 0.729506604373455, "num_tokens": 12335788.0, "step": 15330 }, { "epoch": 4.060381355932203, "grad_norm": 1.9843740463256836, "learning_rate": 7.969941737288137e-06, "loss": 1.3222, "mean_token_accuracy": 0.6860094517469406, "num_tokens": 12337179.0, "step": 15332 }, { "epoch": 4.060911016949152, "grad_norm": 1.7518976926803589, "learning_rate": 7.969676906779662e-06, "loss": 1.2212, "mean_token_accuracy": 0.7068235352635384, "num_tokens": 12339035.0, "step": 15334 }, { "epoch": 4.061440677966102, "grad_norm": 1.8786718845367432, "learning_rate": 7.969412076271187e-06, "loss": 1.4058, "mean_token_accuracy": 0.6743138134479523, "num_tokens": 12340538.0, "step": 15336 }, { "epoch": 4.061970338983051, "grad_norm": 1.8444241285324097, "learning_rate": 7.969147245762713e-06, "loss": 1.1927, "mean_token_accuracy": 0.7269127145409584, "num_tokens": 12342060.0, "step": 15338 }, { "epoch": 4.0625, "grad_norm": 1.916422963142395, "learning_rate": 7.968882415254238e-06, "loss": 1.412, "mean_token_accuracy": 0.7058818191289902, "num_tokens": 12343450.0, "step": 15340 }, { "epoch": 4.063029661016949, "grad_norm": 1.7174506187438965, "learning_rate": 7.968617584745763e-06, "loss": 0.9859, "mean_token_accuracy": 0.7498049065470695, "num_tokens": 12344921.0, "step": 15342 }, { "epoch": 4.063559322033898, "grad_norm": 1.974013090133667, "learning_rate": 7.968352754237288e-06, "loss": 1.1331, "mean_token_accuracy": 0.7340465188026428, "num_tokens": 12346392.0, "step": 15344 }, { "epoch": 4.064088983050848, "grad_norm": 2.5147745609283447, "learning_rate": 7.968087923728815e-06, "loss": 1.1155, "mean_token_accuracy": 0.7538645267486572, "num_tokens": 12348064.0, "step": 15346 }, { "epoch": 4.064618644067797, "grad_norm": 2.155832529067993, "learning_rate": 7.96782309322034e-06, "loss": 1.5195, "mean_token_accuracy": 0.6683863177895546, "num_tokens": 12349759.0, "step": 15348 }, { "epoch": 4.065148305084746, "grad_norm": 1.5614681243896484, "learning_rate": 7.967558262711864e-06, "loss": 1.1188, "mean_token_accuracy": 0.7329070419073105, "num_tokens": 12351332.0, "step": 15350 }, { "epoch": 4.065677966101695, "grad_norm": 2.210825204849243, "learning_rate": 7.96729343220339e-06, "loss": 1.367, "mean_token_accuracy": 0.6988062933087349, "num_tokens": 12352625.0, "step": 15352 }, { "epoch": 4.0662076271186445, "grad_norm": 2.015504837036133, "learning_rate": 7.967028601694916e-06, "loss": 1.324, "mean_token_accuracy": 0.7063917070627213, "num_tokens": 12354532.0, "step": 15354 }, { "epoch": 4.066737288135593, "grad_norm": 1.9101543426513672, "learning_rate": 7.966763771186441e-06, "loss": 1.3797, "mean_token_accuracy": 0.6887451224029064, "num_tokens": 12356033.0, "step": 15356 }, { "epoch": 4.067266949152542, "grad_norm": 1.830784559249878, "learning_rate": 7.966498940677968e-06, "loss": 1.0174, "mean_token_accuracy": 0.7553107440471649, "num_tokens": 12357268.0, "step": 15358 }, { "epoch": 4.067796610169491, "grad_norm": 1.9477531909942627, "learning_rate": 7.966234110169492e-06, "loss": 1.267, "mean_token_accuracy": 0.7191862836480141, "num_tokens": 12358856.0, "step": 15360 }, { "epoch": 4.06832627118644, "grad_norm": 1.9271128177642822, "learning_rate": 7.965969279661017e-06, "loss": 1.6742, "mean_token_accuracy": 0.6591257229447365, "num_tokens": 12360376.0, "step": 15362 }, { "epoch": 4.06885593220339, "grad_norm": 2.3994128704071045, "learning_rate": 7.965704449152542e-06, "loss": 1.6354, "mean_token_accuracy": 0.6414657905697823, "num_tokens": 12361765.0, "step": 15364 }, { "epoch": 4.069385593220339, "grad_norm": 1.8872205018997192, "learning_rate": 7.965439618644069e-06, "loss": 1.4552, "mean_token_accuracy": 0.702350489795208, "num_tokens": 12363236.0, "step": 15366 }, { "epoch": 4.069915254237288, "grad_norm": 2.025066375732422, "learning_rate": 7.965174788135594e-06, "loss": 1.4388, "mean_token_accuracy": 0.654995784163475, "num_tokens": 12364868.0, "step": 15368 }, { "epoch": 4.070444915254237, "grad_norm": 1.7272692918777466, "learning_rate": 7.964909957627119e-06, "loss": 1.0439, "mean_token_accuracy": 0.750609926879406, "num_tokens": 12366376.0, "step": 15370 }, { "epoch": 4.070974576271187, "grad_norm": 1.963094711303711, "learning_rate": 7.964645127118644e-06, "loss": 1.3978, "mean_token_accuracy": 0.6906656101346016, "num_tokens": 12367692.0, "step": 15372 }, { "epoch": 4.071504237288136, "grad_norm": 2.382584571838379, "learning_rate": 7.96438029661017e-06, "loss": 1.1581, "mean_token_accuracy": 0.7168998047709465, "num_tokens": 12368901.0, "step": 15374 }, { "epoch": 4.072033898305085, "grad_norm": 2.2310562133789062, "learning_rate": 7.964115466101695e-06, "loss": 1.1963, "mean_token_accuracy": 0.7250301912426949, "num_tokens": 12370379.0, "step": 15376 }, { "epoch": 4.072563559322034, "grad_norm": 2.266793727874756, "learning_rate": 7.96385063559322e-06, "loss": 1.1143, "mean_token_accuracy": 0.7300899401307106, "num_tokens": 12371889.0, "step": 15378 }, { "epoch": 4.073093220338983, "grad_norm": 1.8935991525650024, "learning_rate": 7.963585805084745e-06, "loss": 1.0729, "mean_token_accuracy": 0.744352214038372, "num_tokens": 12373398.0, "step": 15380 }, { "epoch": 4.0736228813559325, "grad_norm": 1.8106518983840942, "learning_rate": 7.963320974576272e-06, "loss": 0.9873, "mean_token_accuracy": 0.7332590371370316, "num_tokens": 12375050.0, "step": 15382 }, { "epoch": 4.0741525423728815, "grad_norm": 2.0104026794433594, "learning_rate": 7.963056144067798e-06, "loss": 1.3897, "mean_token_accuracy": 0.6941860094666481, "num_tokens": 12376666.0, "step": 15384 }, { "epoch": 4.0746822033898304, "grad_norm": 2.060302734375, "learning_rate": 7.962791313559323e-06, "loss": 1.2766, "mean_token_accuracy": 0.7074554190039635, "num_tokens": 12378214.0, "step": 15386 }, { "epoch": 4.075211864406779, "grad_norm": 1.5759257078170776, "learning_rate": 7.962526483050848e-06, "loss": 0.9624, "mean_token_accuracy": 0.760133184492588, "num_tokens": 12380017.0, "step": 15388 }, { "epoch": 4.075741525423729, "grad_norm": 1.8566340208053589, "learning_rate": 7.962261652542373e-06, "loss": 1.0672, "mean_token_accuracy": 0.7698423713445663, "num_tokens": 12381551.0, "step": 15390 }, { "epoch": 4.076271186440678, "grad_norm": 1.7538996934890747, "learning_rate": 7.9619968220339e-06, "loss": 1.559, "mean_token_accuracy": 0.6523201838135719, "num_tokens": 12383276.0, "step": 15392 }, { "epoch": 4.076800847457627, "grad_norm": 1.74553382396698, "learning_rate": 7.961731991525425e-06, "loss": 1.3539, "mean_token_accuracy": 0.6830174848437309, "num_tokens": 12384831.0, "step": 15394 }, { "epoch": 4.077330508474576, "grad_norm": 1.992306113243103, "learning_rate": 7.96146716101695e-06, "loss": 0.9915, "mean_token_accuracy": 0.758327305316925, "num_tokens": 12386360.0, "step": 15396 }, { "epoch": 4.077860169491525, "grad_norm": 2.010183811187744, "learning_rate": 7.961202330508475e-06, "loss": 1.3563, "mean_token_accuracy": 0.7084720730781555, "num_tokens": 12388061.0, "step": 15398 }, { "epoch": 4.078389830508475, "grad_norm": 1.7748055458068848, "learning_rate": 7.960937500000001e-06, "loss": 1.0228, "mean_token_accuracy": 0.7452677860856056, "num_tokens": 12389760.0, "step": 15400 }, { "epoch": 4.078919491525424, "grad_norm": 2.27124285697937, "learning_rate": 7.960672669491526e-06, "loss": 1.513, "mean_token_accuracy": 0.6778479814529419, "num_tokens": 12391421.0, "step": 15402 }, { "epoch": 4.079449152542373, "grad_norm": 2.4784889221191406, "learning_rate": 7.960407838983051e-06, "loss": 1.4174, "mean_token_accuracy": 0.7055442631244659, "num_tokens": 12393119.0, "step": 15404 }, { "epoch": 4.079978813559322, "grad_norm": 1.9849036931991577, "learning_rate": 7.960143008474576e-06, "loss": 0.7749, "mean_token_accuracy": 0.8032184988260269, "num_tokens": 12394624.0, "step": 15406 }, { "epoch": 4.080508474576271, "grad_norm": 2.0400075912475586, "learning_rate": 7.959878177966103e-06, "loss": 1.5761, "mean_token_accuracy": 0.6327817253768444, "num_tokens": 12396253.0, "step": 15408 }, { "epoch": 4.081038135593221, "grad_norm": 1.820331335067749, "learning_rate": 7.959613347457628e-06, "loss": 1.2735, "mean_token_accuracy": 0.710710272192955, "num_tokens": 12397814.0, "step": 15410 }, { "epoch": 4.0815677966101696, "grad_norm": 1.8796370029449463, "learning_rate": 7.959348516949154e-06, "loss": 0.9599, "mean_token_accuracy": 0.747362531721592, "num_tokens": 12399295.0, "step": 15412 }, { "epoch": 4.0820974576271185, "grad_norm": 1.699123501777649, "learning_rate": 7.959083686440679e-06, "loss": 1.2885, "mean_token_accuracy": 0.7260444611310959, "num_tokens": 12401023.0, "step": 15414 }, { "epoch": 4.0826271186440675, "grad_norm": 1.8205703496932983, "learning_rate": 7.958818855932204e-06, "loss": 1.2595, "mean_token_accuracy": 0.6778443194925785, "num_tokens": 12402738.0, "step": 15416 }, { "epoch": 4.083156779661017, "grad_norm": 1.931556224822998, "learning_rate": 7.958554025423729e-06, "loss": 0.8206, "mean_token_accuracy": 0.7898007556796074, "num_tokens": 12404190.0, "step": 15418 }, { "epoch": 4.083686440677966, "grad_norm": 1.938687801361084, "learning_rate": 7.958289194915256e-06, "loss": 1.4476, "mean_token_accuracy": 0.6621315889060497, "num_tokens": 12405844.0, "step": 15420 }, { "epoch": 4.084216101694915, "grad_norm": 1.6075822114944458, "learning_rate": 7.95802436440678e-06, "loss": 0.9374, "mean_token_accuracy": 0.7654110230505466, "num_tokens": 12407678.0, "step": 15422 }, { "epoch": 4.084745762711864, "grad_norm": 1.9580742120742798, "learning_rate": 7.957759533898305e-06, "loss": 1.3831, "mean_token_accuracy": 0.6787829101085663, "num_tokens": 12409322.0, "step": 15424 }, { "epoch": 4.085275423728813, "grad_norm": 1.8364943265914917, "learning_rate": 7.95749470338983e-06, "loss": 1.18, "mean_token_accuracy": 0.6931453347206116, "num_tokens": 12411291.0, "step": 15426 }, { "epoch": 4.085805084745763, "grad_norm": 2.1388204097747803, "learning_rate": 7.957229872881357e-06, "loss": 1.4017, "mean_token_accuracy": 0.6648015230894089, "num_tokens": 12412925.0, "step": 15428 }, { "epoch": 4.086334745762712, "grad_norm": 2.0119049549102783, "learning_rate": 7.956965042372882e-06, "loss": 1.1264, "mean_token_accuracy": 0.7203183546662331, "num_tokens": 12414532.0, "step": 15430 }, { "epoch": 4.086864406779661, "grad_norm": 2.0916810035705566, "learning_rate": 7.956700211864407e-06, "loss": 1.1223, "mean_token_accuracy": 0.7565475478768349, "num_tokens": 12415771.0, "step": 15432 }, { "epoch": 4.08739406779661, "grad_norm": 1.789291262626648, "learning_rate": 7.956435381355932e-06, "loss": 1.0975, "mean_token_accuracy": 0.7427445352077484, "num_tokens": 12417661.0, "step": 15434 }, { "epoch": 4.08792372881356, "grad_norm": 1.864895224571228, "learning_rate": 7.956170550847458e-06, "loss": 1.1369, "mean_token_accuracy": 0.7325010746717453, "num_tokens": 12419191.0, "step": 15436 }, { "epoch": 4.088453389830509, "grad_norm": 1.8520703315734863, "learning_rate": 7.955905720338983e-06, "loss": 1.1823, "mean_token_accuracy": 0.7146513760089874, "num_tokens": 12420649.0, "step": 15438 }, { "epoch": 4.088983050847458, "grad_norm": 2.327659845352173, "learning_rate": 7.95564088983051e-06, "loss": 1.1049, "mean_token_accuracy": 0.7486214563250542, "num_tokens": 12421953.0, "step": 15440 }, { "epoch": 4.089512711864407, "grad_norm": 2.2568306922912598, "learning_rate": 7.955376059322035e-06, "loss": 0.9985, "mean_token_accuracy": 0.7475161477923393, "num_tokens": 12423689.0, "step": 15442 }, { "epoch": 4.0900423728813555, "grad_norm": 2.225504159927368, "learning_rate": 7.95511122881356e-06, "loss": 1.1814, "mean_token_accuracy": 0.731336198747158, "num_tokens": 12425135.0, "step": 15444 }, { "epoch": 4.090572033898305, "grad_norm": 1.940252423286438, "learning_rate": 7.954846398305085e-06, "loss": 1.4937, "mean_token_accuracy": 0.6536448299884796, "num_tokens": 12426582.0, "step": 15446 }, { "epoch": 4.091101694915254, "grad_norm": 2.0472002029418945, "learning_rate": 7.954581567796611e-06, "loss": 1.0504, "mean_token_accuracy": 0.773533284664154, "num_tokens": 12428006.0, "step": 15448 }, { "epoch": 4.091631355932203, "grad_norm": 1.7123225927352905, "learning_rate": 7.954316737288136e-06, "loss": 1.3203, "mean_token_accuracy": 0.6882847733795643, "num_tokens": 12429738.0, "step": 15450 }, { "epoch": 4.092161016949152, "grad_norm": 1.9969722032546997, "learning_rate": 7.954051906779661e-06, "loss": 1.0147, "mean_token_accuracy": 0.7337832301855087, "num_tokens": 12431255.0, "step": 15452 }, { "epoch": 4.092690677966102, "grad_norm": 1.8549926280975342, "learning_rate": 7.953787076271186e-06, "loss": 1.0591, "mean_token_accuracy": 0.7458667680621147, "num_tokens": 12432885.0, "step": 15454 }, { "epoch": 4.093220338983051, "grad_norm": 1.364158034324646, "learning_rate": 7.953522245762713e-06, "loss": 0.8573, "mean_token_accuracy": 0.801655188202858, "num_tokens": 12434400.0, "step": 15456 }, { "epoch": 4.09375, "grad_norm": 2.0097365379333496, "learning_rate": 7.953257415254238e-06, "loss": 1.2619, "mean_token_accuracy": 0.7016907632350922, "num_tokens": 12436061.0, "step": 15458 }, { "epoch": 4.094279661016949, "grad_norm": 2.0393052101135254, "learning_rate": 7.952992584745763e-06, "loss": 1.2972, "mean_token_accuracy": 0.7099757194519043, "num_tokens": 12437550.0, "step": 15460 }, { "epoch": 4.094809322033898, "grad_norm": 1.5402514934539795, "learning_rate": 7.952727754237288e-06, "loss": 1.178, "mean_token_accuracy": 0.7278865501284599, "num_tokens": 12439337.0, "step": 15462 }, { "epoch": 4.095338983050848, "grad_norm": 2.6585516929626465, "learning_rate": 7.952462923728814e-06, "loss": 1.1544, "mean_token_accuracy": 0.7328867167234421, "num_tokens": 12440710.0, "step": 15464 }, { "epoch": 4.095868644067797, "grad_norm": 1.6743884086608887, "learning_rate": 7.952198093220339e-06, "loss": 0.8014, "mean_token_accuracy": 0.7794725894927979, "num_tokens": 12442597.0, "step": 15466 }, { "epoch": 4.096398305084746, "grad_norm": 2.1833343505859375, "learning_rate": 7.951933262711866e-06, "loss": 1.192, "mean_token_accuracy": 0.7037214636802673, "num_tokens": 12444048.0, "step": 15468 }, { "epoch": 4.096927966101695, "grad_norm": 2.4493560791015625, "learning_rate": 7.95166843220339e-06, "loss": 1.0615, "mean_token_accuracy": 0.7659846544265747, "num_tokens": 12445816.0, "step": 15470 }, { "epoch": 4.0974576271186445, "grad_norm": 1.5106533765792847, "learning_rate": 7.951403601694916e-06, "loss": 1.0581, "mean_token_accuracy": 0.7332417666912079, "num_tokens": 12447621.0, "step": 15472 }, { "epoch": 4.097987288135593, "grad_norm": 1.951109766960144, "learning_rate": 7.951138771186442e-06, "loss": 1.3644, "mean_token_accuracy": 0.6814900413155556, "num_tokens": 12449141.0, "step": 15474 }, { "epoch": 4.098516949152542, "grad_norm": 2.0811283588409424, "learning_rate": 7.950873940677967e-06, "loss": 1.3879, "mean_token_accuracy": 0.6859285570681095, "num_tokens": 12450634.0, "step": 15476 }, { "epoch": 4.099046610169491, "grad_norm": 2.1486575603485107, "learning_rate": 7.950609110169492e-06, "loss": 1.42, "mean_token_accuracy": 0.6968643292784691, "num_tokens": 12452260.0, "step": 15478 }, { "epoch": 4.09957627118644, "grad_norm": 1.7550532817840576, "learning_rate": 7.950344279661017e-06, "loss": 1.3111, "mean_token_accuracy": 0.7083592340350151, "num_tokens": 12453926.0, "step": 15480 }, { "epoch": 4.10010593220339, "grad_norm": 1.940006971359253, "learning_rate": 7.950079449152544e-06, "loss": 1.466, "mean_token_accuracy": 0.6805068776011467, "num_tokens": 12455681.0, "step": 15482 }, { "epoch": 4.100635593220339, "grad_norm": 1.6519737243652344, "learning_rate": 7.949814618644069e-06, "loss": 1.62, "mean_token_accuracy": 0.6490965187549591, "num_tokens": 12457674.0, "step": 15484 }, { "epoch": 4.101165254237288, "grad_norm": 1.8622841835021973, "learning_rate": 7.949549788135594e-06, "loss": 0.9986, "mean_token_accuracy": 0.7644850015640259, "num_tokens": 12459249.0, "step": 15486 }, { "epoch": 4.101694915254237, "grad_norm": 2.210646867752075, "learning_rate": 7.949284957627118e-06, "loss": 1.5613, "mean_token_accuracy": 0.6584750264883041, "num_tokens": 12460955.0, "step": 15488 }, { "epoch": 4.102224576271187, "grad_norm": 1.7533419132232666, "learning_rate": 7.949020127118645e-06, "loss": 1.1024, "mean_token_accuracy": 0.7200162783265114, "num_tokens": 12462487.0, "step": 15490 }, { "epoch": 4.102754237288136, "grad_norm": 1.7860612869262695, "learning_rate": 7.94875529661017e-06, "loss": 0.9321, "mean_token_accuracy": 0.7544028535485268, "num_tokens": 12464306.0, "step": 15492 }, { "epoch": 4.103283898305085, "grad_norm": 2.6116926670074463, "learning_rate": 7.948490466101697e-06, "loss": 1.6383, "mean_token_accuracy": 0.6423506140708923, "num_tokens": 12465909.0, "step": 15494 }, { "epoch": 4.103813559322034, "grad_norm": 1.9651730060577393, "learning_rate": 7.948225635593222e-06, "loss": 1.3176, "mean_token_accuracy": 0.7176173254847527, "num_tokens": 12467257.0, "step": 15496 }, { "epoch": 4.104343220338983, "grad_norm": 1.9888145923614502, "learning_rate": 7.947960805084746e-06, "loss": 1.5062, "mean_token_accuracy": 0.6743015795946121, "num_tokens": 12468845.0, "step": 15498 }, { "epoch": 4.1048728813559325, "grad_norm": 1.8810967206954956, "learning_rate": 7.947695974576271e-06, "loss": 0.7321, "step": 15500 }, { "epoch": 4.1048728813559325, "eval_loss": 1.3110986948013306, "eval_mean_token_accuracy": 0.7013096995167918, "eval_num_tokens": 12470467.0, "eval_runtime": 48.3048, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 15500 }, { "epoch": 4.1054025423728815, "grad_norm": 1.9760563373565674, "learning_rate": 7.947431144067798e-06, "loss": 1.1556, "mean_token_accuracy": 0.7731728367507458, "num_tokens": 12472047.0, "step": 15502 }, { "epoch": 4.1059322033898304, "grad_norm": 1.6751130819320679, "learning_rate": 7.947166313559323e-06, "loss": 0.9082, "mean_token_accuracy": 0.7568393126130104, "num_tokens": 12473766.0, "step": 15504 }, { "epoch": 4.106461864406779, "grad_norm": 2.1446800231933594, "learning_rate": 7.946901483050848e-06, "loss": 1.2321, "mean_token_accuracy": 0.7060000747442245, "num_tokens": 12475168.0, "step": 15506 }, { "epoch": 4.106991525423729, "grad_norm": 1.8703352212905884, "learning_rate": 7.946636652542373e-06, "loss": 0.8788, "mean_token_accuracy": 0.771115779876709, "num_tokens": 12476845.0, "step": 15508 }, { "epoch": 4.107521186440678, "grad_norm": 1.7094879150390625, "learning_rate": 7.9463718220339e-06, "loss": 1.0533, "mean_token_accuracy": 0.7709526568651199, "num_tokens": 12478476.0, "step": 15510 }, { "epoch": 4.108050847457627, "grad_norm": 2.288782835006714, "learning_rate": 7.946106991525424e-06, "loss": 1.1697, "mean_token_accuracy": 0.7120723873376846, "num_tokens": 12480183.0, "step": 15512 }, { "epoch": 4.108580508474576, "grad_norm": 10.461604118347168, "learning_rate": 7.94584216101695e-06, "loss": 1.2217, "mean_token_accuracy": 0.7161912322044373, "num_tokens": 12481539.0, "step": 15514 }, { "epoch": 4.109110169491525, "grad_norm": 1.9380664825439453, "learning_rate": 7.945577330508474e-06, "loss": 1.2264, "mean_token_accuracy": 0.708617553114891, "num_tokens": 12483430.0, "step": 15516 }, { "epoch": 4.109639830508475, "grad_norm": 2.101752281188965, "learning_rate": 7.945312500000001e-06, "loss": 1.5734, "mean_token_accuracy": 0.6490959450602531, "num_tokens": 12485404.0, "step": 15518 }, { "epoch": 4.110169491525424, "grad_norm": 2.1563031673431396, "learning_rate": 7.945047669491526e-06, "loss": 1.5507, "mean_token_accuracy": 0.6687058955430984, "num_tokens": 12486840.0, "step": 15520 }, { "epoch": 4.110699152542373, "grad_norm": 2.1678223609924316, "learning_rate": 7.944782838983052e-06, "loss": 1.3366, "mean_token_accuracy": 0.7178799659013748, "num_tokens": 12488277.0, "step": 15522 }, { "epoch": 4.111228813559322, "grad_norm": 1.9311180114746094, "learning_rate": 7.944518008474577e-06, "loss": 1.3861, "mean_token_accuracy": 0.6926984265446663, "num_tokens": 12489975.0, "step": 15524 }, { "epoch": 4.111758474576271, "grad_norm": 2.015272855758667, "learning_rate": 7.944253177966102e-06, "loss": 1.297, "mean_token_accuracy": 0.7212227508425713, "num_tokens": 12491492.0, "step": 15526 }, { "epoch": 4.112288135593221, "grad_norm": 1.995145559310913, "learning_rate": 7.943988347457627e-06, "loss": 1.1217, "mean_token_accuracy": 0.7213874310255051, "num_tokens": 12493058.0, "step": 15528 }, { "epoch": 4.1128177966101696, "grad_norm": 1.596315622329712, "learning_rate": 7.943723516949154e-06, "loss": 1.0693, "mean_token_accuracy": 0.7353070229291916, "num_tokens": 12494328.0, "step": 15530 }, { "epoch": 4.1133474576271185, "grad_norm": 2.172883987426758, "learning_rate": 7.943458686440679e-06, "loss": 1.7709, "mean_token_accuracy": 0.6269116252660751, "num_tokens": 12495986.0, "step": 15532 }, { "epoch": 4.1138771186440675, "grad_norm": 2.2221505641937256, "learning_rate": 7.943193855932204e-06, "loss": 1.1312, "mean_token_accuracy": 0.7194543480873108, "num_tokens": 12497696.0, "step": 15534 }, { "epoch": 4.114406779661017, "grad_norm": 1.9536300897598267, "learning_rate": 7.942929025423729e-06, "loss": 1.434, "mean_token_accuracy": 0.67621149122715, "num_tokens": 12499209.0, "step": 15536 }, { "epoch": 4.114936440677966, "grad_norm": 1.482842206954956, "learning_rate": 7.942664194915255e-06, "loss": 1.2414, "mean_token_accuracy": 0.716417133808136, "num_tokens": 12500919.0, "step": 15538 }, { "epoch": 4.115466101694915, "grad_norm": 2.5806503295898438, "learning_rate": 7.94239936440678e-06, "loss": 1.3433, "mean_token_accuracy": 0.7160927057266235, "num_tokens": 12502512.0, "step": 15540 }, { "epoch": 4.115995762711864, "grad_norm": 1.7963911294937134, "learning_rate": 7.942134533898305e-06, "loss": 0.8235, "mean_token_accuracy": 0.7765600606799126, "num_tokens": 12504287.0, "step": 15542 }, { "epoch": 4.116525423728813, "grad_norm": 2.433394193649292, "learning_rate": 7.94186970338983e-06, "loss": 1.6196, "mean_token_accuracy": 0.6770513206720352, "num_tokens": 12505818.0, "step": 15544 }, { "epoch": 4.117055084745763, "grad_norm": 2.263456344604492, "learning_rate": 7.941604872881357e-06, "loss": 1.0587, "mean_token_accuracy": 0.7577389478683472, "num_tokens": 12507180.0, "step": 15546 }, { "epoch": 4.117584745762712, "grad_norm": 2.145744800567627, "learning_rate": 7.941340042372882e-06, "loss": 1.1564, "mean_token_accuracy": 0.7412908747792244, "num_tokens": 12508663.0, "step": 15548 }, { "epoch": 4.118114406779661, "grad_norm": 2.0723001956939697, "learning_rate": 7.941075211864408e-06, "loss": 1.6429, "mean_token_accuracy": 0.6485697850584984, "num_tokens": 12510410.0, "step": 15550 }, { "epoch": 4.11864406779661, "grad_norm": 2.1072731018066406, "learning_rate": 7.940810381355933e-06, "loss": 1.5781, "mean_token_accuracy": 0.6495995745062828, "num_tokens": 12512151.0, "step": 15552 }, { "epoch": 4.11917372881356, "grad_norm": 2.187894582748413, "learning_rate": 7.940545550847458e-06, "loss": 1.1571, "mean_token_accuracy": 0.7276033312082291, "num_tokens": 12513726.0, "step": 15554 }, { "epoch": 4.119703389830509, "grad_norm": 2.1331467628479004, "learning_rate": 7.940280720338985e-06, "loss": 1.2049, "mean_token_accuracy": 0.7293812707066536, "num_tokens": 12515022.0, "step": 15556 }, { "epoch": 4.120233050847458, "grad_norm": 1.9013863801956177, "learning_rate": 7.94001588983051e-06, "loss": 1.1652, "mean_token_accuracy": 0.7343636751174927, "num_tokens": 12516549.0, "step": 15558 }, { "epoch": 4.120762711864407, "grad_norm": 1.977917194366455, "learning_rate": 7.939751059322035e-06, "loss": 1.4657, "mean_token_accuracy": 0.6879323795437813, "num_tokens": 12518199.0, "step": 15560 }, { "epoch": 4.1212923728813555, "grad_norm": 2.0265283584594727, "learning_rate": 7.93948622881356e-06, "loss": 1.2153, "mean_token_accuracy": 0.6934864297509193, "num_tokens": 12519733.0, "step": 15562 }, { "epoch": 4.121822033898305, "grad_norm": 1.9752761125564575, "learning_rate": 7.939221398305086e-06, "loss": 1.2815, "mean_token_accuracy": 0.694560706615448, "num_tokens": 12521422.0, "step": 15564 }, { "epoch": 4.122351694915254, "grad_norm": 1.7410491704940796, "learning_rate": 7.938956567796611e-06, "loss": 1.3735, "mean_token_accuracy": 0.6745929419994354, "num_tokens": 12523214.0, "step": 15566 }, { "epoch": 4.122881355932203, "grad_norm": 2.022791862487793, "learning_rate": 7.938691737288136e-06, "loss": 0.9837, "mean_token_accuracy": 0.7519862279295921, "num_tokens": 12524627.0, "step": 15568 }, { "epoch": 4.123411016949152, "grad_norm": 1.940981388092041, "learning_rate": 7.938426906779661e-06, "loss": 1.4166, "mean_token_accuracy": 0.6745780184864998, "num_tokens": 12526359.0, "step": 15570 }, { "epoch": 4.123940677966102, "grad_norm": 2.1409010887145996, "learning_rate": 7.938162076271187e-06, "loss": 1.1668, "mean_token_accuracy": 0.749079667031765, "num_tokens": 12527743.0, "step": 15572 }, { "epoch": 4.124470338983051, "grad_norm": 1.9154876470565796, "learning_rate": 7.937897245762712e-06, "loss": 1.2132, "mean_token_accuracy": 0.7149161249399185, "num_tokens": 12529147.0, "step": 15574 }, { "epoch": 4.125, "grad_norm": 2.0305066108703613, "learning_rate": 7.937632415254239e-06, "loss": 0.9786, "mean_token_accuracy": 0.7661139816045761, "num_tokens": 12530744.0, "step": 15576 }, { "epoch": 4.125529661016949, "grad_norm": 1.7997982501983643, "learning_rate": 7.937367584745764e-06, "loss": 1.22, "mean_token_accuracy": 0.7196721285581589, "num_tokens": 12532322.0, "step": 15578 }, { "epoch": 4.126059322033898, "grad_norm": 2.067359209060669, "learning_rate": 7.937102754237289e-06, "loss": 1.4767, "mean_token_accuracy": 0.6975564211606979, "num_tokens": 12533906.0, "step": 15580 }, { "epoch": 4.126588983050848, "grad_norm": 2.028186798095703, "learning_rate": 7.936837923728814e-06, "loss": 1.5995, "mean_token_accuracy": 0.6372231990098953, "num_tokens": 12535520.0, "step": 15582 }, { "epoch": 4.127118644067797, "grad_norm": 2.0539164543151855, "learning_rate": 7.93657309322034e-06, "loss": 1.0531, "mean_token_accuracy": 0.7188290655612946, "num_tokens": 12537087.0, "step": 15584 }, { "epoch": 4.127648305084746, "grad_norm": 1.907483458518982, "learning_rate": 7.936308262711865e-06, "loss": 1.0356, "mean_token_accuracy": 0.7387200817465782, "num_tokens": 12538466.0, "step": 15586 }, { "epoch": 4.128177966101695, "grad_norm": 1.930853009223938, "learning_rate": 7.93604343220339e-06, "loss": 1.1053, "mean_token_accuracy": 0.7048187702894211, "num_tokens": 12540337.0, "step": 15588 }, { "epoch": 4.1287076271186445, "grad_norm": 1.7127645015716553, "learning_rate": 7.935778601694915e-06, "loss": 1.1565, "mean_token_accuracy": 0.7161311767995358, "num_tokens": 12542142.0, "step": 15590 }, { "epoch": 4.129237288135593, "grad_norm": 1.759952187538147, "learning_rate": 7.935513771186442e-06, "loss": 0.8919, "mean_token_accuracy": 0.7697297185659409, "num_tokens": 12543836.0, "step": 15592 }, { "epoch": 4.129766949152542, "grad_norm": 1.6880120038986206, "learning_rate": 7.935248940677967e-06, "loss": 0.9597, "mean_token_accuracy": 0.7657795250415802, "num_tokens": 12545186.0, "step": 15594 }, { "epoch": 4.130296610169491, "grad_norm": 1.7862590551376343, "learning_rate": 7.934984110169492e-06, "loss": 1.2063, "mean_token_accuracy": 0.717507854104042, "num_tokens": 12546997.0, "step": 15596 }, { "epoch": 4.13082627118644, "grad_norm": 2.1886518001556396, "learning_rate": 7.934719279661017e-06, "loss": 0.9892, "mean_token_accuracy": 0.7466074600815773, "num_tokens": 12548520.0, "step": 15598 }, { "epoch": 4.13135593220339, "grad_norm": 1.7693167924880981, "learning_rate": 7.934454449152543e-06, "loss": 0.7475, "mean_token_accuracy": 0.809837132692337, "num_tokens": 12549984.0, "step": 15600 }, { "epoch": 4.131885593220339, "grad_norm": 2.046125888824463, "learning_rate": 7.934189618644068e-06, "loss": 1.4195, "mean_token_accuracy": 0.699608251452446, "num_tokens": 12551530.0, "step": 15602 }, { "epoch": 4.132415254237288, "grad_norm": 2.034618854522705, "learning_rate": 7.933924788135595e-06, "loss": 1.4509, "mean_token_accuracy": 0.6490472108125687, "num_tokens": 12553208.0, "step": 15604 }, { "epoch": 4.132944915254237, "grad_norm": 1.7265870571136475, "learning_rate": 7.93365995762712e-06, "loss": 1.1897, "mean_token_accuracy": 0.7095498740673065, "num_tokens": 12555124.0, "step": 15606 }, { "epoch": 4.133474576271187, "grad_norm": 2.0969135761260986, "learning_rate": 7.933395127118645e-06, "loss": 0.8468, "mean_token_accuracy": 0.7829676195979118, "num_tokens": 12556384.0, "step": 15608 }, { "epoch": 4.134004237288136, "grad_norm": 1.6232101917266846, "learning_rate": 7.93313029661017e-06, "loss": 1.0806, "mean_token_accuracy": 0.7311525940895081, "num_tokens": 12557702.0, "step": 15610 }, { "epoch": 4.134533898305085, "grad_norm": 2.2010555267333984, "learning_rate": 7.932865466101696e-06, "loss": 1.0278, "mean_token_accuracy": 0.7487748861312866, "num_tokens": 12559019.0, "step": 15612 }, { "epoch": 4.135063559322034, "grad_norm": 1.9715356826782227, "learning_rate": 7.932600635593221e-06, "loss": 1.4518, "mean_token_accuracy": 0.670726053416729, "num_tokens": 12560384.0, "step": 15614 }, { "epoch": 4.135593220338983, "grad_norm": 1.7038320302963257, "learning_rate": 7.932335805084746e-06, "loss": 1.2325, "mean_token_accuracy": 0.725182555615902, "num_tokens": 12562145.0, "step": 15616 }, { "epoch": 4.1361228813559325, "grad_norm": 1.9523817300796509, "learning_rate": 7.932070974576271e-06, "loss": 1.3467, "mean_token_accuracy": 0.6924959421157837, "num_tokens": 12563843.0, "step": 15618 }, { "epoch": 4.1366525423728815, "grad_norm": 2.1508870124816895, "learning_rate": 7.931806144067798e-06, "loss": 1.2566, "mean_token_accuracy": 0.7270305156707764, "num_tokens": 12565152.0, "step": 15620 }, { "epoch": 4.1371822033898304, "grad_norm": 2.2950406074523926, "learning_rate": 7.931541313559323e-06, "loss": 1.5775, "mean_token_accuracy": 0.6451103650033474, "num_tokens": 12566605.0, "step": 15622 }, { "epoch": 4.137711864406779, "grad_norm": 2.0175793170928955, "learning_rate": 7.931276483050848e-06, "loss": 0.9521, "mean_token_accuracy": 0.7885771840810776, "num_tokens": 12568011.0, "step": 15624 }, { "epoch": 4.138241525423728, "grad_norm": 2.0110220909118652, "learning_rate": 7.931011652542372e-06, "loss": 1.2642, "mean_token_accuracy": 0.7070495709776878, "num_tokens": 12569372.0, "step": 15626 }, { "epoch": 4.138771186440678, "grad_norm": 2.2054877281188965, "learning_rate": 7.930746822033899e-06, "loss": 1.5483, "mean_token_accuracy": 0.6591548472642899, "num_tokens": 12571118.0, "step": 15628 }, { "epoch": 4.139300847457627, "grad_norm": 1.9482769966125488, "learning_rate": 7.930481991525424e-06, "loss": 1.4637, "mean_token_accuracy": 0.6602965667843819, "num_tokens": 12572869.0, "step": 15630 }, { "epoch": 4.139830508474576, "grad_norm": 2.400198221206665, "learning_rate": 7.93021716101695e-06, "loss": 1.2777, "mean_token_accuracy": 0.7074092552065849, "num_tokens": 12574180.0, "step": 15632 }, { "epoch": 4.140360169491525, "grad_norm": 2.089439630508423, "learning_rate": 7.929952330508474e-06, "loss": 1.1867, "mean_token_accuracy": 0.7256153225898743, "num_tokens": 12575599.0, "step": 15634 }, { "epoch": 4.140889830508475, "grad_norm": 2.0051565170288086, "learning_rate": 7.9296875e-06, "loss": 1.5108, "mean_token_accuracy": 0.65235935151577, "num_tokens": 12577170.0, "step": 15636 }, { "epoch": 4.141419491525424, "grad_norm": 2.2808494567871094, "learning_rate": 7.929422669491527e-06, "loss": 1.3123, "mean_token_accuracy": 0.7004632577300072, "num_tokens": 12579310.0, "step": 15638 }, { "epoch": 4.141949152542373, "grad_norm": 1.637010931968689, "learning_rate": 7.929157838983052e-06, "loss": 0.6103, "mean_token_accuracy": 0.8224740847945213, "num_tokens": 12580713.0, "step": 15640 }, { "epoch": 4.142478813559322, "grad_norm": 2.282418727874756, "learning_rate": 7.928893008474577e-06, "loss": 1.6873, "mean_token_accuracy": 0.6292588487267494, "num_tokens": 12582455.0, "step": 15642 }, { "epoch": 4.143008474576272, "grad_norm": 2.1354422569274902, "learning_rate": 7.928628177966102e-06, "loss": 1.2964, "mean_token_accuracy": 0.6869313642382622, "num_tokens": 12584150.0, "step": 15644 }, { "epoch": 4.143538135593221, "grad_norm": 1.7400460243225098, "learning_rate": 7.928363347457629e-06, "loss": 0.7959, "mean_token_accuracy": 0.7907379120588303, "num_tokens": 12585941.0, "step": 15646 }, { "epoch": 4.1440677966101696, "grad_norm": 2.1230921745300293, "learning_rate": 7.928098516949153e-06, "loss": 1.1391, "mean_token_accuracy": 0.7464002370834351, "num_tokens": 12587420.0, "step": 15648 }, { "epoch": 4.1445974576271185, "grad_norm": 1.7499710321426392, "learning_rate": 7.927833686440678e-06, "loss": 0.9888, "mean_token_accuracy": 0.7536895349621773, "num_tokens": 12589161.0, "step": 15650 }, { "epoch": 4.1451271186440675, "grad_norm": 2.520548105239868, "learning_rate": 7.927568855932203e-06, "loss": 1.5714, "mean_token_accuracy": 0.6490125954151154, "num_tokens": 12590915.0, "step": 15652 }, { "epoch": 4.145656779661017, "grad_norm": 2.209608316421509, "learning_rate": 7.92730402542373e-06, "loss": 1.6105, "mean_token_accuracy": 0.6452417820692062, "num_tokens": 12592400.0, "step": 15654 }, { "epoch": 4.146186440677966, "grad_norm": 1.6681591272354126, "learning_rate": 7.927039194915255e-06, "loss": 0.8958, "mean_token_accuracy": 0.7777844071388245, "num_tokens": 12594195.0, "step": 15656 }, { "epoch": 4.146716101694915, "grad_norm": 2.034201145172119, "learning_rate": 7.926774364406781e-06, "loss": 1.1042, "mean_token_accuracy": 0.7546025291085243, "num_tokens": 12595727.0, "step": 15658 }, { "epoch": 4.147245762711864, "grad_norm": 1.9499154090881348, "learning_rate": 7.926509533898306e-06, "loss": 1.2367, "mean_token_accuracy": 0.7047041468322277, "num_tokens": 12597451.0, "step": 15660 }, { "epoch": 4.147775423728813, "grad_norm": 1.813014268875122, "learning_rate": 7.926244703389831e-06, "loss": 1.2677, "mean_token_accuracy": 0.7158059775829315, "num_tokens": 12599320.0, "step": 15662 }, { "epoch": 4.148305084745763, "grad_norm": 1.8089427947998047, "learning_rate": 7.925979872881356e-06, "loss": 1.4045, "mean_token_accuracy": 0.6738962307572365, "num_tokens": 12601307.0, "step": 15664 }, { "epoch": 4.148834745762712, "grad_norm": 2.3309454917907715, "learning_rate": 7.925715042372883e-06, "loss": 1.2415, "mean_token_accuracy": 0.7210907638072968, "num_tokens": 12602645.0, "step": 15666 }, { "epoch": 4.149364406779661, "grad_norm": 2.2904460430145264, "learning_rate": 7.925450211864408e-06, "loss": 1.4804, "mean_token_accuracy": 0.6554808914661407, "num_tokens": 12604035.0, "step": 15668 }, { "epoch": 4.14989406779661, "grad_norm": 2.1399736404418945, "learning_rate": 7.925185381355933e-06, "loss": 1.2157, "mean_token_accuracy": 0.729526162147522, "num_tokens": 12605428.0, "step": 15670 }, { "epoch": 4.15042372881356, "grad_norm": 2.0570807456970215, "learning_rate": 7.924920550847458e-06, "loss": 1.328, "mean_token_accuracy": 0.7011258080601692, "num_tokens": 12606787.0, "step": 15672 }, { "epoch": 4.150953389830509, "grad_norm": 1.7440871000289917, "learning_rate": 7.924655720338984e-06, "loss": 1.2431, "mean_token_accuracy": 0.6925017610192299, "num_tokens": 12608438.0, "step": 15674 }, { "epoch": 4.151483050847458, "grad_norm": 1.976464867591858, "learning_rate": 7.92439088983051e-06, "loss": 1.1658, "mean_token_accuracy": 0.7284301221370697, "num_tokens": 12609951.0, "step": 15676 }, { "epoch": 4.152012711864407, "grad_norm": 1.8359627723693848, "learning_rate": 7.924126059322034e-06, "loss": 1.162, "mean_token_accuracy": 0.7192312404513359, "num_tokens": 12611577.0, "step": 15678 }, { "epoch": 4.1525423728813555, "grad_norm": 1.8172383308410645, "learning_rate": 7.923861228813559e-06, "loss": 1.1099, "mean_token_accuracy": 0.7165554016828537, "num_tokens": 12613182.0, "step": 15680 }, { "epoch": 4.153072033898305, "grad_norm": 2.36164927482605, "learning_rate": 7.923596398305086e-06, "loss": 1.0938, "mean_token_accuracy": 0.7396888434886932, "num_tokens": 12614577.0, "step": 15682 }, { "epoch": 4.153601694915254, "grad_norm": 2.0241644382476807, "learning_rate": 7.92333156779661e-06, "loss": 1.1246, "mean_token_accuracy": 0.712107390165329, "num_tokens": 12616177.0, "step": 15684 }, { "epoch": 4.154131355932203, "grad_norm": 1.8585704565048218, "learning_rate": 7.923066737288137e-06, "loss": 1.2511, "mean_token_accuracy": 0.7122378274798393, "num_tokens": 12617705.0, "step": 15686 }, { "epoch": 4.154661016949152, "grad_norm": 2.0353190898895264, "learning_rate": 7.92280190677966e-06, "loss": 1.5362, "mean_token_accuracy": 0.6658288612961769, "num_tokens": 12619411.0, "step": 15688 }, { "epoch": 4.155190677966102, "grad_norm": 1.7942429780960083, "learning_rate": 7.922537076271187e-06, "loss": 1.1886, "mean_token_accuracy": 0.6899764016270638, "num_tokens": 12621110.0, "step": 15690 }, { "epoch": 4.155720338983051, "grad_norm": 2.22733473777771, "learning_rate": 7.922272245762712e-06, "loss": 1.3775, "mean_token_accuracy": 0.6931520327925682, "num_tokens": 12622485.0, "step": 15692 }, { "epoch": 4.15625, "grad_norm": 1.9951688051223755, "learning_rate": 7.922007415254239e-06, "loss": 1.4126, "mean_token_accuracy": 0.6770648956298828, "num_tokens": 12624076.0, "step": 15694 }, { "epoch": 4.156779661016949, "grad_norm": 1.6791410446166992, "learning_rate": 7.921742584745764e-06, "loss": 1.1414, "mean_token_accuracy": 0.7278711348772049, "num_tokens": 12625705.0, "step": 15696 }, { "epoch": 4.157309322033898, "grad_norm": 2.2476882934570312, "learning_rate": 7.921477754237289e-06, "loss": 1.3401, "mean_token_accuracy": 0.7091564163565636, "num_tokens": 12627251.0, "step": 15698 }, { "epoch": 4.157838983050848, "grad_norm": 1.6422499418258667, "learning_rate": 7.921212923728813e-06, "loss": 1.0107, "mean_token_accuracy": 0.7508955225348473, "num_tokens": 12628738.0, "step": 15700 }, { "epoch": 4.158368644067797, "grad_norm": 2.0584988594055176, "learning_rate": 7.92094809322034e-06, "loss": 1.2919, "mean_token_accuracy": 0.7079212665557861, "num_tokens": 12630188.0, "step": 15702 }, { "epoch": 4.158898305084746, "grad_norm": 2.2960660457611084, "learning_rate": 7.920683262711865e-06, "loss": 1.1272, "mean_token_accuracy": 0.7312520816922188, "num_tokens": 12631652.0, "step": 15704 }, { "epoch": 4.159427966101695, "grad_norm": 1.6525315046310425, "learning_rate": 7.92041843220339e-06, "loss": 0.8959, "mean_token_accuracy": 0.765212818980217, "num_tokens": 12633495.0, "step": 15706 }, { "epoch": 4.1599576271186445, "grad_norm": 1.7289483547210693, "learning_rate": 7.920153601694915e-06, "loss": 0.7522, "mean_token_accuracy": 0.8111189156770706, "num_tokens": 12635011.0, "step": 15708 }, { "epoch": 4.160487288135593, "grad_norm": 1.7904337644577026, "learning_rate": 7.919888771186441e-06, "loss": 1.2656, "mean_token_accuracy": 0.735616609454155, "num_tokens": 12636589.0, "step": 15710 }, { "epoch": 4.161016949152542, "grad_norm": 2.098560094833374, "learning_rate": 7.919623940677966e-06, "loss": 1.0971, "mean_token_accuracy": 0.760384164750576, "num_tokens": 12638067.0, "step": 15712 }, { "epoch": 4.161546610169491, "grad_norm": 1.8879244327545166, "learning_rate": 7.919359110169493e-06, "loss": 1.5628, "mean_token_accuracy": 0.6414240002632141, "num_tokens": 12639846.0, "step": 15714 }, { "epoch": 4.16207627118644, "grad_norm": 1.8411202430725098, "learning_rate": 7.919094279661016e-06, "loss": 1.1858, "mean_token_accuracy": 0.7215894162654877, "num_tokens": 12641385.0, "step": 15716 }, { "epoch": 4.16260593220339, "grad_norm": 2.123687744140625, "learning_rate": 7.918829449152543e-06, "loss": 0.8387, "mean_token_accuracy": 0.7855154722929001, "num_tokens": 12642497.0, "step": 15718 }, { "epoch": 4.163135593220339, "grad_norm": 2.0620248317718506, "learning_rate": 7.918564618644068e-06, "loss": 0.9734, "mean_token_accuracy": 0.7539335265755653, "num_tokens": 12644033.0, "step": 15720 }, { "epoch": 4.163665254237288, "grad_norm": 2.2492425441741943, "learning_rate": 7.918299788135594e-06, "loss": 1.178, "mean_token_accuracy": 0.7200702652335167, "num_tokens": 12645451.0, "step": 15722 }, { "epoch": 4.164194915254237, "grad_norm": 2.3448591232299805, "learning_rate": 7.91803495762712e-06, "loss": 1.2693, "mean_token_accuracy": 0.7143087610602379, "num_tokens": 12647008.0, "step": 15724 }, { "epoch": 4.164724576271187, "grad_norm": 2.0098073482513428, "learning_rate": 7.917770127118644e-06, "loss": 1.2929, "mean_token_accuracy": 0.704055480659008, "num_tokens": 12648590.0, "step": 15726 }, { "epoch": 4.165254237288136, "grad_norm": 2.274446964263916, "learning_rate": 7.917505296610171e-06, "loss": 1.4838, "mean_token_accuracy": 0.6694875918328762, "num_tokens": 12650317.0, "step": 15728 }, { "epoch": 4.165783898305085, "grad_norm": 1.8021811246871948, "learning_rate": 7.917240466101696e-06, "loss": 1.2706, "mean_token_accuracy": 0.6845874637365341, "num_tokens": 12651930.0, "step": 15730 }, { "epoch": 4.166313559322034, "grad_norm": 1.827836275100708, "learning_rate": 7.91697563559322e-06, "loss": 1.1164, "mean_token_accuracy": 0.7647690549492836, "num_tokens": 12653440.0, "step": 15732 }, { "epoch": 4.166843220338983, "grad_norm": 2.2237765789031982, "learning_rate": 7.916710805084746e-06, "loss": 1.7988, "mean_token_accuracy": 0.60954749584198, "num_tokens": 12655009.0, "step": 15734 }, { "epoch": 4.1673728813559325, "grad_norm": 2.57692551612854, "learning_rate": 7.916445974576272e-06, "loss": 1.6134, "mean_token_accuracy": 0.6375944800674915, "num_tokens": 12656444.0, "step": 15736 }, { "epoch": 4.1679025423728815, "grad_norm": 1.8501622676849365, "learning_rate": 7.916181144067797e-06, "loss": 1.0448, "mean_token_accuracy": 0.7324673607945442, "num_tokens": 12658103.0, "step": 15738 }, { "epoch": 4.1684322033898304, "grad_norm": 2.3179092407226562, "learning_rate": 7.915916313559324e-06, "loss": 1.252, "mean_token_accuracy": 0.7181921005249023, "num_tokens": 12659573.0, "step": 15740 }, { "epoch": 4.168961864406779, "grad_norm": 2.0293221473693848, "learning_rate": 7.915651483050847e-06, "loss": 1.3674, "mean_token_accuracy": 0.7119286507368088, "num_tokens": 12660985.0, "step": 15742 }, { "epoch": 4.169491525423728, "grad_norm": 1.7846177816390991, "learning_rate": 7.915386652542374e-06, "loss": 0.9031, "mean_token_accuracy": 0.7580352053046227, "num_tokens": 12662609.0, "step": 15744 }, { "epoch": 4.170021186440678, "grad_norm": 2.2854623794555664, "learning_rate": 7.915121822033899e-06, "loss": 1.3935, "mean_token_accuracy": 0.6825416386127472, "num_tokens": 12664234.0, "step": 15746 }, { "epoch": 4.170550847457627, "grad_norm": 1.9617165327072144, "learning_rate": 7.914856991525425e-06, "loss": 1.5133, "mean_token_accuracy": 0.6721478030085564, "num_tokens": 12665752.0, "step": 15748 }, { "epoch": 4.171080508474576, "grad_norm": 2.032318115234375, "learning_rate": 7.91459216101695e-06, "loss": 1.0713, "step": 15750 }, { "epoch": 4.171080508474576, "eval_loss": 1.3133442401885986, "eval_mean_token_accuracy": 0.7014757036775737, "eval_num_tokens": 12667089.0, "eval_runtime": 48.2745, "eval_samples_per_second": 6.38, "eval_steps_per_second": 6.38, "step": 15750 }, { "epoch": 4.171610169491525, "grad_norm": 2.013495445251465, "learning_rate": 7.914327330508475e-06, "loss": 1.5432, "mean_token_accuracy": 0.6982677578926086, "num_tokens": 12668855.0, "step": 15752 }, { "epoch": 4.172139830508475, "grad_norm": 1.9369689226150513, "learning_rate": 7.9140625e-06, "loss": 1.1067, "mean_token_accuracy": 0.7604715004563332, "num_tokens": 12670551.0, "step": 15754 }, { "epoch": 4.172669491525424, "grad_norm": 1.8604837656021118, "learning_rate": 7.913797669491527e-06, "loss": 1.074, "mean_token_accuracy": 0.7300411984324455, "num_tokens": 12672260.0, "step": 15756 }, { "epoch": 4.173199152542373, "grad_norm": 1.766765832901001, "learning_rate": 7.913532838983052e-06, "loss": 1.2137, "mean_token_accuracy": 0.7180886715650558, "num_tokens": 12673911.0, "step": 15758 }, { "epoch": 4.173728813559322, "grad_norm": 2.1718688011169434, "learning_rate": 7.913268008474577e-06, "loss": 1.4452, "mean_token_accuracy": 0.6729704737663269, "num_tokens": 12675354.0, "step": 15760 }, { "epoch": 4.174258474576272, "grad_norm": 2.696220636367798, "learning_rate": 7.913003177966102e-06, "loss": 1.5266, "mean_token_accuracy": 0.6625869125127792, "num_tokens": 12676800.0, "step": 15762 }, { "epoch": 4.174788135593221, "grad_norm": 1.9677703380584717, "learning_rate": 7.912738347457628e-06, "loss": 1.2889, "mean_token_accuracy": 0.6967739388346672, "num_tokens": 12678597.0, "step": 15764 }, { "epoch": 4.1753177966101696, "grad_norm": 1.9968899488449097, "learning_rate": 7.912473516949153e-06, "loss": 1.3729, "mean_token_accuracy": 0.6904266104102135, "num_tokens": 12680296.0, "step": 15766 }, { "epoch": 4.1758474576271185, "grad_norm": 2.0643680095672607, "learning_rate": 7.91220868644068e-06, "loss": 1.2249, "mean_token_accuracy": 0.6902012899518013, "num_tokens": 12681830.0, "step": 15768 }, { "epoch": 4.1763771186440675, "grad_norm": 1.9749610424041748, "learning_rate": 7.911943855932203e-06, "loss": 0.9633, "mean_token_accuracy": 0.7545949444174767, "num_tokens": 12683229.0, "step": 15770 }, { "epoch": 4.176906779661017, "grad_norm": 1.5519397258758545, "learning_rate": 7.91167902542373e-06, "loss": 0.8238, "mean_token_accuracy": 0.7857371270656586, "num_tokens": 12684698.0, "step": 15772 }, { "epoch": 4.177436440677966, "grad_norm": 1.6762802600860596, "learning_rate": 7.911414194915254e-06, "loss": 1.2094, "mean_token_accuracy": 0.7153885066509247, "num_tokens": 12686376.0, "step": 15774 }, { "epoch": 4.177966101694915, "grad_norm": 2.081568956375122, "learning_rate": 7.911149364406781e-06, "loss": 1.1685, "mean_token_accuracy": 0.7298838943243027, "num_tokens": 12687805.0, "step": 15776 }, { "epoch": 4.178495762711864, "grad_norm": 1.7158616781234741, "learning_rate": 7.910884533898306e-06, "loss": 1.1192, "mean_token_accuracy": 0.7404061630368233, "num_tokens": 12689239.0, "step": 15778 }, { "epoch": 4.179025423728813, "grad_norm": 2.3806493282318115, "learning_rate": 7.910619703389831e-06, "loss": 1.4282, "mean_token_accuracy": 0.695218950510025, "num_tokens": 12690563.0, "step": 15780 }, { "epoch": 4.179555084745763, "grad_norm": 2.3490476608276367, "learning_rate": 7.910354872881356e-06, "loss": 1.1049, "mean_token_accuracy": 0.7430460453033447, "num_tokens": 12691960.0, "step": 15782 }, { "epoch": 4.180084745762712, "grad_norm": 2.0971829891204834, "learning_rate": 7.910090042372882e-06, "loss": 1.4539, "mean_token_accuracy": 0.6691173315048218, "num_tokens": 12693630.0, "step": 15784 }, { "epoch": 4.180614406779661, "grad_norm": 1.7530549764633179, "learning_rate": 7.909825211864407e-06, "loss": 1.0939, "mean_token_accuracy": 0.7431881353259087, "num_tokens": 12695147.0, "step": 15786 }, { "epoch": 4.18114406779661, "grad_norm": 1.5670913457870483, "learning_rate": 7.909560381355932e-06, "loss": 1.2585, "mean_token_accuracy": 0.6970000937581062, "num_tokens": 12697170.0, "step": 15788 }, { "epoch": 4.18167372881356, "grad_norm": 2.1032297611236572, "learning_rate": 7.909295550847457e-06, "loss": 1.5647, "mean_token_accuracy": 0.6360280439257622, "num_tokens": 12698633.0, "step": 15790 }, { "epoch": 4.182203389830509, "grad_norm": 1.9430166482925415, "learning_rate": 7.909030720338984e-06, "loss": 1.3911, "mean_token_accuracy": 0.6930420100688934, "num_tokens": 12700225.0, "step": 15792 }, { "epoch": 4.182733050847458, "grad_norm": 1.8159677982330322, "learning_rate": 7.908765889830509e-06, "loss": 0.9791, "mean_token_accuracy": 0.7572203725576401, "num_tokens": 12702334.0, "step": 15794 }, { "epoch": 4.183262711864407, "grad_norm": 1.5805211067199707, "learning_rate": 7.908501059322034e-06, "loss": 1.3872, "mean_token_accuracy": 0.6699384041130543, "num_tokens": 12704307.0, "step": 15796 }, { "epoch": 4.1837923728813555, "grad_norm": 1.7415108680725098, "learning_rate": 7.908236228813559e-06, "loss": 1.4215, "mean_token_accuracy": 0.6778870522975922, "num_tokens": 12706077.0, "step": 15798 }, { "epoch": 4.184322033898305, "grad_norm": 2.6525001525878906, "learning_rate": 7.907971398305085e-06, "loss": 1.8253, "mean_token_accuracy": 0.6103250756859779, "num_tokens": 12707689.0, "step": 15800 }, { "epoch": 4.184851694915254, "grad_norm": 1.9380388259887695, "learning_rate": 7.90770656779661e-06, "loss": 1.093, "mean_token_accuracy": 0.717405840754509, "num_tokens": 12709222.0, "step": 15802 }, { "epoch": 4.185381355932203, "grad_norm": 1.9791662693023682, "learning_rate": 7.907441737288137e-06, "loss": 1.332, "mean_token_accuracy": 0.7246533632278442, "num_tokens": 12710811.0, "step": 15804 }, { "epoch": 4.185911016949152, "grad_norm": 1.9561163187026978, "learning_rate": 7.907176906779662e-06, "loss": 1.1977, "mean_token_accuracy": 0.7045793831348419, "num_tokens": 12712440.0, "step": 15806 }, { "epoch": 4.186440677966102, "grad_norm": 2.013981342315674, "learning_rate": 7.906912076271187e-06, "loss": 1.2446, "mean_token_accuracy": 0.7157641276717186, "num_tokens": 12713857.0, "step": 15808 }, { "epoch": 4.186970338983051, "grad_norm": 2.123959541320801, "learning_rate": 7.906647245762713e-06, "loss": 1.1531, "mean_token_accuracy": 0.731728807091713, "num_tokens": 12715650.0, "step": 15810 }, { "epoch": 4.1875, "grad_norm": 1.9676860570907593, "learning_rate": 7.906382415254238e-06, "loss": 1.2049, "mean_token_accuracy": 0.7297721058130264, "num_tokens": 12717076.0, "step": 15812 }, { "epoch": 4.188029661016949, "grad_norm": 1.947448968887329, "learning_rate": 7.906117584745763e-06, "loss": 1.3023, "mean_token_accuracy": 0.6758056208491325, "num_tokens": 12718763.0, "step": 15814 }, { "epoch": 4.188559322033898, "grad_norm": 1.6994965076446533, "learning_rate": 7.905852754237288e-06, "loss": 1.3978, "mean_token_accuracy": 0.6984900161623955, "num_tokens": 12720752.0, "step": 15816 }, { "epoch": 4.189088983050848, "grad_norm": 2.0770742893218994, "learning_rate": 7.905587923728815e-06, "loss": 1.4194, "mean_token_accuracy": 0.7170004919171333, "num_tokens": 12722245.0, "step": 15818 }, { "epoch": 4.189618644067797, "grad_norm": 1.8925871849060059, "learning_rate": 7.90532309322034e-06, "loss": 1.3514, "mean_token_accuracy": 0.7057758495211601, "num_tokens": 12724000.0, "step": 15820 }, { "epoch": 4.190148305084746, "grad_norm": 1.858486294746399, "learning_rate": 7.905058262711866e-06, "loss": 1.2303, "mean_token_accuracy": 0.7124340012669563, "num_tokens": 12725555.0, "step": 15822 }, { "epoch": 4.190677966101695, "grad_norm": 1.9415459632873535, "learning_rate": 7.90479343220339e-06, "loss": 1.1826, "mean_token_accuracy": 0.7392294220626354, "num_tokens": 12726994.0, "step": 15824 }, { "epoch": 4.1912076271186445, "grad_norm": 1.7491499185562134, "learning_rate": 7.904528601694916e-06, "loss": 0.917, "mean_token_accuracy": 0.7730580866336823, "num_tokens": 12728361.0, "step": 15826 }, { "epoch": 4.191737288135593, "grad_norm": 1.8263421058654785, "learning_rate": 7.904263771186441e-06, "loss": 0.7955, "mean_token_accuracy": 0.7859045192599297, "num_tokens": 12730031.0, "step": 15828 }, { "epoch": 4.192266949152542, "grad_norm": 2.289881706237793, "learning_rate": 7.903998940677968e-06, "loss": 1.6698, "mean_token_accuracy": 0.6196122877299786, "num_tokens": 12731862.0, "step": 15830 }, { "epoch": 4.192796610169491, "grad_norm": 1.939916729927063, "learning_rate": 7.903734110169493e-06, "loss": 1.3405, "mean_token_accuracy": 0.7034766599535942, "num_tokens": 12733374.0, "step": 15832 }, { "epoch": 4.19332627118644, "grad_norm": 2.4097423553466797, "learning_rate": 7.903469279661018e-06, "loss": 1.0094, "mean_token_accuracy": 0.7543248683214188, "num_tokens": 12734530.0, "step": 15834 }, { "epoch": 4.19385593220339, "grad_norm": 2.0833516120910645, "learning_rate": 7.903204449152543e-06, "loss": 1.246, "mean_token_accuracy": 0.7217390611767769, "num_tokens": 12736108.0, "step": 15836 }, { "epoch": 4.194385593220339, "grad_norm": 2.1650426387786865, "learning_rate": 7.902939618644069e-06, "loss": 1.2723, "mean_token_accuracy": 0.7156125828623772, "num_tokens": 12737508.0, "step": 15838 }, { "epoch": 4.194915254237288, "grad_norm": 1.9882380962371826, "learning_rate": 7.902674788135594e-06, "loss": 1.5126, "mean_token_accuracy": 0.6609829515218735, "num_tokens": 12739064.0, "step": 15840 }, { "epoch": 4.195444915254237, "grad_norm": 1.7875453233718872, "learning_rate": 7.902409957627119e-06, "loss": 1.391, "mean_token_accuracy": 0.7034831792116165, "num_tokens": 12741033.0, "step": 15842 }, { "epoch": 4.195974576271187, "grad_norm": 1.869684100151062, "learning_rate": 7.902145127118644e-06, "loss": 1.3242, "mean_token_accuracy": 0.7166933640837669, "num_tokens": 12742605.0, "step": 15844 }, { "epoch": 4.196504237288136, "grad_norm": 1.6263108253479004, "learning_rate": 7.90188029661017e-06, "loss": 1.198, "mean_token_accuracy": 0.7629675045609474, "num_tokens": 12744192.0, "step": 15846 }, { "epoch": 4.197033898305085, "grad_norm": 1.8715859651565552, "learning_rate": 7.901615466101695e-06, "loss": 1.3797, "mean_token_accuracy": 0.6842048466205597, "num_tokens": 12745978.0, "step": 15848 }, { "epoch": 4.197563559322034, "grad_norm": 2.3650200366973877, "learning_rate": 7.90135063559322e-06, "loss": 1.4988, "mean_token_accuracy": 0.6508165672421455, "num_tokens": 12747843.0, "step": 15850 }, { "epoch": 4.198093220338983, "grad_norm": 1.9728440046310425, "learning_rate": 7.901085805084745e-06, "loss": 1.0999, "mean_token_accuracy": 0.7383763641119003, "num_tokens": 12749545.0, "step": 15852 }, { "epoch": 4.1986228813559325, "grad_norm": 2.13893461227417, "learning_rate": 7.900820974576272e-06, "loss": 1.2484, "mean_token_accuracy": 0.7041454389691353, "num_tokens": 12751079.0, "step": 15854 }, { "epoch": 4.1991525423728815, "grad_norm": 1.4436665773391724, "learning_rate": 7.900556144067797e-06, "loss": 1.1869, "mean_token_accuracy": 0.7112212926149368, "num_tokens": 12753088.0, "step": 15856 }, { "epoch": 4.1996822033898304, "grad_norm": 1.728598952293396, "learning_rate": 7.900291313559324e-06, "loss": 1.3238, "mean_token_accuracy": 0.6763909794390202, "num_tokens": 12754955.0, "step": 15858 }, { "epoch": 4.200211864406779, "grad_norm": 2.332832098007202, "learning_rate": 7.900026483050848e-06, "loss": 1.0453, "mean_token_accuracy": 0.7457485273480415, "num_tokens": 12756428.0, "step": 15860 }, { "epoch": 4.200741525423728, "grad_norm": 1.7550811767578125, "learning_rate": 7.899761652542373e-06, "loss": 0.9858, "mean_token_accuracy": 0.7517682388424873, "num_tokens": 12758042.0, "step": 15862 }, { "epoch": 4.201271186440678, "grad_norm": 1.9693397283554077, "learning_rate": 7.899496822033898e-06, "loss": 0.9464, "mean_token_accuracy": 0.7679837346076965, "num_tokens": 12759410.0, "step": 15864 }, { "epoch": 4.201800847457627, "grad_norm": 1.9109688997268677, "learning_rate": 7.899231991525425e-06, "loss": 1.1777, "mean_token_accuracy": 0.7362226620316505, "num_tokens": 12760917.0, "step": 15866 }, { "epoch": 4.202330508474576, "grad_norm": 1.9223984479904175, "learning_rate": 7.89896716101695e-06, "loss": 1.3313, "mean_token_accuracy": 0.6952648237347603, "num_tokens": 12762451.0, "step": 15868 }, { "epoch": 4.202860169491525, "grad_norm": 1.9708030223846436, "learning_rate": 7.898702330508475e-06, "loss": 1.4693, "mean_token_accuracy": 0.669057622551918, "num_tokens": 12764000.0, "step": 15870 }, { "epoch": 4.203389830508475, "grad_norm": 2.175391435623169, "learning_rate": 7.8984375e-06, "loss": 1.2053, "mean_token_accuracy": 0.7078441679477692, "num_tokens": 12765663.0, "step": 15872 }, { "epoch": 4.203919491525424, "grad_norm": 1.9245058298110962, "learning_rate": 7.898172669491526e-06, "loss": 1.5335, "mean_token_accuracy": 0.6501005291938782, "num_tokens": 12767252.0, "step": 15874 }, { "epoch": 4.204449152542373, "grad_norm": 2.3312265872955322, "learning_rate": 7.897907838983051e-06, "loss": 1.4494, "mean_token_accuracy": 0.676665648818016, "num_tokens": 12768802.0, "step": 15876 }, { "epoch": 4.204978813559322, "grad_norm": 1.8249266147613525, "learning_rate": 7.897643008474576e-06, "loss": 1.0623, "mean_token_accuracy": 0.7313614711165428, "num_tokens": 12770547.0, "step": 15878 }, { "epoch": 4.205508474576272, "grad_norm": 1.9373670816421509, "learning_rate": 7.897378177966101e-06, "loss": 1.2758, "mean_token_accuracy": 0.7136068493127823, "num_tokens": 12772385.0, "step": 15880 }, { "epoch": 4.206038135593221, "grad_norm": 2.0063860416412354, "learning_rate": 7.897113347457628e-06, "loss": 0.9855, "mean_token_accuracy": 0.7733101546764374, "num_tokens": 12773808.0, "step": 15882 }, { "epoch": 4.2065677966101696, "grad_norm": 1.2532739639282227, "learning_rate": 7.896848516949153e-06, "loss": 1.1443, "mean_token_accuracy": 0.7175885587930679, "num_tokens": 12776345.0, "step": 15884 }, { "epoch": 4.2070974576271185, "grad_norm": 2.711446762084961, "learning_rate": 7.89658368644068e-06, "loss": 1.4223, "mean_token_accuracy": 0.6887689679861069, "num_tokens": 12777782.0, "step": 15886 }, { "epoch": 4.2076271186440675, "grad_norm": 2.1338260173797607, "learning_rate": 7.896318855932204e-06, "loss": 1.3693, "mean_token_accuracy": 0.6954417005181313, "num_tokens": 12779477.0, "step": 15888 }, { "epoch": 4.208156779661017, "grad_norm": 1.8362981081008911, "learning_rate": 7.89605402542373e-06, "loss": 1.1136, "mean_token_accuracy": 0.7222777977585793, "num_tokens": 12781130.0, "step": 15890 }, { "epoch": 4.208686440677966, "grad_norm": 1.923830270767212, "learning_rate": 7.895789194915256e-06, "loss": 1.2803, "mean_token_accuracy": 0.7001627162098885, "num_tokens": 12782884.0, "step": 15892 }, { "epoch": 4.209216101694915, "grad_norm": 2.0835349559783936, "learning_rate": 7.89552436440678e-06, "loss": 1.0756, "mean_token_accuracy": 0.7463503256440163, "num_tokens": 12784115.0, "step": 15894 }, { "epoch": 4.209745762711864, "grad_norm": 2.5448451042175293, "learning_rate": 7.895259533898306e-06, "loss": 1.8361, "mean_token_accuracy": 0.592050563544035, "num_tokens": 12785476.0, "step": 15896 }, { "epoch": 4.210275423728813, "grad_norm": 1.78001070022583, "learning_rate": 7.89499470338983e-06, "loss": 1.3042, "mean_token_accuracy": 0.706070102751255, "num_tokens": 12786885.0, "step": 15898 }, { "epoch": 4.210805084745763, "grad_norm": 1.9633049964904785, "learning_rate": 7.894729872881357e-06, "loss": 1.5221, "mean_token_accuracy": 0.6795983836054802, "num_tokens": 12788396.0, "step": 15900 }, { "epoch": 4.211334745762712, "grad_norm": 1.850299596786499, "learning_rate": 7.894465042372882e-06, "loss": 1.1771, "mean_token_accuracy": 0.715086817741394, "num_tokens": 12789994.0, "step": 15902 }, { "epoch": 4.211864406779661, "grad_norm": 1.980578899383545, "learning_rate": 7.894200211864407e-06, "loss": 1.5865, "mean_token_accuracy": 0.6634872555732727, "num_tokens": 12791895.0, "step": 15904 }, { "epoch": 4.21239406779661, "grad_norm": 2.015213966369629, "learning_rate": 7.893935381355932e-06, "loss": 1.3501, "mean_token_accuracy": 0.687492661178112, "num_tokens": 12793479.0, "step": 15906 }, { "epoch": 4.21292372881356, "grad_norm": 2.212402820587158, "learning_rate": 7.893670550847459e-06, "loss": 1.2627, "mean_token_accuracy": 0.7064861953258514, "num_tokens": 12794902.0, "step": 15908 }, { "epoch": 4.213453389830509, "grad_norm": 1.9240151643753052, "learning_rate": 7.893405720338984e-06, "loss": 1.2778, "mean_token_accuracy": 0.6854768320918083, "num_tokens": 12796777.0, "step": 15910 }, { "epoch": 4.213983050847458, "grad_norm": 2.148869037628174, "learning_rate": 7.89314088983051e-06, "loss": 1.0386, "mean_token_accuracy": 0.7604277804493904, "num_tokens": 12798503.0, "step": 15912 }, { "epoch": 4.214512711864407, "grad_norm": 2.0402121543884277, "learning_rate": 7.892876059322035e-06, "loss": 1.5992, "mean_token_accuracy": 0.6421896517276764, "num_tokens": 12800225.0, "step": 15914 }, { "epoch": 4.2150423728813555, "grad_norm": 2.1267244815826416, "learning_rate": 7.89261122881356e-06, "loss": 1.2986, "mean_token_accuracy": 0.680313128978014, "num_tokens": 12801904.0, "step": 15916 }, { "epoch": 4.215572033898305, "grad_norm": 1.9522897005081177, "learning_rate": 7.892346398305085e-06, "loss": 1.2559, "mean_token_accuracy": 0.7312665656208992, "num_tokens": 12803337.0, "step": 15918 }, { "epoch": 4.216101694915254, "grad_norm": 1.6928966045379639, "learning_rate": 7.892081567796612e-06, "loss": 1.0175, "mean_token_accuracy": 0.7613461762666702, "num_tokens": 12804765.0, "step": 15920 }, { "epoch": 4.216631355932203, "grad_norm": 2.1505796909332275, "learning_rate": 7.891816737288136e-06, "loss": 1.324, "mean_token_accuracy": 0.6917656436562538, "num_tokens": 12806507.0, "step": 15922 }, { "epoch": 4.217161016949152, "grad_norm": 1.5964746475219727, "learning_rate": 7.891551906779661e-06, "loss": 1.1147, "mean_token_accuracy": 0.7415739074349403, "num_tokens": 12808191.0, "step": 15924 }, { "epoch": 4.217690677966102, "grad_norm": 2.4303197860717773, "learning_rate": 7.891287076271186e-06, "loss": 1.3672, "mean_token_accuracy": 0.6863500401377678, "num_tokens": 12809515.0, "step": 15926 }, { "epoch": 4.218220338983051, "grad_norm": 1.8158526420593262, "learning_rate": 7.891022245762713e-06, "loss": 0.9558, "mean_token_accuracy": 0.7573636546730995, "num_tokens": 12810802.0, "step": 15928 }, { "epoch": 4.21875, "grad_norm": 2.060079574584961, "learning_rate": 7.890757415254238e-06, "loss": 1.174, "mean_token_accuracy": 0.6952412948012352, "num_tokens": 12812245.0, "step": 15930 }, { "epoch": 4.219279661016949, "grad_norm": 2.374429941177368, "learning_rate": 7.890492584745763e-06, "loss": 1.6371, "mean_token_accuracy": 0.6326917409896851, "num_tokens": 12813744.0, "step": 15932 }, { "epoch": 4.219809322033898, "grad_norm": 1.3343700170516968, "learning_rate": 7.890227754237288e-06, "loss": 0.8194, "mean_token_accuracy": 0.7952627241611481, "num_tokens": 12815439.0, "step": 15934 }, { "epoch": 4.220338983050848, "grad_norm": 1.9765137434005737, "learning_rate": 7.889962923728814e-06, "loss": 1.3401, "mean_token_accuracy": 0.6850200481712818, "num_tokens": 12816872.0, "step": 15936 }, { "epoch": 4.220868644067797, "grad_norm": 2.243577718734741, "learning_rate": 7.88969809322034e-06, "loss": 1.0327, "mean_token_accuracy": 0.751041367650032, "num_tokens": 12818236.0, "step": 15938 }, { "epoch": 4.221398305084746, "grad_norm": 1.6780861616134644, "learning_rate": 7.889433262711866e-06, "loss": 0.8628, "mean_token_accuracy": 0.7729564681649208, "num_tokens": 12820224.0, "step": 15940 }, { "epoch": 4.221927966101695, "grad_norm": 1.7999476194381714, "learning_rate": 7.889168432203391e-06, "loss": 1.3083, "mean_token_accuracy": 0.6949641481041908, "num_tokens": 12821936.0, "step": 15942 }, { "epoch": 4.2224576271186445, "grad_norm": 2.020620822906494, "learning_rate": 7.888903601694916e-06, "loss": 1.2602, "mean_token_accuracy": 0.7134093195199966, "num_tokens": 12823558.0, "step": 15944 }, { "epoch": 4.222987288135593, "grad_norm": 1.8755834102630615, "learning_rate": 7.88863877118644e-06, "loss": 1.2087, "mean_token_accuracy": 0.7486146539449692, "num_tokens": 12825011.0, "step": 15946 }, { "epoch": 4.223516949152542, "grad_norm": 1.9234423637390137, "learning_rate": 7.888373940677967e-06, "loss": 1.2654, "mean_token_accuracy": 0.7490211501717567, "num_tokens": 12826619.0, "step": 15948 }, { "epoch": 4.224046610169491, "grad_norm": 1.4440333843231201, "learning_rate": 7.888109110169492e-06, "loss": 1.1245, "mean_token_accuracy": 0.7066730484366417, "num_tokens": 12828498.0, "step": 15950 }, { "epoch": 4.22457627118644, "grad_norm": 2.121636390686035, "learning_rate": 7.887844279661017e-06, "loss": 1.6117, "mean_token_accuracy": 0.6398096904158592, "num_tokens": 12830297.0, "step": 15952 }, { "epoch": 4.22510593220339, "grad_norm": 2.092080593109131, "learning_rate": 7.887579449152542e-06, "loss": 1.5238, "mean_token_accuracy": 0.6552659049630165, "num_tokens": 12832316.0, "step": 15954 }, { "epoch": 4.225635593220339, "grad_norm": 2.144660711288452, "learning_rate": 7.887314618644069e-06, "loss": 1.4894, "mean_token_accuracy": 0.6916262581944466, "num_tokens": 12834238.0, "step": 15956 }, { "epoch": 4.226165254237288, "grad_norm": 2.2239696979522705, "learning_rate": 7.887049788135594e-06, "loss": 1.06, "mean_token_accuracy": 0.7389152124524117, "num_tokens": 12835685.0, "step": 15958 }, { "epoch": 4.226694915254237, "grad_norm": 1.8840718269348145, "learning_rate": 7.886784957627119e-06, "loss": 1.5176, "mean_token_accuracy": 0.6557947024703026, "num_tokens": 12837827.0, "step": 15960 }, { "epoch": 4.227224576271187, "grad_norm": 2.3015666007995605, "learning_rate": 7.886520127118644e-06, "loss": 1.3227, "mean_token_accuracy": 0.7131317183375359, "num_tokens": 12839206.0, "step": 15962 }, { "epoch": 4.227754237288136, "grad_norm": 2.0872671604156494, "learning_rate": 7.88625529661017e-06, "loss": 1.6329, "mean_token_accuracy": 0.6460317969322205, "num_tokens": 12840973.0, "step": 15964 }, { "epoch": 4.228283898305085, "grad_norm": 1.6746243238449097, "learning_rate": 7.885990466101695e-06, "loss": 1.3684, "mean_token_accuracy": 0.7076233327388763, "num_tokens": 12842536.0, "step": 15966 }, { "epoch": 4.228813559322034, "grad_norm": 2.214324474334717, "learning_rate": 7.885725635593222e-06, "loss": 1.4823, "mean_token_accuracy": 0.6962351053953171, "num_tokens": 12844072.0, "step": 15968 }, { "epoch": 4.229343220338983, "grad_norm": 1.8299105167388916, "learning_rate": 7.885460805084747e-06, "loss": 1.0938, "mean_token_accuracy": 0.7241701185703278, "num_tokens": 12845543.0, "step": 15970 }, { "epoch": 4.2298728813559325, "grad_norm": 2.0884313583374023, "learning_rate": 7.885195974576272e-06, "loss": 1.5021, "mean_token_accuracy": 0.6345453783869743, "num_tokens": 12847395.0, "step": 15972 }, { "epoch": 4.2304025423728815, "grad_norm": 1.994462251663208, "learning_rate": 7.884931144067798e-06, "loss": 0.8944, "mean_token_accuracy": 0.7973780259490013, "num_tokens": 12848624.0, "step": 15974 }, { "epoch": 4.2309322033898304, "grad_norm": 1.939037561416626, "learning_rate": 7.884666313559323e-06, "loss": 1.6665, "mean_token_accuracy": 0.6351938545703888, "num_tokens": 12850435.0, "step": 15976 }, { "epoch": 4.231461864406779, "grad_norm": 1.7449188232421875, "learning_rate": 7.884401483050848e-06, "loss": 1.2084, "mean_token_accuracy": 0.7298070937395096, "num_tokens": 12851994.0, "step": 15978 }, { "epoch": 4.231991525423728, "grad_norm": 2.298093795776367, "learning_rate": 7.884136652542373e-06, "loss": 1.6796, "mean_token_accuracy": 0.6315041407942772, "num_tokens": 12853488.0, "step": 15980 }, { "epoch": 4.232521186440678, "grad_norm": 1.874267339706421, "learning_rate": 7.8838718220339e-06, "loss": 1.2156, "mean_token_accuracy": 0.738196350634098, "num_tokens": 12855104.0, "step": 15982 }, { "epoch": 4.233050847457627, "grad_norm": 1.9777941703796387, "learning_rate": 7.883606991525425e-06, "loss": 1.2277, "mean_token_accuracy": 0.7021525949239731, "num_tokens": 12856838.0, "step": 15984 }, { "epoch": 4.233580508474576, "grad_norm": 2.1365442276000977, "learning_rate": 7.88334216101695e-06, "loss": 1.3767, "mean_token_accuracy": 0.6887936443090439, "num_tokens": 12858612.0, "step": 15986 }, { "epoch": 4.234110169491525, "grad_norm": 2.2935755252838135, "learning_rate": 7.883077330508474e-06, "loss": 1.5207, "mean_token_accuracy": 0.6610271632671356, "num_tokens": 12860268.0, "step": 15988 }, { "epoch": 4.234639830508475, "grad_norm": 2.0780699253082275, "learning_rate": 7.882812500000001e-06, "loss": 0.8482, "mean_token_accuracy": 0.7834471166133881, "num_tokens": 12861884.0, "step": 15990 }, { "epoch": 4.235169491525424, "grad_norm": 1.7524147033691406, "learning_rate": 7.882547669491526e-06, "loss": 1.4195, "mean_token_accuracy": 0.6969684809446335, "num_tokens": 12863668.0, "step": 15992 }, { "epoch": 4.235699152542373, "grad_norm": 1.8814561367034912, "learning_rate": 7.882282838983053e-06, "loss": 1.1756, "mean_token_accuracy": 0.7246514707803726, "num_tokens": 12865525.0, "step": 15994 }, { "epoch": 4.236228813559322, "grad_norm": 2.099851131439209, "learning_rate": 7.882018008474578e-06, "loss": 1.6482, "mean_token_accuracy": 0.646369218826294, "num_tokens": 12867271.0, "step": 15996 }, { "epoch": 4.236758474576272, "grad_norm": 1.7939720153808594, "learning_rate": 7.881753177966102e-06, "loss": 0.9649, "mean_token_accuracy": 0.7712480053305626, "num_tokens": 12868798.0, "step": 15998 }, { "epoch": 4.237288135593221, "grad_norm": 1.8814641237258911, "learning_rate": 7.881488347457627e-06, "loss": 0.9744, "step": 16000 }, { "epoch": 4.237288135593221, "eval_loss": 1.3121814727783203, "eval_mean_token_accuracy": 0.7013332546724902, "eval_num_tokens": 12870455.0, "eval_runtime": 48.08, "eval_samples_per_second": 6.406, "eval_steps_per_second": 6.406, "step": 16000 }, { "epoch": 4.2378177966101696, "grad_norm": 1.759303331375122, "learning_rate": 7.881223516949154e-06, "loss": 1.1656, "mean_token_accuracy": 0.737153597176075, "num_tokens": 12872195.0, "step": 16002 }, { "epoch": 4.2383474576271185, "grad_norm": 1.973083734512329, "learning_rate": 7.880958686440679e-06, "loss": 1.1595, "mean_token_accuracy": 0.7281640022993088, "num_tokens": 12873952.0, "step": 16004 }, { "epoch": 4.2388771186440675, "grad_norm": 1.8836021423339844, "learning_rate": 7.880693855932204e-06, "loss": 1.1708, "mean_token_accuracy": 0.7210140451788902, "num_tokens": 12875537.0, "step": 16006 }, { "epoch": 4.239406779661017, "grad_norm": 1.6883634328842163, "learning_rate": 7.880429025423729e-06, "loss": 1.0507, "mean_token_accuracy": 0.735833078622818, "num_tokens": 12877112.0, "step": 16008 }, { "epoch": 4.239936440677966, "grad_norm": 1.8988511562347412, "learning_rate": 7.880164194915255e-06, "loss": 1.4225, "mean_token_accuracy": 0.6703986302018166, "num_tokens": 12878774.0, "step": 16010 }, { "epoch": 4.240466101694915, "grad_norm": 2.3264636993408203, "learning_rate": 7.87989936440678e-06, "loss": 1.2339, "mean_token_accuracy": 0.7200180441141129, "num_tokens": 12880310.0, "step": 16012 }, { "epoch": 4.240995762711864, "grad_norm": 1.9116348028182983, "learning_rate": 7.879634533898305e-06, "loss": 1.4444, "mean_token_accuracy": 0.6821737661957741, "num_tokens": 12882101.0, "step": 16014 }, { "epoch": 4.241525423728813, "grad_norm": 2.220095634460449, "learning_rate": 7.87936970338983e-06, "loss": 1.6354, "mean_token_accuracy": 0.6562810465693474, "num_tokens": 12883493.0, "step": 16016 }, { "epoch": 4.242055084745763, "grad_norm": 2.145479440689087, "learning_rate": 7.879104872881357e-06, "loss": 1.2071, "mean_token_accuracy": 0.7289841547608376, "num_tokens": 12884981.0, "step": 16018 }, { "epoch": 4.242584745762712, "grad_norm": 2.2890634536743164, "learning_rate": 7.878840042372882e-06, "loss": 1.0626, "mean_token_accuracy": 0.7511430457234383, "num_tokens": 12886561.0, "step": 16020 }, { "epoch": 4.243114406779661, "grad_norm": 2.0598320960998535, "learning_rate": 7.878575211864408e-06, "loss": 1.6172, "mean_token_accuracy": 0.6239335834980011, "num_tokens": 12888770.0, "step": 16022 }, { "epoch": 4.24364406779661, "grad_norm": 2.0589818954467773, "learning_rate": 7.878310381355933e-06, "loss": 1.0347, "mean_token_accuracy": 0.763253852725029, "num_tokens": 12890137.0, "step": 16024 }, { "epoch": 4.24417372881356, "grad_norm": 2.6178932189941406, "learning_rate": 7.878045550847458e-06, "loss": 1.4691, "mean_token_accuracy": 0.654863141477108, "num_tokens": 12891584.0, "step": 16026 }, { "epoch": 4.244703389830509, "grad_norm": 2.455751895904541, "learning_rate": 7.877780720338983e-06, "loss": 1.5103, "mean_token_accuracy": 0.6583674624562263, "num_tokens": 12893034.0, "step": 16028 }, { "epoch": 4.245233050847458, "grad_norm": 1.7805238962173462, "learning_rate": 7.87751588983051e-06, "loss": 1.1439, "mean_token_accuracy": 0.7308686710894108, "num_tokens": 12894965.0, "step": 16030 }, { "epoch": 4.245762711864407, "grad_norm": 1.9084851741790771, "learning_rate": 7.877251059322035e-06, "loss": 1.0992, "mean_token_accuracy": 0.728093720972538, "num_tokens": 12896650.0, "step": 16032 }, { "epoch": 4.2462923728813555, "grad_norm": 2.3031978607177734, "learning_rate": 7.87698622881356e-06, "loss": 1.5316, "mean_token_accuracy": 0.6965542137622833, "num_tokens": 12898063.0, "step": 16034 }, { "epoch": 4.246822033898305, "grad_norm": 1.7315430641174316, "learning_rate": 7.876721398305085e-06, "loss": 0.9025, "mean_token_accuracy": 0.7796876206994057, "num_tokens": 12899498.0, "step": 16036 }, { "epoch": 4.247351694915254, "grad_norm": 1.88029146194458, "learning_rate": 7.876456567796611e-06, "loss": 1.2625, "mean_token_accuracy": 0.6940545737743378, "num_tokens": 12901083.0, "step": 16038 }, { "epoch": 4.247881355932203, "grad_norm": 2.2532336711883545, "learning_rate": 7.876191737288136e-06, "loss": 1.4652, "mean_token_accuracy": 0.6891271620988846, "num_tokens": 12902609.0, "step": 16040 }, { "epoch": 4.248411016949152, "grad_norm": 2.2606451511383057, "learning_rate": 7.875926906779661e-06, "loss": 1.3905, "mean_token_accuracy": 0.713815726339817, "num_tokens": 12903933.0, "step": 16042 }, { "epoch": 4.248940677966102, "grad_norm": 1.8509329557418823, "learning_rate": 7.875662076271186e-06, "loss": 1.1036, "mean_token_accuracy": 0.7431391701102257, "num_tokens": 12905359.0, "step": 16044 }, { "epoch": 4.249470338983051, "grad_norm": 1.995276927947998, "learning_rate": 7.875397245762713e-06, "loss": 1.3889, "mean_token_accuracy": 0.6795797199010849, "num_tokens": 12906706.0, "step": 16046 }, { "epoch": 4.25, "grad_norm": 1.8154809474945068, "learning_rate": 7.875132415254238e-06, "loss": 1.5091, "mean_token_accuracy": 0.6592293009161949, "num_tokens": 12908663.0, "step": 16048 }, { "epoch": 4.250529661016949, "grad_norm": 2.0968987941741943, "learning_rate": 7.874867584745764e-06, "loss": 1.1129, "mean_token_accuracy": 0.7412296086549759, "num_tokens": 12910106.0, "step": 16050 }, { "epoch": 4.251059322033898, "grad_norm": 2.1097114086151123, "learning_rate": 7.874602754237289e-06, "loss": 1.4914, "mean_token_accuracy": 0.6811023503541946, "num_tokens": 12911840.0, "step": 16052 }, { "epoch": 4.251588983050848, "grad_norm": 2.367335557937622, "learning_rate": 7.874337923728814e-06, "loss": 1.5199, "mean_token_accuracy": 0.6526972278952599, "num_tokens": 12913578.0, "step": 16054 }, { "epoch": 4.252118644067797, "grad_norm": 1.707458257675171, "learning_rate": 7.874073093220339e-06, "loss": 0.9259, "mean_token_accuracy": 0.7583356127142906, "num_tokens": 12915305.0, "step": 16056 }, { "epoch": 4.252648305084746, "grad_norm": 1.9100509881973267, "learning_rate": 7.873808262711866e-06, "loss": 1.1654, "mean_token_accuracy": 0.7261163592338562, "num_tokens": 12917142.0, "step": 16058 }, { "epoch": 4.253177966101695, "grad_norm": 2.0298538208007812, "learning_rate": 7.87354343220339e-06, "loss": 1.3721, "mean_token_accuracy": 0.7107262760400772, "num_tokens": 12918890.0, "step": 16060 }, { "epoch": 4.2537076271186445, "grad_norm": 2.0953855514526367, "learning_rate": 7.873278601694915e-06, "loss": 1.508, "mean_token_accuracy": 0.6712092906236649, "num_tokens": 12920562.0, "step": 16062 }, { "epoch": 4.254237288135593, "grad_norm": 1.9967505931854248, "learning_rate": 7.873013771186442e-06, "loss": 1.0269, "mean_token_accuracy": 0.772394485771656, "num_tokens": 12921955.0, "step": 16064 }, { "epoch": 4.254766949152542, "grad_norm": 2.088850975036621, "learning_rate": 7.872748940677967e-06, "loss": 1.1512, "mean_token_accuracy": 0.7232557684183121, "num_tokens": 12923545.0, "step": 16066 }, { "epoch": 4.255296610169491, "grad_norm": 2.290357828140259, "learning_rate": 7.872484110169492e-06, "loss": 1.2771, "mean_token_accuracy": 0.7214871123433113, "num_tokens": 12925136.0, "step": 16068 }, { "epoch": 4.25582627118644, "grad_norm": 2.049053430557251, "learning_rate": 7.872219279661017e-06, "loss": 1.6346, "mean_token_accuracy": 0.6341901645064354, "num_tokens": 12926715.0, "step": 16070 }, { "epoch": 4.25635593220339, "grad_norm": 2.0730741024017334, "learning_rate": 7.871954449152543e-06, "loss": 1.4636, "mean_token_accuracy": 0.6503847688436508, "num_tokens": 12928536.0, "step": 16072 }, { "epoch": 4.256885593220339, "grad_norm": 2.2064096927642822, "learning_rate": 7.871689618644068e-06, "loss": 1.1608, "mean_token_accuracy": 0.7278659343719482, "num_tokens": 12929846.0, "step": 16074 }, { "epoch": 4.257415254237288, "grad_norm": 1.6108452081680298, "learning_rate": 7.871424788135595e-06, "loss": 0.7391, "mean_token_accuracy": 0.8057931363582611, "num_tokens": 12931597.0, "step": 16076 }, { "epoch": 4.257944915254237, "grad_norm": 1.8829902410507202, "learning_rate": 7.87115995762712e-06, "loss": 1.0262, "mean_token_accuracy": 0.746000126004219, "num_tokens": 12933216.0, "step": 16078 }, { "epoch": 4.258474576271187, "grad_norm": 1.6858307123184204, "learning_rate": 7.870895127118645e-06, "loss": 1.1265, "mean_token_accuracy": 0.7402013689279556, "num_tokens": 12934788.0, "step": 16080 }, { "epoch": 4.259004237288136, "grad_norm": 2.571211099624634, "learning_rate": 7.87063029661017e-06, "loss": 0.8618, "mean_token_accuracy": 0.7703352198004723, "num_tokens": 12936249.0, "step": 16082 }, { "epoch": 4.259533898305085, "grad_norm": 2.2230632305145264, "learning_rate": 7.870365466101696e-06, "loss": 1.4873, "mean_token_accuracy": 0.6805540025234222, "num_tokens": 12938050.0, "step": 16084 }, { "epoch": 4.260063559322034, "grad_norm": 2.0066416263580322, "learning_rate": 7.870100635593221e-06, "loss": 1.0642, "mean_token_accuracy": 0.7275354564189911, "num_tokens": 12939610.0, "step": 16086 }, { "epoch": 4.260593220338983, "grad_norm": 1.9365431070327759, "learning_rate": 7.869835805084746e-06, "loss": 1.0659, "mean_token_accuracy": 0.7304356098175049, "num_tokens": 12941299.0, "step": 16088 }, { "epoch": 4.2611228813559325, "grad_norm": 2.5149102210998535, "learning_rate": 7.869570974576271e-06, "loss": 1.3487, "mean_token_accuracy": 0.6999187022447586, "num_tokens": 12942806.0, "step": 16090 }, { "epoch": 4.2616525423728815, "grad_norm": 1.9067115783691406, "learning_rate": 7.869306144067798e-06, "loss": 0.8912, "mean_token_accuracy": 0.7824134528636932, "num_tokens": 12944565.0, "step": 16092 }, { "epoch": 4.2621822033898304, "grad_norm": 2.1392862796783447, "learning_rate": 7.869041313559323e-06, "loss": 1.6423, "mean_token_accuracy": 0.6407008469104767, "num_tokens": 12946100.0, "step": 16094 }, { "epoch": 4.262711864406779, "grad_norm": 2.20932674407959, "learning_rate": 7.868776483050848e-06, "loss": 1.3831, "mean_token_accuracy": 0.7072205990552902, "num_tokens": 12947806.0, "step": 16096 }, { "epoch": 4.263241525423728, "grad_norm": 2.1031219959259033, "learning_rate": 7.868511652542373e-06, "loss": 1.1786, "mean_token_accuracy": 0.747321680188179, "num_tokens": 12949970.0, "step": 16098 }, { "epoch": 4.263771186440678, "grad_norm": 1.868066668510437, "learning_rate": 7.8682468220339e-06, "loss": 1.367, "mean_token_accuracy": 0.671907551586628, "num_tokens": 12951697.0, "step": 16100 }, { "epoch": 4.264300847457627, "grad_norm": 1.4661177396774292, "learning_rate": 7.867981991525424e-06, "loss": 0.9236, "mean_token_accuracy": 0.788004994392395, "num_tokens": 12953455.0, "step": 16102 }, { "epoch": 4.264830508474576, "grad_norm": 1.9916319847106934, "learning_rate": 7.86771716101695e-06, "loss": 1.0398, "mean_token_accuracy": 0.7441642731428146, "num_tokens": 12954762.0, "step": 16104 }, { "epoch": 4.265360169491525, "grad_norm": 1.9661685228347778, "learning_rate": 7.867452330508476e-06, "loss": 1.4409, "mean_token_accuracy": 0.6720395497977734, "num_tokens": 12956289.0, "step": 16106 }, { "epoch": 4.265889830508475, "grad_norm": 1.9482781887054443, "learning_rate": 7.8671875e-06, "loss": 1.0134, "mean_token_accuracy": 0.7521505355834961, "num_tokens": 12957698.0, "step": 16108 }, { "epoch": 4.266419491525424, "grad_norm": 1.7457811832427979, "learning_rate": 7.866922669491526e-06, "loss": 1.0991, "mean_token_accuracy": 0.7246808111667633, "num_tokens": 12959387.0, "step": 16110 }, { "epoch": 4.266949152542373, "grad_norm": 2.475029706954956, "learning_rate": 7.866657838983052e-06, "loss": 1.4011, "mean_token_accuracy": 0.6583707258105278, "num_tokens": 12960947.0, "step": 16112 }, { "epoch": 4.267478813559322, "grad_norm": 1.8453898429870605, "learning_rate": 7.866393008474577e-06, "loss": 1.0774, "mean_token_accuracy": 0.7210601046681404, "num_tokens": 12962537.0, "step": 16114 }, { "epoch": 4.268008474576272, "grad_norm": 2.745924711227417, "learning_rate": 7.866128177966102e-06, "loss": 1.3835, "mean_token_accuracy": 0.6885218322277069, "num_tokens": 12963741.0, "step": 16116 }, { "epoch": 4.268538135593221, "grad_norm": 1.838240385055542, "learning_rate": 7.865863347457627e-06, "loss": 1.2669, "mean_token_accuracy": 0.7090218737721443, "num_tokens": 12965582.0, "step": 16118 }, { "epoch": 4.2690677966101696, "grad_norm": 1.6028053760528564, "learning_rate": 7.865598516949154e-06, "loss": 0.9093, "mean_token_accuracy": 0.783595897257328, "num_tokens": 12967141.0, "step": 16120 }, { "epoch": 4.2695974576271185, "grad_norm": 1.617193579673767, "learning_rate": 7.865333686440679e-06, "loss": 1.1157, "mean_token_accuracy": 0.7353665307164192, "num_tokens": 12968684.0, "step": 16122 }, { "epoch": 4.2701271186440675, "grad_norm": 2.2448508739471436, "learning_rate": 7.865068855932203e-06, "loss": 1.2887, "mean_token_accuracy": 0.7003418356180191, "num_tokens": 12970357.0, "step": 16124 }, { "epoch": 4.270656779661017, "grad_norm": 1.9711452722549438, "learning_rate": 7.864804025423728e-06, "loss": 1.14, "mean_token_accuracy": 0.7191530391573906, "num_tokens": 12972062.0, "step": 16126 }, { "epoch": 4.271186440677966, "grad_norm": 1.3572901487350464, "learning_rate": 7.864539194915255e-06, "loss": 1.0453, "mean_token_accuracy": 0.7510179877281189, "num_tokens": 12974445.0, "step": 16128 }, { "epoch": 4.271716101694915, "grad_norm": 2.3437230587005615, "learning_rate": 7.86427436440678e-06, "loss": 1.4282, "mean_token_accuracy": 0.6810598596930504, "num_tokens": 12975968.0, "step": 16130 }, { "epoch": 4.272245762711864, "grad_norm": 2.139350175857544, "learning_rate": 7.864009533898307e-06, "loss": 1.251, "mean_token_accuracy": 0.7098636776208878, "num_tokens": 12977355.0, "step": 16132 }, { "epoch": 4.272775423728813, "grad_norm": 1.8552613258361816, "learning_rate": 7.86374470338983e-06, "loss": 1.0137, "mean_token_accuracy": 0.7380300015211105, "num_tokens": 12978802.0, "step": 16134 }, { "epoch": 4.273305084745763, "grad_norm": 1.8267502784729004, "learning_rate": 7.863479872881356e-06, "loss": 1.3116, "mean_token_accuracy": 0.7029507830739021, "num_tokens": 12980265.0, "step": 16136 }, { "epoch": 4.273834745762712, "grad_norm": 1.5772799253463745, "learning_rate": 7.863215042372881e-06, "loss": 0.9865, "mean_token_accuracy": 0.7514561489224434, "num_tokens": 12982588.0, "step": 16138 }, { "epoch": 4.274364406779661, "grad_norm": 2.1907620429992676, "learning_rate": 7.862950211864408e-06, "loss": 1.5187, "mean_token_accuracy": 0.6682549193501472, "num_tokens": 12984294.0, "step": 16140 }, { "epoch": 4.27489406779661, "grad_norm": 1.9595165252685547, "learning_rate": 7.862685381355933e-06, "loss": 1.3772, "mean_token_accuracy": 0.6998449862003326, "num_tokens": 12985778.0, "step": 16142 }, { "epoch": 4.27542372881356, "grad_norm": 1.8728121519088745, "learning_rate": 7.862420550847458e-06, "loss": 0.7465, "mean_token_accuracy": 0.8018483817577362, "num_tokens": 12987363.0, "step": 16144 }, { "epoch": 4.275953389830509, "grad_norm": 1.817619800567627, "learning_rate": 7.862155720338984e-06, "loss": 0.9369, "mean_token_accuracy": 0.7755110636353493, "num_tokens": 12988778.0, "step": 16146 }, { "epoch": 4.276483050847458, "grad_norm": 2.0584475994110107, "learning_rate": 7.86189088983051e-06, "loss": 1.4434, "mean_token_accuracy": 0.677770771086216, "num_tokens": 12990568.0, "step": 16148 }, { "epoch": 4.277012711864407, "grad_norm": 1.9362900257110596, "learning_rate": 7.861626059322034e-06, "loss": 1.1977, "mean_token_accuracy": 0.7607438340783119, "num_tokens": 12991991.0, "step": 16150 }, { "epoch": 4.2775423728813555, "grad_norm": 2.1321752071380615, "learning_rate": 7.86136122881356e-06, "loss": 1.4345, "mean_token_accuracy": 0.7040546089410782, "num_tokens": 12993405.0, "step": 16152 }, { "epoch": 4.278072033898305, "grad_norm": 2.1082253456115723, "learning_rate": 7.861096398305086e-06, "loss": 1.6159, "mean_token_accuracy": 0.6519342437386513, "num_tokens": 12994987.0, "step": 16154 }, { "epoch": 4.278601694915254, "grad_norm": 2.052447557449341, "learning_rate": 7.86083156779661e-06, "loss": 1.0905, "mean_token_accuracy": 0.7482546120882034, "num_tokens": 12996565.0, "step": 16156 }, { "epoch": 4.279131355932203, "grad_norm": 2.2057242393493652, "learning_rate": 7.860566737288137e-06, "loss": 1.3699, "mean_token_accuracy": 0.6871268153190613, "num_tokens": 12998054.0, "step": 16158 }, { "epoch": 4.279661016949152, "grad_norm": 1.9210456609725952, "learning_rate": 7.860301906779662e-06, "loss": 0.8579, "mean_token_accuracy": 0.7818091362714767, "num_tokens": 12999424.0, "step": 16160 }, { "epoch": 4.280190677966102, "grad_norm": 2.346513032913208, "learning_rate": 7.860037076271187e-06, "loss": 1.2918, "mean_token_accuracy": 0.6867004781961441, "num_tokens": 13001236.0, "step": 16162 }, { "epoch": 4.280720338983051, "grad_norm": 1.8747628927230835, "learning_rate": 7.859772245762712e-06, "loss": 1.3041, "mean_token_accuracy": 0.7007075697183609, "num_tokens": 13003080.0, "step": 16164 }, { "epoch": 4.28125, "grad_norm": 1.6913070678710938, "learning_rate": 7.859507415254239e-06, "loss": 0.9304, "mean_token_accuracy": 0.7755569815635681, "num_tokens": 13004816.0, "step": 16166 }, { "epoch": 4.281779661016949, "grad_norm": 1.8896467685699463, "learning_rate": 7.859242584745764e-06, "loss": 0.9054, "mean_token_accuracy": 0.793727271258831, "num_tokens": 13006231.0, "step": 16168 }, { "epoch": 4.282309322033898, "grad_norm": 2.315114736557007, "learning_rate": 7.858977754237289e-06, "loss": 1.3769, "mean_token_accuracy": 0.7177176550030708, "num_tokens": 13007727.0, "step": 16170 }, { "epoch": 4.282838983050848, "grad_norm": 2.6113195419311523, "learning_rate": 7.858712923728814e-06, "loss": 1.0615, "mean_token_accuracy": 0.732143223285675, "num_tokens": 13009416.0, "step": 16172 }, { "epoch": 4.283368644067797, "grad_norm": 2.264904022216797, "learning_rate": 7.85844809322034e-06, "loss": 0.9636, "mean_token_accuracy": 0.7718624398112297, "num_tokens": 13010858.0, "step": 16174 }, { "epoch": 4.283898305084746, "grad_norm": 1.8601123094558716, "learning_rate": 7.858183262711865e-06, "loss": 1.3604, "mean_token_accuracy": 0.7023510411381721, "num_tokens": 13012391.0, "step": 16176 }, { "epoch": 4.284427966101695, "grad_norm": 2.3348162174224854, "learning_rate": 7.85791843220339e-06, "loss": 1.5504, "mean_token_accuracy": 0.6580817699432373, "num_tokens": 13013912.0, "step": 16178 }, { "epoch": 4.2849576271186445, "grad_norm": 2.391587018966675, "learning_rate": 7.857653601694915e-06, "loss": 1.2416, "mean_token_accuracy": 0.7124384045600891, "num_tokens": 13015367.0, "step": 16180 }, { "epoch": 4.285487288135593, "grad_norm": 1.5319284200668335, "learning_rate": 7.857388771186442e-06, "loss": 1.0387, "mean_token_accuracy": 0.7547210231423378, "num_tokens": 13017089.0, "step": 16182 }, { "epoch": 4.286016949152542, "grad_norm": 2.1619842052459717, "learning_rate": 7.857123940677967e-06, "loss": 0.8985, "mean_token_accuracy": 0.7617778033018112, "num_tokens": 13018815.0, "step": 16184 }, { "epoch": 4.286546610169491, "grad_norm": 2.060807704925537, "learning_rate": 7.856859110169493e-06, "loss": 1.491, "mean_token_accuracy": 0.6767979674041271, "num_tokens": 13020552.0, "step": 16186 }, { "epoch": 4.28707627118644, "grad_norm": 2.160160779953003, "learning_rate": 7.856594279661016e-06, "loss": 1.4375, "mean_token_accuracy": 0.6812907233834267, "num_tokens": 13022363.0, "step": 16188 }, { "epoch": 4.28760593220339, "grad_norm": 1.927418828010559, "learning_rate": 7.856329449152543e-06, "loss": 1.3422, "mean_token_accuracy": 0.6807163879275322, "num_tokens": 13023933.0, "step": 16190 }, { "epoch": 4.288135593220339, "grad_norm": 1.5406417846679688, "learning_rate": 7.856064618644068e-06, "loss": 1.1516, "mean_token_accuracy": 0.7417220324277878, "num_tokens": 13025397.0, "step": 16192 }, { "epoch": 4.288665254237288, "grad_norm": 2.1712899208068848, "learning_rate": 7.855799788135595e-06, "loss": 1.5323, "mean_token_accuracy": 0.6440271586179733, "num_tokens": 13027130.0, "step": 16194 }, { "epoch": 4.289194915254237, "grad_norm": 1.4791913032531738, "learning_rate": 7.85553495762712e-06, "loss": 1.0397, "mean_token_accuracy": 0.749807558953762, "num_tokens": 13028813.0, "step": 16196 }, { "epoch": 4.289724576271187, "grad_norm": 2.5681612491607666, "learning_rate": 7.855270127118644e-06, "loss": 1.0553, "mean_token_accuracy": 0.7727959677577019, "num_tokens": 13030113.0, "step": 16198 }, { "epoch": 4.290254237288136, "grad_norm": 1.816855549812317, "learning_rate": 7.85500529661017e-06, "loss": 0.7395, "mean_token_accuracy": 0.8142194673418999, "num_tokens": 13031547.0, "step": 16200 }, { "epoch": 4.290783898305085, "grad_norm": 2.040006637573242, "learning_rate": 7.854740466101696e-06, "loss": 1.028, "mean_token_accuracy": 0.7701760455965996, "num_tokens": 13033109.0, "step": 16202 }, { "epoch": 4.291313559322034, "grad_norm": 2.7594237327575684, "learning_rate": 7.854475635593221e-06, "loss": 1.6804, "mean_token_accuracy": 0.6466847881674767, "num_tokens": 13034701.0, "step": 16204 }, { "epoch": 4.291843220338983, "grad_norm": 2.106395959854126, "learning_rate": 7.854210805084746e-06, "loss": 1.4955, "mean_token_accuracy": 0.6624566987156868, "num_tokens": 13036172.0, "step": 16206 }, { "epoch": 4.2923728813559325, "grad_norm": 1.8887767791748047, "learning_rate": 7.85394597457627e-06, "loss": 1.2367, "mean_token_accuracy": 0.6796550005674362, "num_tokens": 13037970.0, "step": 16208 }, { "epoch": 4.2929025423728815, "grad_norm": 2.1005587577819824, "learning_rate": 7.853681144067797e-06, "loss": 1.2608, "mean_token_accuracy": 0.7289377823472023, "num_tokens": 13039515.0, "step": 16210 }, { "epoch": 4.2934322033898304, "grad_norm": 2.443307638168335, "learning_rate": 7.853416313559322e-06, "loss": 1.4045, "mean_token_accuracy": 0.6903119385242462, "num_tokens": 13041196.0, "step": 16212 }, { "epoch": 4.293961864406779, "grad_norm": 1.8494486808776855, "learning_rate": 7.853151483050849e-06, "loss": 1.1628, "mean_token_accuracy": 0.7180061638355255, "num_tokens": 13042784.0, "step": 16214 }, { "epoch": 4.294491525423728, "grad_norm": 2.251743793487549, "learning_rate": 7.852886652542372e-06, "loss": 1.6111, "mean_token_accuracy": 0.6535816490650177, "num_tokens": 13044222.0, "step": 16216 }, { "epoch": 4.295021186440678, "grad_norm": 2.0028107166290283, "learning_rate": 7.852621822033899e-06, "loss": 0.989, "mean_token_accuracy": 0.765791192650795, "num_tokens": 13045696.0, "step": 16218 }, { "epoch": 4.295550847457627, "grad_norm": 2.428230047225952, "learning_rate": 7.852356991525424e-06, "loss": 1.3228, "mean_token_accuracy": 0.693290077149868, "num_tokens": 13047440.0, "step": 16220 }, { "epoch": 4.296080508474576, "grad_norm": 1.942293643951416, "learning_rate": 7.85209216101695e-06, "loss": 1.2911, "mean_token_accuracy": 0.6881283335387707, "num_tokens": 13048945.0, "step": 16222 }, { "epoch": 4.296610169491525, "grad_norm": 2.317014217376709, "learning_rate": 7.851827330508475e-06, "loss": 1.0922, "mean_token_accuracy": 0.7510672248899937, "num_tokens": 13050502.0, "step": 16224 }, { "epoch": 4.297139830508475, "grad_norm": 1.9985285997390747, "learning_rate": 7.8515625e-06, "loss": 1.4185, "mean_token_accuracy": 0.6868520826101303, "num_tokens": 13052028.0, "step": 16226 }, { "epoch": 4.297669491525424, "grad_norm": 1.7842832803726196, "learning_rate": 7.851297669491527e-06, "loss": 1.0491, "mean_token_accuracy": 0.7674402296543121, "num_tokens": 13053521.0, "step": 16228 }, { "epoch": 4.298199152542373, "grad_norm": 2.0457603931427, "learning_rate": 7.851032838983052e-06, "loss": 1.5132, "mean_token_accuracy": 0.6754421815276146, "num_tokens": 13055206.0, "step": 16230 }, { "epoch": 4.298728813559322, "grad_norm": 1.9490468502044678, "learning_rate": 7.850768008474577e-06, "loss": 0.9536, "mean_token_accuracy": 0.7585252448916435, "num_tokens": 13056494.0, "step": 16232 }, { "epoch": 4.299258474576272, "grad_norm": 1.9763274192810059, "learning_rate": 7.850503177966102e-06, "loss": 1.2915, "mean_token_accuracy": 0.7153045237064362, "num_tokens": 13058208.0, "step": 16234 }, { "epoch": 4.299788135593221, "grad_norm": 2.289316415786743, "learning_rate": 7.850238347457628e-06, "loss": 1.2024, "mean_token_accuracy": 0.7147620096802711, "num_tokens": 13059513.0, "step": 16236 }, { "epoch": 4.3003177966101696, "grad_norm": 2.2207653522491455, "learning_rate": 7.849973516949153e-06, "loss": 1.5559, "mean_token_accuracy": 0.6488640680909157, "num_tokens": 13061175.0, "step": 16238 }, { "epoch": 4.3008474576271185, "grad_norm": 1.6600162982940674, "learning_rate": 7.84970868644068e-06, "loss": 0.7592, "mean_token_accuracy": 0.7929022461175919, "num_tokens": 13062964.0, "step": 16240 }, { "epoch": 4.3013771186440675, "grad_norm": 1.576263427734375, "learning_rate": 7.849443855932203e-06, "loss": 1.0636, "mean_token_accuracy": 0.7368012741208076, "num_tokens": 13064687.0, "step": 16242 }, { "epoch": 4.301906779661017, "grad_norm": 1.6469660997390747, "learning_rate": 7.84917902542373e-06, "loss": 1.1462, "mean_token_accuracy": 0.747200220823288, "num_tokens": 13066107.0, "step": 16244 }, { "epoch": 4.302436440677966, "grad_norm": 1.9619245529174805, "learning_rate": 7.848914194915255e-06, "loss": 1.2214, "mean_token_accuracy": 0.7048158720135689, "num_tokens": 13067487.0, "step": 16246 }, { "epoch": 4.302966101694915, "grad_norm": 1.4191112518310547, "learning_rate": 7.848649364406781e-06, "loss": 1.107, "mean_token_accuracy": 0.7291534021496773, "num_tokens": 13069450.0, "step": 16248 }, { "epoch": 4.303495762711864, "grad_norm": 2.336331605911255, "learning_rate": 7.848384533898306e-06, "loss": 1.2227, "step": 16250 }, { "epoch": 4.303495762711864, "eval_loss": 1.3139194250106812, "eval_mean_token_accuracy": 0.7015914750563634, "eval_num_tokens": 13070950.0, "eval_runtime": 48.1548, "eval_samples_per_second": 6.396, "eval_steps_per_second": 6.396, "step": 16250 }, { "epoch": 4.304025423728813, "grad_norm": 1.9393434524536133, "learning_rate": 7.848119703389831e-06, "loss": 0.9412, "mean_token_accuracy": 0.74495505169034, "num_tokens": 13072484.0, "step": 16252 }, { "epoch": 4.304555084745763, "grad_norm": 2.1854658126831055, "learning_rate": 7.847854872881356e-06, "loss": 1.124, "mean_token_accuracy": 0.72514808177948, "num_tokens": 13073737.0, "step": 16254 }, { "epoch": 4.305084745762712, "grad_norm": 1.9272805452346802, "learning_rate": 7.847590042372883e-06, "loss": 1.3559, "mean_token_accuracy": 0.696110911667347, "num_tokens": 13075582.0, "step": 16256 }, { "epoch": 4.305614406779661, "grad_norm": 2.3393170833587646, "learning_rate": 7.847325211864408e-06, "loss": 1.5445, "mean_token_accuracy": 0.6702397838234901, "num_tokens": 13077333.0, "step": 16258 }, { "epoch": 4.30614406779661, "grad_norm": 1.8848686218261719, "learning_rate": 7.847060381355933e-06, "loss": 1.5591, "mean_token_accuracy": 0.6604719534516335, "num_tokens": 13079113.0, "step": 16260 }, { "epoch": 4.30667372881356, "grad_norm": 1.9180277585983276, "learning_rate": 7.846795550847457e-06, "loss": 1.0424, "mean_token_accuracy": 0.7461871579289436, "num_tokens": 13080667.0, "step": 16262 }, { "epoch": 4.307203389830509, "grad_norm": 1.607877492904663, "learning_rate": 7.846530720338984e-06, "loss": 1.3297, "mean_token_accuracy": 0.7019784078001976, "num_tokens": 13082713.0, "step": 16264 }, { "epoch": 4.307733050847458, "grad_norm": 1.8152174949645996, "learning_rate": 7.846265889830509e-06, "loss": 0.9497, "mean_token_accuracy": 0.7816269174218178, "num_tokens": 13084240.0, "step": 16266 }, { "epoch": 4.308262711864407, "grad_norm": 1.8240200281143188, "learning_rate": 7.846001059322036e-06, "loss": 1.1908, "mean_token_accuracy": 0.7117075026035309, "num_tokens": 13086019.0, "step": 16268 }, { "epoch": 4.3087923728813555, "grad_norm": 2.025251865386963, "learning_rate": 7.845736228813559e-06, "loss": 1.1772, "mean_token_accuracy": 0.7285347804427147, "num_tokens": 13087501.0, "step": 16270 }, { "epoch": 4.309322033898305, "grad_norm": 1.9498035907745361, "learning_rate": 7.845471398305085e-06, "loss": 0.9052, "mean_token_accuracy": 0.7652943059802055, "num_tokens": 13088842.0, "step": 16272 }, { "epoch": 4.309851694915254, "grad_norm": 2.358391761779785, "learning_rate": 7.84520656779661e-06, "loss": 1.5195, "mean_token_accuracy": 0.6358329355716705, "num_tokens": 13090338.0, "step": 16274 }, { "epoch": 4.310381355932203, "grad_norm": 2.174102783203125, "learning_rate": 7.844941737288137e-06, "loss": 1.9071, "mean_token_accuracy": 0.5866027399897575, "num_tokens": 13092352.0, "step": 16276 }, { "epoch": 4.310911016949152, "grad_norm": 1.5528682470321655, "learning_rate": 7.844676906779662e-06, "loss": 0.9621, "mean_token_accuracy": 0.7493689656257629, "num_tokens": 13094160.0, "step": 16278 }, { "epoch": 4.311440677966102, "grad_norm": 2.0042428970336914, "learning_rate": 7.844412076271187e-06, "loss": 1.15, "mean_token_accuracy": 0.7515361681580544, "num_tokens": 13095629.0, "step": 16280 }, { "epoch": 4.311970338983051, "grad_norm": 1.950808048248291, "learning_rate": 7.844147245762712e-06, "loss": 1.1948, "mean_token_accuracy": 0.723463624715805, "num_tokens": 13097308.0, "step": 16282 }, { "epoch": 4.3125, "grad_norm": 1.9170989990234375, "learning_rate": 7.843882415254238e-06, "loss": 1.2319, "mean_token_accuracy": 0.7154118865728378, "num_tokens": 13099125.0, "step": 16284 }, { "epoch": 4.313029661016949, "grad_norm": 1.8630262613296509, "learning_rate": 7.843617584745763e-06, "loss": 1.4313, "mean_token_accuracy": 0.668310884386301, "num_tokens": 13100879.0, "step": 16286 }, { "epoch": 4.313559322033898, "grad_norm": 2.1858291625976562, "learning_rate": 7.843352754237288e-06, "loss": 1.0366, "mean_token_accuracy": 0.7547910735011101, "num_tokens": 13102423.0, "step": 16288 }, { "epoch": 4.314088983050848, "grad_norm": 2.148822069168091, "learning_rate": 7.843087923728813e-06, "loss": 0.9062, "mean_token_accuracy": 0.769976794719696, "num_tokens": 13103815.0, "step": 16290 }, { "epoch": 4.314618644067797, "grad_norm": 2.3168792724609375, "learning_rate": 7.84282309322034e-06, "loss": 1.4482, "mean_token_accuracy": 0.6853951066732407, "num_tokens": 13105321.0, "step": 16292 }, { "epoch": 4.315148305084746, "grad_norm": 1.9660953283309937, "learning_rate": 7.842558262711865e-06, "loss": 1.3379, "mean_token_accuracy": 0.7093976438045502, "num_tokens": 13106865.0, "step": 16294 }, { "epoch": 4.315677966101695, "grad_norm": 2.3317317962646484, "learning_rate": 7.84229343220339e-06, "loss": 1.6787, "mean_token_accuracy": 0.6375647112727165, "num_tokens": 13108252.0, "step": 16296 }, { "epoch": 4.3162076271186445, "grad_norm": 1.9279249906539917, "learning_rate": 7.842028601694915e-06, "loss": 0.9067, "mean_token_accuracy": 0.7832157015800476, "num_tokens": 13109814.0, "step": 16298 }, { "epoch": 4.316737288135593, "grad_norm": 1.7195677757263184, "learning_rate": 7.841763771186441e-06, "loss": 1.0019, "mean_token_accuracy": 0.7566952332854271, "num_tokens": 13111613.0, "step": 16300 }, { "epoch": 4.317266949152542, "grad_norm": 1.9399391412734985, "learning_rate": 7.841498940677966e-06, "loss": 1.0205, "mean_token_accuracy": 0.744915097951889, "num_tokens": 13113122.0, "step": 16302 }, { "epoch": 4.317796610169491, "grad_norm": 1.983229160308838, "learning_rate": 7.841234110169493e-06, "loss": 1.4749, "mean_token_accuracy": 0.6965723484754562, "num_tokens": 13114597.0, "step": 16304 }, { "epoch": 4.31832627118644, "grad_norm": 1.8058228492736816, "learning_rate": 7.840969279661018e-06, "loss": 1.0319, "mean_token_accuracy": 0.7360404580831528, "num_tokens": 13116364.0, "step": 16306 }, { "epoch": 4.31885593220339, "grad_norm": 1.6282049417495728, "learning_rate": 7.840704449152543e-06, "loss": 0.887, "mean_token_accuracy": 0.7870735973119736, "num_tokens": 13118164.0, "step": 16308 }, { "epoch": 4.319385593220339, "grad_norm": 1.8863478899002075, "learning_rate": 7.840439618644068e-06, "loss": 1.1281, "mean_token_accuracy": 0.7212904319167137, "num_tokens": 13119584.0, "step": 16310 }, { "epoch": 4.319915254237288, "grad_norm": 2.1133434772491455, "learning_rate": 7.840174788135594e-06, "loss": 1.0505, "mean_token_accuracy": 0.7846993133425713, "num_tokens": 13121037.0, "step": 16312 }, { "epoch": 4.320444915254237, "grad_norm": 2.519268035888672, "learning_rate": 7.83990995762712e-06, "loss": 1.6583, "mean_token_accuracy": 0.6403207257390022, "num_tokens": 13122629.0, "step": 16314 }, { "epoch": 4.320974576271187, "grad_norm": 2.2193148136138916, "learning_rate": 7.839645127118644e-06, "loss": 1.1113, "mean_token_accuracy": 0.7619872540235519, "num_tokens": 13123948.0, "step": 16316 }, { "epoch": 4.321504237288136, "grad_norm": 2.361098527908325, "learning_rate": 7.83938029661017e-06, "loss": 1.6802, "mean_token_accuracy": 0.6560249961912632, "num_tokens": 13125583.0, "step": 16318 }, { "epoch": 4.322033898305085, "grad_norm": 2.3634791374206543, "learning_rate": 7.839115466101696e-06, "loss": 1.5274, "mean_token_accuracy": 0.6667177937924862, "num_tokens": 13127115.0, "step": 16320 }, { "epoch": 4.322563559322034, "grad_norm": 2.249282121658325, "learning_rate": 7.838850635593222e-06, "loss": 1.0652, "mean_token_accuracy": 0.7552710622549057, "num_tokens": 13128630.0, "step": 16322 }, { "epoch": 4.323093220338983, "grad_norm": 2.47355580329895, "learning_rate": 7.838585805084746e-06, "loss": 1.4314, "mean_token_accuracy": 0.682866059243679, "num_tokens": 13130121.0, "step": 16324 }, { "epoch": 4.3236228813559325, "grad_norm": 1.7584471702575684, "learning_rate": 7.838320974576272e-06, "loss": 1.1195, "mean_token_accuracy": 0.7349297106266022, "num_tokens": 13131612.0, "step": 16326 }, { "epoch": 4.3241525423728815, "grad_norm": 1.8252794742584229, "learning_rate": 7.838056144067797e-06, "loss": 1.1904, "mean_token_accuracy": 0.7093209102749825, "num_tokens": 13133220.0, "step": 16328 }, { "epoch": 4.3246822033898304, "grad_norm": 1.8144327402114868, "learning_rate": 7.837791313559324e-06, "loss": 1.4672, "mean_token_accuracy": 0.655887208878994, "num_tokens": 13134973.0, "step": 16330 }, { "epoch": 4.325211864406779, "grad_norm": 2.4186007976531982, "learning_rate": 7.837526483050849e-06, "loss": 1.3534, "mean_token_accuracy": 0.7114172950387001, "num_tokens": 13136665.0, "step": 16332 }, { "epoch": 4.325741525423728, "grad_norm": 2.0105302333831787, "learning_rate": 7.837261652542374e-06, "loss": 1.5514, "mean_token_accuracy": 0.6537456661462784, "num_tokens": 13138429.0, "step": 16334 }, { "epoch": 4.326271186440678, "grad_norm": 1.648599624633789, "learning_rate": 7.836996822033898e-06, "loss": 1.1057, "mean_token_accuracy": 0.729562908411026, "num_tokens": 13140237.0, "step": 16336 }, { "epoch": 4.326800847457627, "grad_norm": 2.151226043701172, "learning_rate": 7.836731991525425e-06, "loss": 0.9711, "mean_token_accuracy": 0.774348720908165, "num_tokens": 13141589.0, "step": 16338 }, { "epoch": 4.327330508474576, "grad_norm": 1.881874442100525, "learning_rate": 7.83646716101695e-06, "loss": 1.3415, "mean_token_accuracy": 0.6957112327218056, "num_tokens": 13143370.0, "step": 16340 }, { "epoch": 4.327860169491525, "grad_norm": 2.3765652179718018, "learning_rate": 7.836202330508475e-06, "loss": 0.8944, "mean_token_accuracy": 0.7706980034708977, "num_tokens": 13145080.0, "step": 16342 }, { "epoch": 4.328389830508475, "grad_norm": 2.6036269664764404, "learning_rate": 7.8359375e-06, "loss": 1.393, "mean_token_accuracy": 0.6952685788273811, "num_tokens": 13146341.0, "step": 16344 }, { "epoch": 4.328919491525424, "grad_norm": 1.6302289962768555, "learning_rate": 7.835672669491527e-06, "loss": 0.8666, "mean_token_accuracy": 0.7807264402508736, "num_tokens": 13147953.0, "step": 16346 }, { "epoch": 4.329449152542373, "grad_norm": 1.7447580099105835, "learning_rate": 7.835407838983051e-06, "loss": 1.3867, "mean_token_accuracy": 0.6865096837282181, "num_tokens": 13149767.0, "step": 16348 }, { "epoch": 4.329978813559322, "grad_norm": 2.2176520824432373, "learning_rate": 7.835143008474576e-06, "loss": 1.3336, "mean_token_accuracy": 0.6799603030085564, "num_tokens": 13151716.0, "step": 16350 }, { "epoch": 4.330508474576272, "grad_norm": 1.9425158500671387, "learning_rate": 7.834878177966101e-06, "loss": 0.9211, "mean_token_accuracy": 0.7544678449630737, "num_tokens": 13153340.0, "step": 16352 }, { "epoch": 4.331038135593221, "grad_norm": 1.9725483655929565, "learning_rate": 7.834613347457628e-06, "loss": 1.4257, "mean_token_accuracy": 0.6775721982121468, "num_tokens": 13155013.0, "step": 16354 }, { "epoch": 4.3315677966101696, "grad_norm": 2.354508638381958, "learning_rate": 7.834348516949153e-06, "loss": 1.8179, "mean_token_accuracy": 0.6201595738530159, "num_tokens": 13156463.0, "step": 16356 }, { "epoch": 4.3320974576271185, "grad_norm": 2.13193416595459, "learning_rate": 7.83408368644068e-06, "loss": 1.043, "mean_token_accuracy": 0.7488151341676712, "num_tokens": 13157968.0, "step": 16358 }, { "epoch": 4.3326271186440675, "grad_norm": 2.0807368755340576, "learning_rate": 7.833818855932204e-06, "loss": 1.178, "mean_token_accuracy": 0.7337293922901154, "num_tokens": 13159349.0, "step": 16360 }, { "epoch": 4.333156779661017, "grad_norm": 2.059112548828125, "learning_rate": 7.83355402542373e-06, "loss": 1.321, "mean_token_accuracy": 0.700148805975914, "num_tokens": 13160965.0, "step": 16362 }, { "epoch": 4.333686440677966, "grad_norm": 2.2647511959075928, "learning_rate": 7.833289194915254e-06, "loss": 1.3694, "mean_token_accuracy": 0.6855513975024223, "num_tokens": 13162381.0, "step": 16364 }, { "epoch": 4.334216101694915, "grad_norm": 2.0833740234375, "learning_rate": 7.833024364406781e-06, "loss": 1.2789, "mean_token_accuracy": 0.7051266580820084, "num_tokens": 13163879.0, "step": 16366 }, { "epoch": 4.334745762711864, "grad_norm": 2.098992347717285, "learning_rate": 7.832759533898306e-06, "loss": 0.9437, "mean_token_accuracy": 0.7691465765237808, "num_tokens": 13165498.0, "step": 16368 }, { "epoch": 4.335275423728813, "grad_norm": 2.013580083847046, "learning_rate": 7.83249470338983e-06, "loss": 1.5836, "mean_token_accuracy": 0.6523369327187538, "num_tokens": 13167554.0, "step": 16370 }, { "epoch": 4.335805084745763, "grad_norm": 1.6582183837890625, "learning_rate": 7.832229872881356e-06, "loss": 0.7873, "mean_token_accuracy": 0.7650264203548431, "num_tokens": 13169803.0, "step": 16372 }, { "epoch": 4.336334745762712, "grad_norm": 1.6100265979766846, "learning_rate": 7.831965042372882e-06, "loss": 1.1458, "mean_token_accuracy": 0.7202320694923401, "num_tokens": 13171457.0, "step": 16374 }, { "epoch": 4.336864406779661, "grad_norm": 2.1473631858825684, "learning_rate": 7.831700211864407e-06, "loss": 1.2358, "mean_token_accuracy": 0.7165786027908325, "num_tokens": 13173167.0, "step": 16376 }, { "epoch": 4.33739406779661, "grad_norm": 2.1030678749084473, "learning_rate": 7.831435381355932e-06, "loss": 1.1052, "mean_token_accuracy": 0.738263413310051, "num_tokens": 13174654.0, "step": 16378 }, { "epoch": 4.33792372881356, "grad_norm": 1.9745231866836548, "learning_rate": 7.831170550847457e-06, "loss": 1.5369, "mean_token_accuracy": 0.6529034599661827, "num_tokens": 13176249.0, "step": 16380 }, { "epoch": 4.338453389830509, "grad_norm": 1.8788707256317139, "learning_rate": 7.830905720338984e-06, "loss": 1.5494, "mean_token_accuracy": 0.6632149517536163, "num_tokens": 13177896.0, "step": 16382 }, { "epoch": 4.338983050847458, "grad_norm": 1.7172291278839111, "learning_rate": 7.830640889830509e-06, "loss": 0.6578, "mean_token_accuracy": 0.808507427573204, "num_tokens": 13179328.0, "step": 16384 }, { "epoch": 4.339512711864407, "grad_norm": 1.8646175861358643, "learning_rate": 7.830376059322035e-06, "loss": 1.0079, "mean_token_accuracy": 0.7643719539046288, "num_tokens": 13181173.0, "step": 16386 }, { "epoch": 4.3400423728813555, "grad_norm": 2.1860909461975098, "learning_rate": 7.83011122881356e-06, "loss": 1.2748, "mean_token_accuracy": 0.7094442322850227, "num_tokens": 13182795.0, "step": 16388 }, { "epoch": 4.340572033898305, "grad_norm": 2.0715255737304688, "learning_rate": 7.829846398305085e-06, "loss": 1.4302, "mean_token_accuracy": 0.6971249356865883, "num_tokens": 13184333.0, "step": 16390 }, { "epoch": 4.341101694915254, "grad_norm": 1.8066531419754028, "learning_rate": 7.82958156779661e-06, "loss": 1.2895, "mean_token_accuracy": 0.7087341845035553, "num_tokens": 13185746.0, "step": 16392 }, { "epoch": 4.341631355932203, "grad_norm": 2.7510740756988525, "learning_rate": 7.829316737288137e-06, "loss": 1.44, "mean_token_accuracy": 0.6718448475003242, "num_tokens": 13187070.0, "step": 16394 }, { "epoch": 4.342161016949152, "grad_norm": 1.9685267210006714, "learning_rate": 7.829051906779662e-06, "loss": 1.0468, "mean_token_accuracy": 0.7453137189149857, "num_tokens": 13188685.0, "step": 16396 }, { "epoch": 4.342690677966102, "grad_norm": 1.979736089706421, "learning_rate": 7.828787076271187e-06, "loss": 0.8255, "mean_token_accuracy": 0.786241888999939, "num_tokens": 13190594.0, "step": 16398 }, { "epoch": 4.343220338983051, "grad_norm": 1.7050156593322754, "learning_rate": 7.828522245762713e-06, "loss": 1.3031, "mean_token_accuracy": 0.7141212001442909, "num_tokens": 13192234.0, "step": 16400 }, { "epoch": 4.34375, "grad_norm": 1.8679699897766113, "learning_rate": 7.828257415254238e-06, "loss": 0.8028, "mean_token_accuracy": 0.7883923426270485, "num_tokens": 13193505.0, "step": 16402 }, { "epoch": 4.344279661016949, "grad_norm": 2.301393985748291, "learning_rate": 7.827992584745763e-06, "loss": 1.0914, "mean_token_accuracy": 0.7317022308707237, "num_tokens": 13195223.0, "step": 16404 }, { "epoch": 4.344809322033898, "grad_norm": 1.9376108646392822, "learning_rate": 7.827727754237288e-06, "loss": 1.2729, "mean_token_accuracy": 0.6964586079120636, "num_tokens": 13196874.0, "step": 16406 }, { "epoch": 4.345338983050848, "grad_norm": 2.488924264907837, "learning_rate": 7.827462923728815e-06, "loss": 1.2953, "mean_token_accuracy": 0.6940635666251183, "num_tokens": 13198361.0, "step": 16408 }, { "epoch": 4.345868644067797, "grad_norm": 2.111133337020874, "learning_rate": 7.82719809322034e-06, "loss": 1.1858, "mean_token_accuracy": 0.7169901952147484, "num_tokens": 13199744.0, "step": 16410 }, { "epoch": 4.346398305084746, "grad_norm": 2.110907554626465, "learning_rate": 7.826933262711866e-06, "loss": 1.5788, "mean_token_accuracy": 0.6436129361391068, "num_tokens": 13201274.0, "step": 16412 }, { "epoch": 4.346927966101695, "grad_norm": 1.6913068294525146, "learning_rate": 7.826668432203391e-06, "loss": 0.9184, "mean_token_accuracy": 0.7790902331471443, "num_tokens": 13203395.0, "step": 16414 }, { "epoch": 4.3474576271186445, "grad_norm": 2.1121315956115723, "learning_rate": 7.826403601694916e-06, "loss": 1.3636, "mean_token_accuracy": 0.6930340155959129, "num_tokens": 13205132.0, "step": 16416 }, { "epoch": 4.347987288135593, "grad_norm": 2.166644811630249, "learning_rate": 7.826138771186441e-06, "loss": 1.1591, "mean_token_accuracy": 0.726994477212429, "num_tokens": 13206840.0, "step": 16418 }, { "epoch": 4.348516949152542, "grad_norm": 2.099259614944458, "learning_rate": 7.825873940677968e-06, "loss": 1.3378, "mean_token_accuracy": 0.6825510412454605, "num_tokens": 13208547.0, "step": 16420 }, { "epoch": 4.349046610169491, "grad_norm": 1.912647008895874, "learning_rate": 7.825609110169492e-06, "loss": 1.1663, "mean_token_accuracy": 0.7173984348773956, "num_tokens": 13210187.0, "step": 16422 }, { "epoch": 4.34957627118644, "grad_norm": 2.1286745071411133, "learning_rate": 7.825344279661017e-06, "loss": 1.1276, "mean_token_accuracy": 0.7238412201404572, "num_tokens": 13211598.0, "step": 16424 }, { "epoch": 4.35010593220339, "grad_norm": 1.58607017993927, "learning_rate": 7.825079449152542e-06, "loss": 1.0399, "mean_token_accuracy": 0.7644436806440353, "num_tokens": 13213206.0, "step": 16426 }, { "epoch": 4.350635593220339, "grad_norm": 2.027269124984741, "learning_rate": 7.824814618644069e-06, "loss": 1.3033, "mean_token_accuracy": 0.7125931419432163, "num_tokens": 13214604.0, "step": 16428 }, { "epoch": 4.351165254237288, "grad_norm": 2.2177176475524902, "learning_rate": 7.824549788135594e-06, "loss": 1.332, "mean_token_accuracy": 0.6872466877102852, "num_tokens": 13216158.0, "step": 16430 }, { "epoch": 4.351694915254237, "grad_norm": 2.0938851833343506, "learning_rate": 7.824284957627119e-06, "loss": 1.5755, "mean_token_accuracy": 0.6377224028110504, "num_tokens": 13217973.0, "step": 16432 }, { "epoch": 4.352224576271187, "grad_norm": 1.9459383487701416, "learning_rate": 7.824020127118644e-06, "loss": 1.1289, "mean_token_accuracy": 0.7054281607270241, "num_tokens": 13219496.0, "step": 16434 }, { "epoch": 4.352754237288136, "grad_norm": 1.8827643394470215, "learning_rate": 7.82375529661017e-06, "loss": 1.0971, "mean_token_accuracy": 0.7524904012680054, "num_tokens": 13221112.0, "step": 16436 }, { "epoch": 4.353283898305085, "grad_norm": 1.3869298696517944, "learning_rate": 7.823490466101695e-06, "loss": 1.0869, "mean_token_accuracy": 0.7458896189928055, "num_tokens": 13223361.0, "step": 16438 }, { "epoch": 4.353813559322034, "grad_norm": 1.8988111019134521, "learning_rate": 7.823225635593222e-06, "loss": 1.2659, "mean_token_accuracy": 0.7055575549602509, "num_tokens": 13224862.0, "step": 16440 }, { "epoch": 4.354343220338983, "grad_norm": 3.24483060836792, "learning_rate": 7.822960805084747e-06, "loss": 1.1454, "mean_token_accuracy": 0.7426673099398613, "num_tokens": 13226138.0, "step": 16442 }, { "epoch": 4.3548728813559325, "grad_norm": 1.9428699016571045, "learning_rate": 7.822695974576272e-06, "loss": 1.3563, "mean_token_accuracy": 0.6843095570802689, "num_tokens": 13227785.0, "step": 16444 }, { "epoch": 4.3554025423728815, "grad_norm": 2.459608316421509, "learning_rate": 7.822431144067797e-06, "loss": 1.4066, "mean_token_accuracy": 0.7002210766077042, "num_tokens": 13229229.0, "step": 16446 }, { "epoch": 4.3559322033898304, "grad_norm": 1.9809236526489258, "learning_rate": 7.822166313559323e-06, "loss": 1.0604, "mean_token_accuracy": 0.7420192062854767, "num_tokens": 13230861.0, "step": 16448 }, { "epoch": 4.356461864406779, "grad_norm": 1.9048060178756714, "learning_rate": 7.821901483050848e-06, "loss": 0.9368, "mean_token_accuracy": 0.7707809284329414, "num_tokens": 13232415.0, "step": 16450 }, { "epoch": 4.356991525423728, "grad_norm": 1.8631030321121216, "learning_rate": 7.821636652542373e-06, "loss": 1.1967, "mean_token_accuracy": 0.7230966985225677, "num_tokens": 13234146.0, "step": 16452 }, { "epoch": 4.357521186440678, "grad_norm": 2.311887264251709, "learning_rate": 7.821371822033898e-06, "loss": 1.4369, "mean_token_accuracy": 0.6693291813135147, "num_tokens": 13235787.0, "step": 16454 }, { "epoch": 4.358050847457627, "grad_norm": 2.0190935134887695, "learning_rate": 7.821106991525425e-06, "loss": 1.0435, "mean_token_accuracy": 0.752276822924614, "num_tokens": 13237269.0, "step": 16456 }, { "epoch": 4.358580508474576, "grad_norm": 2.392827033996582, "learning_rate": 7.82084216101695e-06, "loss": 1.4702, "mean_token_accuracy": 0.6735830828547478, "num_tokens": 13238782.0, "step": 16458 }, { "epoch": 4.359110169491525, "grad_norm": 1.6540101766586304, "learning_rate": 7.820577330508475e-06, "loss": 1.0542, "mean_token_accuracy": 0.7489940822124481, "num_tokens": 13240827.0, "step": 16460 }, { "epoch": 4.359639830508475, "grad_norm": 2.2654831409454346, "learning_rate": 7.8203125e-06, "loss": 1.6311, "mean_token_accuracy": 0.6413698419928551, "num_tokens": 13242459.0, "step": 16462 }, { "epoch": 4.360169491525424, "grad_norm": 1.8018429279327393, "learning_rate": 7.820047669491526e-06, "loss": 1.1423, "mean_token_accuracy": 0.7217362970113754, "num_tokens": 13244092.0, "step": 16464 }, { "epoch": 4.360699152542373, "grad_norm": 2.4506537914276123, "learning_rate": 7.819782838983051e-06, "loss": 1.8008, "mean_token_accuracy": 0.6392794698476791, "num_tokens": 13245633.0, "step": 16466 }, { "epoch": 4.361228813559322, "grad_norm": 2.0709755420684814, "learning_rate": 7.819518008474578e-06, "loss": 1.5066, "mean_token_accuracy": 0.6749728471040726, "num_tokens": 13247550.0, "step": 16468 }, { "epoch": 4.361758474576272, "grad_norm": 2.2049968242645264, "learning_rate": 7.819253177966103e-06, "loss": 1.0028, "mean_token_accuracy": 0.7499571293592453, "num_tokens": 13249162.0, "step": 16470 }, { "epoch": 4.362288135593221, "grad_norm": 2.143124580383301, "learning_rate": 7.818988347457628e-06, "loss": 1.4749, "mean_token_accuracy": 0.6838839426636696, "num_tokens": 13250648.0, "step": 16472 }, { "epoch": 4.3628177966101696, "grad_norm": 1.9216336011886597, "learning_rate": 7.818723516949152e-06, "loss": 1.3079, "mean_token_accuracy": 0.7152419462800026, "num_tokens": 13252280.0, "step": 16474 }, { "epoch": 4.3633474576271185, "grad_norm": 1.7266088724136353, "learning_rate": 7.818458686440679e-06, "loss": 1.129, "mean_token_accuracy": 0.7454121857881546, "num_tokens": 13253814.0, "step": 16476 }, { "epoch": 4.3638771186440675, "grad_norm": 2.1177139282226562, "learning_rate": 7.818193855932204e-06, "loss": 1.1826, "mean_token_accuracy": 0.7167643830180168, "num_tokens": 13255353.0, "step": 16478 }, { "epoch": 4.364406779661017, "grad_norm": 1.8106720447540283, "learning_rate": 7.817929025423729e-06, "loss": 1.1028, "mean_token_accuracy": 0.726651132106781, "num_tokens": 13256798.0, "step": 16480 }, { "epoch": 4.364936440677966, "grad_norm": 1.9339265823364258, "learning_rate": 7.817664194915256e-06, "loss": 1.4978, "mean_token_accuracy": 0.6782696209847927, "num_tokens": 13258680.0, "step": 16482 }, { "epoch": 4.365466101694915, "grad_norm": 2.027540683746338, "learning_rate": 7.81739936440678e-06, "loss": 1.242, "mean_token_accuracy": 0.7158541306853294, "num_tokens": 13260137.0, "step": 16484 }, { "epoch": 4.365995762711864, "grad_norm": 1.7641373872756958, "learning_rate": 7.817134533898305e-06, "loss": 1.0009, "mean_token_accuracy": 0.73090510815382, "num_tokens": 13261902.0, "step": 16486 }, { "epoch": 4.366525423728813, "grad_norm": 2.1862118244171143, "learning_rate": 7.81686970338983e-06, "loss": 1.2691, "mean_token_accuracy": 0.7207878604531288, "num_tokens": 13263444.0, "step": 16488 }, { "epoch": 4.367055084745763, "grad_norm": 2.2902395725250244, "learning_rate": 7.816604872881357e-06, "loss": 1.1593, "mean_token_accuracy": 0.7302017509937286, "num_tokens": 13264843.0, "step": 16490 }, { "epoch": 4.367584745762712, "grad_norm": 1.9879236221313477, "learning_rate": 7.816340042372882e-06, "loss": 1.8744, "mean_token_accuracy": 0.588500514626503, "num_tokens": 13266625.0, "step": 16492 }, { "epoch": 4.368114406779661, "grad_norm": 2.1911556720733643, "learning_rate": 7.816075211864409e-06, "loss": 1.1086, "mean_token_accuracy": 0.7165097594261169, "num_tokens": 13267908.0, "step": 16494 }, { "epoch": 4.36864406779661, "grad_norm": 2.111438274383545, "learning_rate": 7.815810381355933e-06, "loss": 0.9309, "mean_token_accuracy": 0.7817188054323196, "num_tokens": 13269369.0, "step": 16496 }, { "epoch": 4.36917372881356, "grad_norm": 2.5285544395446777, "learning_rate": 7.815545550847458e-06, "loss": 1.0294, "mean_token_accuracy": 0.7485950663685799, "num_tokens": 13270820.0, "step": 16498 }, { "epoch": 4.369703389830509, "grad_norm": 2.4438509941101074, "learning_rate": 7.815280720338983e-06, "loss": 1.6632, "step": 16500 }, { "epoch": 4.369703389830509, "eval_loss": 1.313454270362854, "eval_mean_token_accuracy": 0.7014390508849899, "eval_num_tokens": 13272519.0, "eval_runtime": 48.2126, "eval_samples_per_second": 6.388, "eval_steps_per_second": 6.388, "step": 16500 }, { "epoch": 4.370233050847458, "grad_norm": 1.9726808071136475, "learning_rate": 7.81501588983051e-06, "loss": 1.1271, "mean_token_accuracy": 0.6829047203063965, "num_tokens": 13274037.0, "step": 16502 }, { "epoch": 4.370762711864407, "grad_norm": 2.4873342514038086, "learning_rate": 7.814751059322035e-06, "loss": 1.601, "mean_token_accuracy": 0.6344557106494904, "num_tokens": 13275461.0, "step": 16504 }, { "epoch": 4.3712923728813555, "grad_norm": 2.404635429382324, "learning_rate": 7.81448622881356e-06, "loss": 1.2352, "mean_token_accuracy": 0.6958513483405113, "num_tokens": 13277079.0, "step": 16506 }, { "epoch": 4.371822033898305, "grad_norm": 2.1142728328704834, "learning_rate": 7.814221398305085e-06, "loss": 1.4088, "mean_token_accuracy": 0.6760592013597488, "num_tokens": 13278636.0, "step": 16508 }, { "epoch": 4.372351694915254, "grad_norm": 2.3146555423736572, "learning_rate": 7.813956567796611e-06, "loss": 1.5888, "mean_token_accuracy": 0.6481237187981606, "num_tokens": 13280361.0, "step": 16510 }, { "epoch": 4.372881355932203, "grad_norm": 2.0456180572509766, "learning_rate": 7.813691737288136e-06, "loss": 0.9676, "mean_token_accuracy": 0.7793792933225632, "num_tokens": 13281639.0, "step": 16512 }, { "epoch": 4.373411016949152, "grad_norm": 2.0434207916259766, "learning_rate": 7.813426906779661e-06, "loss": 1.4108, "mean_token_accuracy": 0.6932819560170174, "num_tokens": 13283176.0, "step": 16514 }, { "epoch": 4.373940677966102, "grad_norm": 2.4683916568756104, "learning_rate": 7.813162076271186e-06, "loss": 0.9396, "mean_token_accuracy": 0.7514224424958229, "num_tokens": 13284745.0, "step": 16516 }, { "epoch": 4.374470338983051, "grad_norm": 2.219045877456665, "learning_rate": 7.812897245762713e-06, "loss": 1.0183, "mean_token_accuracy": 0.7406523749232292, "num_tokens": 13286255.0, "step": 16518 }, { "epoch": 4.375, "grad_norm": 1.8557476997375488, "learning_rate": 7.812632415254238e-06, "loss": 1.5536, "mean_token_accuracy": 0.6548589840531349, "num_tokens": 13288040.0, "step": 16520 }, { "epoch": 4.375529661016949, "grad_norm": 2.161757230758667, "learning_rate": 7.812367584745764e-06, "loss": 1.2079, "mean_token_accuracy": 0.7151528000831604, "num_tokens": 13289410.0, "step": 16522 }, { "epoch": 4.376059322033898, "grad_norm": 1.9297555685043335, "learning_rate": 7.81210275423729e-06, "loss": 1.1896, "mean_token_accuracy": 0.72679802775383, "num_tokens": 13291230.0, "step": 16524 }, { "epoch": 4.376588983050848, "grad_norm": 1.960248589515686, "learning_rate": 7.811837923728814e-06, "loss": 1.1432, "mean_token_accuracy": 0.7497575357556343, "num_tokens": 13292771.0, "step": 16526 }, { "epoch": 4.377118644067797, "grad_norm": 2.213810443878174, "learning_rate": 7.811573093220339e-06, "loss": 1.2775, "mean_token_accuracy": 0.6821921020746231, "num_tokens": 13294243.0, "step": 16528 }, { "epoch": 4.377648305084746, "grad_norm": 2.291642904281616, "learning_rate": 7.811308262711866e-06, "loss": 1.1538, "mean_token_accuracy": 0.735080398619175, "num_tokens": 13295691.0, "step": 16530 }, { "epoch": 4.378177966101695, "grad_norm": 1.998180866241455, "learning_rate": 7.81104343220339e-06, "loss": 1.253, "mean_token_accuracy": 0.7107740789651871, "num_tokens": 13297201.0, "step": 16532 }, { "epoch": 4.3787076271186445, "grad_norm": 2.046346664428711, "learning_rate": 7.810778601694916e-06, "loss": 1.3521, "mean_token_accuracy": 0.7040534615516663, "num_tokens": 13298755.0, "step": 16534 }, { "epoch": 4.379237288135593, "grad_norm": 2.799515724182129, "learning_rate": 7.81051377118644e-06, "loss": 1.2303, "mean_token_accuracy": 0.7274738028645515, "num_tokens": 13300174.0, "step": 16536 }, { "epoch": 4.379766949152542, "grad_norm": 1.8607538938522339, "learning_rate": 7.810248940677967e-06, "loss": 1.1722, "mean_token_accuracy": 0.7327890992164612, "num_tokens": 13301660.0, "step": 16538 }, { "epoch": 4.380296610169491, "grad_norm": 1.981704831123352, "learning_rate": 7.809984110169492e-06, "loss": 1.1356, "mean_token_accuracy": 0.7569917887449265, "num_tokens": 13303245.0, "step": 16540 }, { "epoch": 4.38082627118644, "grad_norm": 2.076845169067383, "learning_rate": 7.809719279661017e-06, "loss": 1.3261, "mean_token_accuracy": 0.6975060850381851, "num_tokens": 13304914.0, "step": 16542 }, { "epoch": 4.38135593220339, "grad_norm": 1.926861047744751, "learning_rate": 7.809454449152542e-06, "loss": 1.4463, "mean_token_accuracy": 0.6774891912937164, "num_tokens": 13306470.0, "step": 16544 }, { "epoch": 4.381885593220339, "grad_norm": 1.8657920360565186, "learning_rate": 7.809189618644069e-06, "loss": 0.9466, "mean_token_accuracy": 0.7559924274682999, "num_tokens": 13307960.0, "step": 16546 }, { "epoch": 4.382415254237288, "grad_norm": 1.9140360355377197, "learning_rate": 7.808924788135593e-06, "loss": 1.0255, "mean_token_accuracy": 0.748777449131012, "num_tokens": 13309671.0, "step": 16548 }, { "epoch": 4.382944915254237, "grad_norm": 1.8777095079421997, "learning_rate": 7.80865995762712e-06, "loss": 1.1457, "mean_token_accuracy": 0.7358994036912918, "num_tokens": 13311103.0, "step": 16550 }, { "epoch": 4.383474576271187, "grad_norm": 2.023376941680908, "learning_rate": 7.808395127118645e-06, "loss": 1.3857, "mean_token_accuracy": 0.6878102719783783, "num_tokens": 13312511.0, "step": 16552 }, { "epoch": 4.384004237288136, "grad_norm": 2.1175684928894043, "learning_rate": 7.80813029661017e-06, "loss": 1.3745, "mean_token_accuracy": 0.6715496256947517, "num_tokens": 13314235.0, "step": 16554 }, { "epoch": 4.384533898305085, "grad_norm": 1.781345248222351, "learning_rate": 7.807865466101695e-06, "loss": 1.0263, "mean_token_accuracy": 0.7474318295717239, "num_tokens": 13315907.0, "step": 16556 }, { "epoch": 4.385063559322034, "grad_norm": 1.8751782178878784, "learning_rate": 7.807600635593222e-06, "loss": 0.9591, "mean_token_accuracy": 0.771702453494072, "num_tokens": 13317452.0, "step": 16558 }, { "epoch": 4.385593220338983, "grad_norm": 2.3057198524475098, "learning_rate": 7.807335805084746e-06, "loss": 1.6364, "mean_token_accuracy": 0.6728250831365585, "num_tokens": 13318981.0, "step": 16560 }, { "epoch": 4.3861228813559325, "grad_norm": 2.189053773880005, "learning_rate": 7.807070974576271e-06, "loss": 1.2222, "mean_token_accuracy": 0.706864982843399, "num_tokens": 13320623.0, "step": 16562 }, { "epoch": 4.3866525423728815, "grad_norm": 2.136671543121338, "learning_rate": 7.806806144067798e-06, "loss": 1.3754, "mean_token_accuracy": 0.6757506728172302, "num_tokens": 13322382.0, "step": 16564 }, { "epoch": 4.3871822033898304, "grad_norm": 2.0621984004974365, "learning_rate": 7.806541313559323e-06, "loss": 1.6777, "mean_token_accuracy": 0.6377159655094147, "num_tokens": 13324020.0, "step": 16566 }, { "epoch": 4.387711864406779, "grad_norm": 2.2814059257507324, "learning_rate": 7.806276483050848e-06, "loss": 1.7927, "mean_token_accuracy": 0.6219148710370064, "num_tokens": 13325425.0, "step": 16568 }, { "epoch": 4.388241525423728, "grad_norm": 2.534027099609375, "learning_rate": 7.806011652542373e-06, "loss": 1.026, "mean_token_accuracy": 0.768924355506897, "num_tokens": 13326972.0, "step": 16570 }, { "epoch": 4.388771186440678, "grad_norm": 1.5187908411026, "learning_rate": 7.8057468220339e-06, "loss": 0.9449, "mean_token_accuracy": 0.7624446526169777, "num_tokens": 13329160.0, "step": 16572 }, { "epoch": 4.389300847457627, "grad_norm": 2.0217478275299072, "learning_rate": 7.805481991525424e-06, "loss": 0.6843, "mean_token_accuracy": 0.8008832409977913, "num_tokens": 13330855.0, "step": 16574 }, { "epoch": 4.389830508474576, "grad_norm": 1.8931224346160889, "learning_rate": 7.805217161016951e-06, "loss": 1.3437, "mean_token_accuracy": 0.6936537250876427, "num_tokens": 13332425.0, "step": 16576 }, { "epoch": 4.390360169491525, "grad_norm": 1.6068741083145142, "learning_rate": 7.804952330508476e-06, "loss": 0.8183, "mean_token_accuracy": 0.7890482395887375, "num_tokens": 13334002.0, "step": 16578 }, { "epoch": 4.390889830508475, "grad_norm": 2.302196502685547, "learning_rate": 7.8046875e-06, "loss": 1.2156, "mean_token_accuracy": 0.7077320516109467, "num_tokens": 13335556.0, "step": 16580 }, { "epoch": 4.391419491525424, "grad_norm": 1.8508210182189941, "learning_rate": 7.804422669491526e-06, "loss": 1.4337, "mean_token_accuracy": 0.6582527905702591, "num_tokens": 13337270.0, "step": 16582 }, { "epoch": 4.391949152542373, "grad_norm": 2.0490682125091553, "learning_rate": 7.804157838983052e-06, "loss": 1.4064, "mean_token_accuracy": 0.6446553468704224, "num_tokens": 13339002.0, "step": 16584 }, { "epoch": 4.392478813559322, "grad_norm": 2.4562020301818848, "learning_rate": 7.803893008474577e-06, "loss": 1.4556, "mean_token_accuracy": 0.7189974039793015, "num_tokens": 13340421.0, "step": 16586 }, { "epoch": 4.393008474576272, "grad_norm": 2.138040542602539, "learning_rate": 7.803628177966102e-06, "loss": 1.3117, "mean_token_accuracy": 0.7122947871685028, "num_tokens": 13342150.0, "step": 16588 }, { "epoch": 4.393538135593221, "grad_norm": 2.6061947345733643, "learning_rate": 7.803363347457627e-06, "loss": 0.9728, "mean_token_accuracy": 0.7627294436097145, "num_tokens": 13343648.0, "step": 16590 }, { "epoch": 4.3940677966101696, "grad_norm": 1.5895562171936035, "learning_rate": 7.803098516949154e-06, "loss": 0.9222, "mean_token_accuracy": 0.7863991037011147, "num_tokens": 13345186.0, "step": 16592 }, { "epoch": 4.3945974576271185, "grad_norm": 1.8958531618118286, "learning_rate": 7.802833686440679e-06, "loss": 1.3851, "mean_token_accuracy": 0.6911488398909569, "num_tokens": 13347041.0, "step": 16594 }, { "epoch": 4.3951271186440675, "grad_norm": 2.546266555786133, "learning_rate": 7.802568855932204e-06, "loss": 0.8062, "mean_token_accuracy": 0.7848711758852005, "num_tokens": 13348434.0, "step": 16596 }, { "epoch": 4.395656779661017, "grad_norm": 1.924924373626709, "learning_rate": 7.802304025423729e-06, "loss": 1.1128, "mean_token_accuracy": 0.7255501449108124, "num_tokens": 13350165.0, "step": 16598 }, { "epoch": 4.396186440677966, "grad_norm": 2.112262725830078, "learning_rate": 7.802039194915255e-06, "loss": 1.2278, "mean_token_accuracy": 0.7278055474162102, "num_tokens": 13351599.0, "step": 16600 }, { "epoch": 4.396716101694915, "grad_norm": 1.942581057548523, "learning_rate": 7.80177436440678e-06, "loss": 1.1444, "mean_token_accuracy": 0.731435514986515, "num_tokens": 13353117.0, "step": 16602 }, { "epoch": 4.397245762711864, "grad_norm": 1.924113154411316, "learning_rate": 7.801509533898307e-06, "loss": 1.3867, "mean_token_accuracy": 0.6995733007788658, "num_tokens": 13354692.0, "step": 16604 }, { "epoch": 4.397775423728813, "grad_norm": 1.931756615638733, "learning_rate": 7.801244703389832e-06, "loss": 1.155, "mean_token_accuracy": 0.7155156582593918, "num_tokens": 13356509.0, "step": 16606 }, { "epoch": 4.398305084745763, "grad_norm": 2.2139806747436523, "learning_rate": 7.800979872881357e-06, "loss": 1.2813, "mean_token_accuracy": 0.711082324385643, "num_tokens": 13358058.0, "step": 16608 }, { "epoch": 4.398834745762712, "grad_norm": 1.6767306327819824, "learning_rate": 7.800715042372882e-06, "loss": 1.2043, "mean_token_accuracy": 0.7203500345349312, "num_tokens": 13359868.0, "step": 16610 }, { "epoch": 4.399364406779661, "grad_norm": 2.41961407661438, "learning_rate": 7.800450211864408e-06, "loss": 1.2064, "mean_token_accuracy": 0.7113370001316071, "num_tokens": 13361627.0, "step": 16612 }, { "epoch": 4.39989406779661, "grad_norm": 1.9462473392486572, "learning_rate": 7.800185381355933e-06, "loss": 1.4187, "mean_token_accuracy": 0.6753217428922653, "num_tokens": 13363183.0, "step": 16614 }, { "epoch": 4.40042372881356, "grad_norm": 2.1759209632873535, "learning_rate": 7.799920550847458e-06, "loss": 1.4145, "mean_token_accuracy": 0.6663404330611229, "num_tokens": 13364739.0, "step": 16616 }, { "epoch": 4.400953389830509, "grad_norm": 2.095834493637085, "learning_rate": 7.799655720338983e-06, "loss": 1.2448, "mean_token_accuracy": 0.7193634547293186, "num_tokens": 13366235.0, "step": 16618 }, { "epoch": 4.401483050847458, "grad_norm": 2.2987992763519287, "learning_rate": 7.79939088983051e-06, "loss": 1.7067, "mean_token_accuracy": 0.6392332538962364, "num_tokens": 13367802.0, "step": 16620 }, { "epoch": 4.402012711864407, "grad_norm": 2.1030187606811523, "learning_rate": 7.799126059322034e-06, "loss": 1.5729, "mean_token_accuracy": 0.654667429625988, "num_tokens": 13369614.0, "step": 16622 }, { "epoch": 4.4025423728813555, "grad_norm": 1.9320569038391113, "learning_rate": 7.79886122881356e-06, "loss": 1.2584, "mean_token_accuracy": 0.6917813159525394, "num_tokens": 13371220.0, "step": 16624 }, { "epoch": 4.403072033898305, "grad_norm": 2.409515380859375, "learning_rate": 7.798596398305084e-06, "loss": 1.2485, "mean_token_accuracy": 0.691951721906662, "num_tokens": 13372936.0, "step": 16626 }, { "epoch": 4.403601694915254, "grad_norm": 2.2213807106018066, "learning_rate": 7.798331567796611e-06, "loss": 1.2672, "mean_token_accuracy": 0.7004934549331665, "num_tokens": 13374394.0, "step": 16628 }, { "epoch": 4.404131355932203, "grad_norm": 2.548067808151245, "learning_rate": 7.798066737288136e-06, "loss": 1.1327, "mean_token_accuracy": 0.7192516401410103, "num_tokens": 13375812.0, "step": 16630 }, { "epoch": 4.404661016949152, "grad_norm": 1.9247660636901855, "learning_rate": 7.797801906779663e-06, "loss": 0.7288, "mean_token_accuracy": 0.8090417236089706, "num_tokens": 13377146.0, "step": 16632 }, { "epoch": 4.405190677966102, "grad_norm": 2.3868865966796875, "learning_rate": 7.797537076271186e-06, "loss": 1.2349, "mean_token_accuracy": 0.6943170353770256, "num_tokens": 13379031.0, "step": 16634 }, { "epoch": 4.405720338983051, "grad_norm": 1.8460962772369385, "learning_rate": 7.797272245762712e-06, "loss": 1.579, "mean_token_accuracy": 0.6755017563700676, "num_tokens": 13380580.0, "step": 16636 }, { "epoch": 4.40625, "grad_norm": 2.2571680545806885, "learning_rate": 7.797007415254237e-06, "loss": 1.3077, "mean_token_accuracy": 0.7113237306475639, "num_tokens": 13382022.0, "step": 16638 }, { "epoch": 4.406779661016949, "grad_norm": 2.4180829524993896, "learning_rate": 7.796742584745764e-06, "loss": 1.607, "mean_token_accuracy": 0.6420261785387993, "num_tokens": 13383420.0, "step": 16640 }, { "epoch": 4.407309322033898, "grad_norm": 2.2097034454345703, "learning_rate": 7.796477754237289e-06, "loss": 1.3294, "mean_token_accuracy": 0.712707445025444, "num_tokens": 13384882.0, "step": 16642 }, { "epoch": 4.407838983050848, "grad_norm": 2.1016287803649902, "learning_rate": 7.796212923728814e-06, "loss": 1.289, "mean_token_accuracy": 0.7156090661883354, "num_tokens": 13386622.0, "step": 16644 }, { "epoch": 4.408368644067797, "grad_norm": 2.569582462310791, "learning_rate": 7.795948093220339e-06, "loss": 1.5805, "mean_token_accuracy": 0.6470145769417286, "num_tokens": 13388194.0, "step": 16646 }, { "epoch": 4.408898305084746, "grad_norm": 2.0262796878814697, "learning_rate": 7.795683262711865e-06, "loss": 1.2391, "mean_token_accuracy": 0.7284190878272057, "num_tokens": 13389914.0, "step": 16648 }, { "epoch": 4.409427966101695, "grad_norm": 1.7950912714004517, "learning_rate": 7.79541843220339e-06, "loss": 1.5148, "mean_token_accuracy": 0.6664897501468658, "num_tokens": 13391898.0, "step": 16650 }, { "epoch": 4.4099576271186445, "grad_norm": 1.783844232559204, "learning_rate": 7.795153601694915e-06, "loss": 0.9404, "mean_token_accuracy": 0.7884300425648689, "num_tokens": 13393505.0, "step": 16652 }, { "epoch": 4.410487288135593, "grad_norm": 2.258697032928467, "learning_rate": 7.794888771186442e-06, "loss": 1.139, "mean_token_accuracy": 0.7243052199482918, "num_tokens": 13394879.0, "step": 16654 }, { "epoch": 4.411016949152542, "grad_norm": 1.885172724723816, "learning_rate": 7.794623940677967e-06, "loss": 1.099, "mean_token_accuracy": 0.7522841989994049, "num_tokens": 13396693.0, "step": 16656 }, { "epoch": 4.411546610169491, "grad_norm": 2.4396235942840576, "learning_rate": 7.794359110169493e-06, "loss": 1.6075, "mean_token_accuracy": 0.661637969315052, "num_tokens": 13398445.0, "step": 16658 }, { "epoch": 4.41207627118644, "grad_norm": 2.069531202316284, "learning_rate": 7.794094279661018e-06, "loss": 1.3449, "mean_token_accuracy": 0.6832903772592545, "num_tokens": 13400089.0, "step": 16660 }, { "epoch": 4.41260593220339, "grad_norm": 2.134657382965088, "learning_rate": 7.793829449152543e-06, "loss": 1.435, "mean_token_accuracy": 0.690010741353035, "num_tokens": 13401885.0, "step": 16662 }, { "epoch": 4.413135593220339, "grad_norm": 1.3208402395248413, "learning_rate": 7.793564618644068e-06, "loss": 1.7929, "mean_token_accuracy": 0.6129600051790476, "num_tokens": 13404316.0, "step": 16664 }, { "epoch": 4.413665254237288, "grad_norm": 1.7866665124893188, "learning_rate": 7.793299788135595e-06, "loss": 1.2519, "mean_token_accuracy": 0.7230157256126404, "num_tokens": 13406241.0, "step": 16666 }, { "epoch": 4.414194915254237, "grad_norm": 1.432399034500122, "learning_rate": 7.79303495762712e-06, "loss": 1.0878, "mean_token_accuracy": 0.7562664300203323, "num_tokens": 13408150.0, "step": 16668 }, { "epoch": 4.414724576271187, "grad_norm": 1.8057671785354614, "learning_rate": 7.792770127118645e-06, "loss": 0.9998, "mean_token_accuracy": 0.7664728537201881, "num_tokens": 13409861.0, "step": 16670 }, { "epoch": 4.415254237288136, "grad_norm": 2.1979167461395264, "learning_rate": 7.79250529661017e-06, "loss": 1.2762, "mean_token_accuracy": 0.7176588401198387, "num_tokens": 13411232.0, "step": 16672 }, { "epoch": 4.415783898305085, "grad_norm": 2.0388951301574707, "learning_rate": 7.792240466101696e-06, "loss": 1.3084, "mean_token_accuracy": 0.7180139645934105, "num_tokens": 13412761.0, "step": 16674 }, { "epoch": 4.416313559322034, "grad_norm": 1.2937872409820557, "learning_rate": 7.791975635593221e-06, "loss": 1.2075, "mean_token_accuracy": 0.6624778881669044, "num_tokens": 13415545.0, "step": 16676 }, { "epoch": 4.416843220338983, "grad_norm": 2.164547920227051, "learning_rate": 7.791710805084746e-06, "loss": 1.5141, "mean_token_accuracy": 0.6671023890376091, "num_tokens": 13417301.0, "step": 16678 }, { "epoch": 4.4173728813559325, "grad_norm": 1.8206068277359009, "learning_rate": 7.791445974576271e-06, "loss": 0.9422, "mean_token_accuracy": 0.7683722302317619, "num_tokens": 13418873.0, "step": 16680 }, { "epoch": 4.4179025423728815, "grad_norm": 2.5442559719085693, "learning_rate": 7.791181144067798e-06, "loss": 0.786, "mean_token_accuracy": 0.7999795228242874, "num_tokens": 13420310.0, "step": 16682 }, { "epoch": 4.4184322033898304, "grad_norm": 1.7034552097320557, "learning_rate": 7.790916313559323e-06, "loss": 1.1135, "mean_token_accuracy": 0.73826102912426, "num_tokens": 13422167.0, "step": 16684 }, { "epoch": 4.418961864406779, "grad_norm": 1.8169547319412231, "learning_rate": 7.79065148305085e-06, "loss": 1.1172, "mean_token_accuracy": 0.7252485454082489, "num_tokens": 13424163.0, "step": 16686 }, { "epoch": 4.419491525423728, "grad_norm": 2.2941246032714844, "learning_rate": 7.790386652542372e-06, "loss": 1.2263, "mean_token_accuracy": 0.6813866943120956, "num_tokens": 13425906.0, "step": 16688 }, { "epoch": 4.420021186440678, "grad_norm": 2.480797052383423, "learning_rate": 7.790121822033899e-06, "loss": 1.3674, "mean_token_accuracy": 0.7066801711916924, "num_tokens": 13427346.0, "step": 16690 }, { "epoch": 4.420550847457627, "grad_norm": 2.1394078731536865, "learning_rate": 7.789856991525424e-06, "loss": 1.1205, "mean_token_accuracy": 0.7346707656979561, "num_tokens": 13428871.0, "step": 16692 }, { "epoch": 4.421080508474576, "grad_norm": 2.2796919345855713, "learning_rate": 7.78959216101695e-06, "loss": 1.6405, "mean_token_accuracy": 0.6688885018229485, "num_tokens": 13430343.0, "step": 16694 }, { "epoch": 4.421610169491525, "grad_norm": 1.9351589679718018, "learning_rate": 7.789327330508476e-06, "loss": 1.0933, "mean_token_accuracy": 0.7531937435269356, "num_tokens": 13431838.0, "step": 16696 }, { "epoch": 4.422139830508475, "grad_norm": 1.7577842473983765, "learning_rate": 7.7890625e-06, "loss": 1.2815, "mean_token_accuracy": 0.6826440207660198, "num_tokens": 13433718.0, "step": 16698 }, { "epoch": 4.422669491525424, "grad_norm": 1.8334933519363403, "learning_rate": 7.788797669491525e-06, "loss": 1.0328, "mean_token_accuracy": 0.7548885643482208, "num_tokens": 13435335.0, "step": 16700 }, { "epoch": 4.423199152542373, "grad_norm": 1.908430576324463, "learning_rate": 7.788532838983052e-06, "loss": 1.3631, "mean_token_accuracy": 0.7118317931890488, "num_tokens": 13437070.0, "step": 16702 }, { "epoch": 4.423728813559322, "grad_norm": 1.9772995710372925, "learning_rate": 7.788268008474577e-06, "loss": 1.5425, "mean_token_accuracy": 0.6887883804738522, "num_tokens": 13438528.0, "step": 16704 }, { "epoch": 4.424258474576272, "grad_norm": 1.8858389854431152, "learning_rate": 7.788003177966102e-06, "loss": 1.2457, "mean_token_accuracy": 0.7392183728516102, "num_tokens": 13440474.0, "step": 16706 }, { "epoch": 4.424788135593221, "grad_norm": 2.3932580947875977, "learning_rate": 7.787738347457627e-06, "loss": 1.4097, "mean_token_accuracy": 0.6693555414676666, "num_tokens": 13441868.0, "step": 16708 }, { "epoch": 4.4253177966101696, "grad_norm": 2.221205472946167, "learning_rate": 7.787473516949153e-06, "loss": 1.245, "mean_token_accuracy": 0.7013322226703167, "num_tokens": 13443424.0, "step": 16710 }, { "epoch": 4.4258474576271185, "grad_norm": 1.8907908201217651, "learning_rate": 7.787208686440678e-06, "loss": 0.9984, "mean_token_accuracy": 0.7662389874458313, "num_tokens": 13445092.0, "step": 16712 }, { "epoch": 4.4263771186440675, "grad_norm": 2.182648181915283, "learning_rate": 7.786943855932205e-06, "loss": 1.0952, "mean_token_accuracy": 0.7679633647203445, "num_tokens": 13446500.0, "step": 16714 }, { "epoch": 4.426906779661017, "grad_norm": 1.9698323011398315, "learning_rate": 7.786679025423728e-06, "loss": 1.3098, "mean_token_accuracy": 0.6987439766526222, "num_tokens": 13448233.0, "step": 16716 }, { "epoch": 4.427436440677966, "grad_norm": 2.219930410385132, "learning_rate": 7.786414194915255e-06, "loss": 1.5436, "mean_token_accuracy": 0.6428078711032867, "num_tokens": 13449900.0, "step": 16718 }, { "epoch": 4.427966101694915, "grad_norm": 1.9314022064208984, "learning_rate": 7.78614936440678e-06, "loss": 1.205, "mean_token_accuracy": 0.6957934312522411, "num_tokens": 13451702.0, "step": 16720 }, { "epoch": 4.428495762711864, "grad_norm": 1.9086486101150513, "learning_rate": 7.785884533898306e-06, "loss": 1.4053, "mean_token_accuracy": 0.6942581683397293, "num_tokens": 13453275.0, "step": 16722 }, { "epoch": 4.429025423728813, "grad_norm": 1.5745859146118164, "learning_rate": 7.785619703389831e-06, "loss": 0.9896, "mean_token_accuracy": 0.7864899635314941, "num_tokens": 13454744.0, "step": 16724 }, { "epoch": 4.429555084745763, "grad_norm": 1.666853904724121, "learning_rate": 7.785354872881356e-06, "loss": 1.0153, "mean_token_accuracy": 0.750797264277935, "num_tokens": 13456303.0, "step": 16726 }, { "epoch": 4.430084745762712, "grad_norm": 1.8024468421936035, "learning_rate": 7.785090042372881e-06, "loss": 0.8914, "mean_token_accuracy": 0.7895042970776558, "num_tokens": 13458172.0, "step": 16728 }, { "epoch": 4.430614406779661, "grad_norm": 2.112192153930664, "learning_rate": 7.784825211864408e-06, "loss": 0.9564, "mean_token_accuracy": 0.7642679587006569, "num_tokens": 13459746.0, "step": 16730 }, { "epoch": 4.43114406779661, "grad_norm": 2.228262424468994, "learning_rate": 7.784560381355933e-06, "loss": 1.6194, "mean_token_accuracy": 0.6453431323170662, "num_tokens": 13461221.0, "step": 16732 }, { "epoch": 4.43167372881356, "grad_norm": 1.385961651802063, "learning_rate": 7.784295550847458e-06, "loss": 0.6789, "mean_token_accuracy": 0.825788602232933, "num_tokens": 13462841.0, "step": 16734 }, { "epoch": 4.432203389830509, "grad_norm": 1.8731955289840698, "learning_rate": 7.784030720338984e-06, "loss": 1.1759, "mean_token_accuracy": 0.7327824756503105, "num_tokens": 13464376.0, "step": 16736 }, { "epoch": 4.432733050847458, "grad_norm": 2.188673734664917, "learning_rate": 7.78376588983051e-06, "loss": 1.3962, "mean_token_accuracy": 0.6821774244308472, "num_tokens": 13465953.0, "step": 16738 }, { "epoch": 4.433262711864407, "grad_norm": 1.8429384231567383, "learning_rate": 7.783501059322036e-06, "loss": 1.2787, "mean_token_accuracy": 0.7101312130689621, "num_tokens": 13467843.0, "step": 16740 }, { "epoch": 4.4337923728813555, "grad_norm": 1.9594005346298218, "learning_rate": 7.783236228813559e-06, "loss": 1.0839, "mean_token_accuracy": 0.7328587770462036, "num_tokens": 13469458.0, "step": 16742 }, { "epoch": 4.434322033898305, "grad_norm": 1.782631278038025, "learning_rate": 7.782971398305086e-06, "loss": 1.458, "mean_token_accuracy": 0.6496266238391399, "num_tokens": 13471349.0, "step": 16744 }, { "epoch": 4.434851694915254, "grad_norm": 2.4163897037506104, "learning_rate": 7.78270656779661e-06, "loss": 1.1465, "mean_token_accuracy": 0.735041543841362, "num_tokens": 13473030.0, "step": 16746 }, { "epoch": 4.435381355932203, "grad_norm": 1.7255326509475708, "learning_rate": 7.782441737288137e-06, "loss": 1.2098, "mean_token_accuracy": 0.7162367179989815, "num_tokens": 13474488.0, "step": 16748 }, { "epoch": 4.435911016949152, "grad_norm": 1.8295562267303467, "learning_rate": 7.782176906779662e-06, "loss": 1.2019, "step": 16750 }, { "epoch": 4.435911016949152, "eval_loss": 1.3111844062805176, "eval_mean_token_accuracy": 0.7015095984974464, "eval_num_tokens": 13476416.0, "eval_runtime": 48.2965, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 16750 }, { "epoch": 4.436440677966102, "grad_norm": 2.0482285022735596, "learning_rate": 7.781912076271187e-06, "loss": 1.2829, "mean_token_accuracy": 0.7069513164460659, "num_tokens": 13478232.0, "step": 16752 }, { "epoch": 4.436970338983051, "grad_norm": 5.578736305236816, "learning_rate": 7.781647245762712e-06, "loss": 1.2674, "mean_token_accuracy": 0.7026563808321953, "num_tokens": 13479601.0, "step": 16754 }, { "epoch": 4.4375, "grad_norm": 1.9391111135482788, "learning_rate": 7.781382415254239e-06, "loss": 1.2904, "mean_token_accuracy": 0.7189532294869423, "num_tokens": 13481258.0, "step": 16756 }, { "epoch": 4.438029661016949, "grad_norm": 1.8894211053848267, "learning_rate": 7.781117584745764e-06, "loss": 1.2512, "mean_token_accuracy": 0.7121936082839966, "num_tokens": 13482848.0, "step": 16758 }, { "epoch": 4.438559322033898, "grad_norm": 1.945747971534729, "learning_rate": 7.780852754237288e-06, "loss": 1.3148, "mean_token_accuracy": 0.6982967853546143, "num_tokens": 13484249.0, "step": 16760 }, { "epoch": 4.439088983050848, "grad_norm": 1.8774888515472412, "learning_rate": 7.780587923728813e-06, "loss": 1.0115, "mean_token_accuracy": 0.7594025731086731, "num_tokens": 13485757.0, "step": 16762 }, { "epoch": 4.439618644067797, "grad_norm": 1.60514235496521, "learning_rate": 7.78032309322034e-06, "loss": 0.9488, "mean_token_accuracy": 0.7740950882434845, "num_tokens": 13487480.0, "step": 16764 }, { "epoch": 4.440148305084746, "grad_norm": 1.704751968383789, "learning_rate": 7.780058262711865e-06, "loss": 1.2118, "mean_token_accuracy": 0.7392344996333122, "num_tokens": 13489056.0, "step": 16766 }, { "epoch": 4.440677966101695, "grad_norm": 1.9808502197265625, "learning_rate": 7.779793432203392e-06, "loss": 1.4926, "mean_token_accuracy": 0.6523484662175179, "num_tokens": 13490940.0, "step": 16768 }, { "epoch": 4.4412076271186445, "grad_norm": 2.1408586502075195, "learning_rate": 7.779528601694915e-06, "loss": 1.3407, "mean_token_accuracy": 0.6989707946777344, "num_tokens": 13492713.0, "step": 16770 }, { "epoch": 4.441737288135593, "grad_norm": 1.8844163417816162, "learning_rate": 7.779263771186441e-06, "loss": 1.1828, "mean_token_accuracy": 0.7286230996251106, "num_tokens": 13494218.0, "step": 16772 }, { "epoch": 4.442266949152542, "grad_norm": 2.199991464614868, "learning_rate": 7.778998940677966e-06, "loss": 1.6128, "mean_token_accuracy": 0.6408644244074821, "num_tokens": 13496006.0, "step": 16774 }, { "epoch": 4.442796610169491, "grad_norm": 2.449263095855713, "learning_rate": 7.778734110169493e-06, "loss": 1.3634, "mean_token_accuracy": 0.6805687732994556, "num_tokens": 13497615.0, "step": 16776 }, { "epoch": 4.44332627118644, "grad_norm": 1.7976762056350708, "learning_rate": 7.778469279661018e-06, "loss": 0.6941, "mean_token_accuracy": 0.8184860199689865, "num_tokens": 13499127.0, "step": 16778 }, { "epoch": 4.44385593220339, "grad_norm": 1.5581910610198975, "learning_rate": 7.778204449152543e-06, "loss": 0.8959, "mean_token_accuracy": 0.7816953286528587, "num_tokens": 13500822.0, "step": 16780 }, { "epoch": 4.444385593220339, "grad_norm": 2.0198445320129395, "learning_rate": 7.777939618644068e-06, "loss": 1.4187, "mean_token_accuracy": 0.6917231976985931, "num_tokens": 13502518.0, "step": 16782 }, { "epoch": 4.444915254237288, "grad_norm": 1.8936277627944946, "learning_rate": 7.777674788135594e-06, "loss": 1.2103, "mean_token_accuracy": 0.7082996591925621, "num_tokens": 13504387.0, "step": 16784 }, { "epoch": 4.445444915254237, "grad_norm": 2.3440444469451904, "learning_rate": 7.77740995762712e-06, "loss": 1.6562, "mean_token_accuracy": 0.6444806754589081, "num_tokens": 13505941.0, "step": 16786 }, { "epoch": 4.445974576271187, "grad_norm": 2.1622509956359863, "learning_rate": 7.777145127118644e-06, "loss": 1.3734, "mean_token_accuracy": 0.6706616505980492, "num_tokens": 13507707.0, "step": 16788 }, { "epoch": 4.446504237288136, "grad_norm": 1.7309538125991821, "learning_rate": 7.77688029661017e-06, "loss": 1.1185, "mean_token_accuracy": 0.6830324456095695, "num_tokens": 13510263.0, "step": 16790 }, { "epoch": 4.447033898305085, "grad_norm": 1.9996320009231567, "learning_rate": 7.776615466101696e-06, "loss": 1.2739, "mean_token_accuracy": 0.685418501496315, "num_tokens": 13512123.0, "step": 16792 }, { "epoch": 4.447563559322034, "grad_norm": 1.822353720664978, "learning_rate": 7.77635063559322e-06, "loss": 1.217, "mean_token_accuracy": 0.7081218436360359, "num_tokens": 13513703.0, "step": 16794 }, { "epoch": 4.448093220338983, "grad_norm": 2.029463768005371, "learning_rate": 7.776085805084746e-06, "loss": 0.951, "mean_token_accuracy": 0.7767767161130905, "num_tokens": 13515027.0, "step": 16796 }, { "epoch": 4.4486228813559325, "grad_norm": 2.187852382659912, "learning_rate": 7.77582097457627e-06, "loss": 1.2478, "mean_token_accuracy": 0.7047205716371536, "num_tokens": 13516514.0, "step": 16798 }, { "epoch": 4.4491525423728815, "grad_norm": 1.7529376745224, "learning_rate": 7.775556144067797e-06, "loss": 0.8294, "mean_token_accuracy": 0.7929656058549881, "num_tokens": 13517927.0, "step": 16800 }, { "epoch": 4.4496822033898304, "grad_norm": 2.038954973220825, "learning_rate": 7.775291313559322e-06, "loss": 1.2891, "mean_token_accuracy": 0.7101314142346382, "num_tokens": 13519325.0, "step": 16802 }, { "epoch": 4.450211864406779, "grad_norm": 2.428492307662964, "learning_rate": 7.775026483050849e-06, "loss": 1.4879, "mean_token_accuracy": 0.6472361907362938, "num_tokens": 13520963.0, "step": 16804 }, { "epoch": 4.450741525423728, "grad_norm": 1.8656359910964966, "learning_rate": 7.774761652542374e-06, "loss": 1.2294, "mean_token_accuracy": 0.7236673310399055, "num_tokens": 13522526.0, "step": 16806 }, { "epoch": 4.451271186440678, "grad_norm": 2.2066264152526855, "learning_rate": 7.774496822033899e-06, "loss": 1.0219, "mean_token_accuracy": 0.7733941860496998, "num_tokens": 13523916.0, "step": 16808 }, { "epoch": 4.451800847457627, "grad_norm": 2.0345306396484375, "learning_rate": 7.774231991525424e-06, "loss": 1.3549, "mean_token_accuracy": 0.6969459056854248, "num_tokens": 13525589.0, "step": 16810 }, { "epoch": 4.452330508474576, "grad_norm": 1.9959867000579834, "learning_rate": 7.77396716101695e-06, "loss": 1.3145, "mean_token_accuracy": 0.6900217980146408, "num_tokens": 13527060.0, "step": 16812 }, { "epoch": 4.452860169491525, "grad_norm": 2.2232649326324463, "learning_rate": 7.773702330508475e-06, "loss": 1.3638, "mean_token_accuracy": 0.6932148784399033, "num_tokens": 13528566.0, "step": 16814 }, { "epoch": 4.453389830508475, "grad_norm": 2.2804689407348633, "learning_rate": 7.7734375e-06, "loss": 1.5707, "mean_token_accuracy": 0.6678904891014099, "num_tokens": 13530273.0, "step": 16816 }, { "epoch": 4.453919491525424, "grad_norm": 2.227916717529297, "learning_rate": 7.773172669491527e-06, "loss": 1.7674, "mean_token_accuracy": 0.6252386048436165, "num_tokens": 13531793.0, "step": 16818 }, { "epoch": 4.454449152542373, "grad_norm": 2.4324395656585693, "learning_rate": 7.772907838983052e-06, "loss": 1.4214, "mean_token_accuracy": 0.6764620840549469, "num_tokens": 13533318.0, "step": 16820 }, { "epoch": 4.454978813559322, "grad_norm": 2.173546075820923, "learning_rate": 7.772643008474578e-06, "loss": 1.2717, "mean_token_accuracy": 0.6973569914698601, "num_tokens": 13534759.0, "step": 16822 }, { "epoch": 4.455508474576272, "grad_norm": 2.304185390472412, "learning_rate": 7.772378177966101e-06, "loss": 1.2283, "mean_token_accuracy": 0.7110195532441139, "num_tokens": 13536237.0, "step": 16824 }, { "epoch": 4.456038135593221, "grad_norm": 2.1396372318267822, "learning_rate": 7.772113347457628e-06, "loss": 1.259, "mean_token_accuracy": 0.7099848091602325, "num_tokens": 13537570.0, "step": 16826 }, { "epoch": 4.4565677966101696, "grad_norm": 1.794911503791809, "learning_rate": 7.771848516949153e-06, "loss": 1.1041, "mean_token_accuracy": 0.7483836561441422, "num_tokens": 13538942.0, "step": 16828 }, { "epoch": 4.4570974576271185, "grad_norm": 1.9675880670547485, "learning_rate": 7.77158368644068e-06, "loss": 1.0974, "mean_token_accuracy": 0.7535910904407501, "num_tokens": 13540339.0, "step": 16830 }, { "epoch": 4.4576271186440675, "grad_norm": 2.2235305309295654, "learning_rate": 7.771318855932205e-06, "loss": 0.9696, "mean_token_accuracy": 0.7748246043920517, "num_tokens": 13541665.0, "step": 16832 }, { "epoch": 4.458156779661017, "grad_norm": 2.0187172889709473, "learning_rate": 7.77105402542373e-06, "loss": 0.9189, "mean_token_accuracy": 0.7671587690711021, "num_tokens": 13543158.0, "step": 16834 }, { "epoch": 4.458686440677966, "grad_norm": 1.7381839752197266, "learning_rate": 7.770789194915254e-06, "loss": 1.0795, "mean_token_accuracy": 0.7446592897176743, "num_tokens": 13544714.0, "step": 16836 }, { "epoch": 4.459216101694915, "grad_norm": 1.8925572633743286, "learning_rate": 7.770524364406781e-06, "loss": 1.259, "mean_token_accuracy": 0.7164412811398506, "num_tokens": 13546338.0, "step": 16838 }, { "epoch": 4.459745762711864, "grad_norm": 2.0753350257873535, "learning_rate": 7.770259533898306e-06, "loss": 1.4057, "mean_token_accuracy": 0.6778437122702599, "num_tokens": 13548299.0, "step": 16840 }, { "epoch": 4.460275423728813, "grad_norm": 2.058227300643921, "learning_rate": 7.769994703389831e-06, "loss": 1.467, "mean_token_accuracy": 0.6660535857081413, "num_tokens": 13550084.0, "step": 16842 }, { "epoch": 4.460805084745763, "grad_norm": 2.1835532188415527, "learning_rate": 7.769729872881356e-06, "loss": 1.219, "mean_token_accuracy": 0.7267214506864548, "num_tokens": 13551535.0, "step": 16844 }, { "epoch": 4.461334745762712, "grad_norm": 1.8256984949111938, "learning_rate": 7.769465042372882e-06, "loss": 0.9648, "mean_token_accuracy": 0.7723667845129967, "num_tokens": 13553292.0, "step": 16846 }, { "epoch": 4.461864406779661, "grad_norm": 1.7691150903701782, "learning_rate": 7.769200211864407e-06, "loss": 1.4287, "mean_token_accuracy": 0.6937436535954475, "num_tokens": 13555277.0, "step": 16848 }, { "epoch": 4.46239406779661, "grad_norm": 1.6531583070755005, "learning_rate": 7.768935381355932e-06, "loss": 1.091, "mean_token_accuracy": 0.7670342847704887, "num_tokens": 13557119.0, "step": 16850 }, { "epoch": 4.46292372881356, "grad_norm": 1.785977840423584, "learning_rate": 7.768670550847457e-06, "loss": 1.3019, "mean_token_accuracy": 0.6998927742242813, "num_tokens": 13558944.0, "step": 16852 }, { "epoch": 4.463453389830509, "grad_norm": 2.0578508377075195, "learning_rate": 7.768405720338984e-06, "loss": 1.3539, "mean_token_accuracy": 0.6902664229273796, "num_tokens": 13560467.0, "step": 16854 }, { "epoch": 4.463983050847458, "grad_norm": 1.893239140510559, "learning_rate": 7.768140889830509e-06, "loss": 1.0739, "mean_token_accuracy": 0.7532191425561905, "num_tokens": 13561831.0, "step": 16856 }, { "epoch": 4.464512711864407, "grad_norm": 2.19899845123291, "learning_rate": 7.767876059322035e-06, "loss": 1.4504, "mean_token_accuracy": 0.6858475506305695, "num_tokens": 13563315.0, "step": 16858 }, { "epoch": 4.4650423728813555, "grad_norm": 1.953305721282959, "learning_rate": 7.76761122881356e-06, "loss": 1.1366, "mean_token_accuracy": 0.7360395565629005, "num_tokens": 13565036.0, "step": 16860 }, { "epoch": 4.465572033898305, "grad_norm": 1.9907827377319336, "learning_rate": 7.767346398305085e-06, "loss": 1.0391, "mean_token_accuracy": 0.744004912674427, "num_tokens": 13566370.0, "step": 16862 }, { "epoch": 4.466101694915254, "grad_norm": 2.348896026611328, "learning_rate": 7.76708156779661e-06, "loss": 1.5293, "mean_token_accuracy": 0.706131212413311, "num_tokens": 13567856.0, "step": 16864 }, { "epoch": 4.466631355932203, "grad_norm": 2.3210344314575195, "learning_rate": 7.766816737288137e-06, "loss": 1.3793, "mean_token_accuracy": 0.6940410137176514, "num_tokens": 13569345.0, "step": 16866 }, { "epoch": 4.467161016949152, "grad_norm": 2.14192271232605, "learning_rate": 7.766551906779662e-06, "loss": 1.0883, "mean_token_accuracy": 0.7389285191893578, "num_tokens": 13570758.0, "step": 16868 }, { "epoch": 4.467690677966102, "grad_norm": 2.3792314529418945, "learning_rate": 7.766287076271187e-06, "loss": 1.2884, "mean_token_accuracy": 0.7077566236257553, "num_tokens": 13572234.0, "step": 16870 }, { "epoch": 4.468220338983051, "grad_norm": 2.291111469268799, "learning_rate": 7.766022245762712e-06, "loss": 1.2892, "mean_token_accuracy": 0.693433590233326, "num_tokens": 13573822.0, "step": 16872 }, { "epoch": 4.46875, "grad_norm": 2.0109472274780273, "learning_rate": 7.765757415254238e-06, "loss": 1.2972, "mean_token_accuracy": 0.6828733533620834, "num_tokens": 13575592.0, "step": 16874 }, { "epoch": 4.469279661016949, "grad_norm": 2.1213645935058594, "learning_rate": 7.765492584745763e-06, "loss": 1.1268, "mean_token_accuracy": 0.7538440898060799, "num_tokens": 13577190.0, "step": 16876 }, { "epoch": 4.469809322033898, "grad_norm": 1.9614206552505493, "learning_rate": 7.765227754237288e-06, "loss": 0.9134, "mean_token_accuracy": 0.7647038474678993, "num_tokens": 13578705.0, "step": 16878 }, { "epoch": 4.470338983050848, "grad_norm": 1.8363960981369019, "learning_rate": 7.764962923728813e-06, "loss": 0.9131, "mean_token_accuracy": 0.7665248736739159, "num_tokens": 13580517.0, "step": 16880 }, { "epoch": 4.470868644067797, "grad_norm": 2.2340128421783447, "learning_rate": 7.76469809322034e-06, "loss": 1.3699, "mean_token_accuracy": 0.6918025836348534, "num_tokens": 13582034.0, "step": 16882 }, { "epoch": 4.471398305084746, "grad_norm": 1.3748093843460083, "learning_rate": 7.764433262711865e-06, "loss": 0.8865, "mean_token_accuracy": 0.784464918076992, "num_tokens": 13583910.0, "step": 16884 }, { "epoch": 4.471927966101695, "grad_norm": 2.1007232666015625, "learning_rate": 7.764168432203391e-06, "loss": 1.2237, "mean_token_accuracy": 0.7287990152835846, "num_tokens": 13585137.0, "step": 16886 }, { "epoch": 4.4724576271186445, "grad_norm": 1.6210718154907227, "learning_rate": 7.763903601694916e-06, "loss": 0.7938, "mean_token_accuracy": 0.7975753545761108, "num_tokens": 13586954.0, "step": 16888 }, { "epoch": 4.472987288135593, "grad_norm": 2.352459192276001, "learning_rate": 7.763638771186441e-06, "loss": 1.3685, "mean_token_accuracy": 0.6784926056861877, "num_tokens": 13588523.0, "step": 16890 }, { "epoch": 4.473516949152542, "grad_norm": 2.1269307136535645, "learning_rate": 7.763373940677966e-06, "loss": 1.4087, "mean_token_accuracy": 0.6902114897966385, "num_tokens": 13590097.0, "step": 16892 }, { "epoch": 4.474046610169491, "grad_norm": 2.4062376022338867, "learning_rate": 7.763109110169493e-06, "loss": 1.085, "mean_token_accuracy": 0.7468733936548233, "num_tokens": 13591491.0, "step": 16894 }, { "epoch": 4.47457627118644, "grad_norm": 2.068068504333496, "learning_rate": 7.762844279661018e-06, "loss": 1.5229, "mean_token_accuracy": 0.6622690930962563, "num_tokens": 13593079.0, "step": 16896 }, { "epoch": 4.47510593220339, "grad_norm": 2.2041451930999756, "learning_rate": 7.762579449152542e-06, "loss": 1.5694, "mean_token_accuracy": 0.6657143384218216, "num_tokens": 13594658.0, "step": 16898 }, { "epoch": 4.475635593220339, "grad_norm": 1.648669719696045, "learning_rate": 7.762314618644067e-06, "loss": 1.0203, "mean_token_accuracy": 0.7599519044160843, "num_tokens": 13596431.0, "step": 16900 }, { "epoch": 4.476165254237288, "grad_norm": 2.0111453533172607, "learning_rate": 7.762049788135594e-06, "loss": 1.043, "mean_token_accuracy": 0.7493611797690392, "num_tokens": 13597893.0, "step": 16902 }, { "epoch": 4.476694915254237, "grad_norm": 2.080193519592285, "learning_rate": 7.761784957627119e-06, "loss": 1.0825, "mean_token_accuracy": 0.7623676359653473, "num_tokens": 13599416.0, "step": 16904 }, { "epoch": 4.477224576271187, "grad_norm": 2.0970773696899414, "learning_rate": 7.761520127118644e-06, "loss": 1.6247, "mean_token_accuracy": 0.6354840658605099, "num_tokens": 13601189.0, "step": 16906 }, { "epoch": 4.477754237288136, "grad_norm": 2.2096545696258545, "learning_rate": 7.76125529661017e-06, "loss": 1.2365, "mean_token_accuracy": 0.6939677596092224, "num_tokens": 13602691.0, "step": 16908 }, { "epoch": 4.478283898305085, "grad_norm": 2.1737406253814697, "learning_rate": 7.760990466101695e-06, "loss": 1.3035, "mean_token_accuracy": 0.7069188505411148, "num_tokens": 13604026.0, "step": 16910 }, { "epoch": 4.478813559322034, "grad_norm": 2.175842761993408, "learning_rate": 7.760725635593222e-06, "loss": 1.1759, "mean_token_accuracy": 0.7129122987389565, "num_tokens": 13605494.0, "step": 16912 }, { "epoch": 4.479343220338983, "grad_norm": 2.1184732913970947, "learning_rate": 7.760460805084747e-06, "loss": 1.1974, "mean_token_accuracy": 0.7161248177289963, "num_tokens": 13607119.0, "step": 16914 }, { "epoch": 4.4798728813559325, "grad_norm": 2.235856056213379, "learning_rate": 7.760195974576272e-06, "loss": 1.182, "mean_token_accuracy": 0.7096079662442207, "num_tokens": 13608726.0, "step": 16916 }, { "epoch": 4.4804025423728815, "grad_norm": 2.077765941619873, "learning_rate": 7.759931144067797e-06, "loss": 1.208, "mean_token_accuracy": 0.7346504852175713, "num_tokens": 13610111.0, "step": 16918 }, { "epoch": 4.4809322033898304, "grad_norm": 2.4039669036865234, "learning_rate": 7.759666313559323e-06, "loss": 1.2978, "mean_token_accuracy": 0.6948867589235306, "num_tokens": 13611580.0, "step": 16920 }, { "epoch": 4.481461864406779, "grad_norm": 2.016465425491333, "learning_rate": 7.759401483050848e-06, "loss": 1.3344, "mean_token_accuracy": 0.7057878598570824, "num_tokens": 13612937.0, "step": 16922 }, { "epoch": 4.481991525423728, "grad_norm": 2.079332113265991, "learning_rate": 7.759136652542373e-06, "loss": 1.4482, "mean_token_accuracy": 0.6726158782839775, "num_tokens": 13614510.0, "step": 16924 }, { "epoch": 4.482521186440678, "grad_norm": 2.274947166442871, "learning_rate": 7.758871822033898e-06, "loss": 1.414, "mean_token_accuracy": 0.6672124564647675, "num_tokens": 13616151.0, "step": 16926 }, { "epoch": 4.483050847457627, "grad_norm": 2.2820584774017334, "learning_rate": 7.758606991525425e-06, "loss": 1.2317, "mean_token_accuracy": 0.7083241865038872, "num_tokens": 13617754.0, "step": 16928 }, { "epoch": 4.483580508474576, "grad_norm": 1.6256858110427856, "learning_rate": 7.75834216101695e-06, "loss": 1.3811, "mean_token_accuracy": 0.6814987771213055, "num_tokens": 13619565.0, "step": 16930 }, { "epoch": 4.484110169491525, "grad_norm": 2.1812355518341064, "learning_rate": 7.758077330508475e-06, "loss": 1.1429, "mean_token_accuracy": 0.7243823334574699, "num_tokens": 13621213.0, "step": 16932 }, { "epoch": 4.484639830508475, "grad_norm": 2.670917272567749, "learning_rate": 7.7578125e-06, "loss": 1.4822, "mean_token_accuracy": 0.6818323954939842, "num_tokens": 13622524.0, "step": 16934 }, { "epoch": 4.485169491525424, "grad_norm": 1.6153271198272705, "learning_rate": 7.757547669491526e-06, "loss": 0.9827, "mean_token_accuracy": 0.7716715037822723, "num_tokens": 13624274.0, "step": 16936 }, { "epoch": 4.485699152542373, "grad_norm": 2.2549612522125244, "learning_rate": 7.757282838983051e-06, "loss": 1.3774, "mean_token_accuracy": 0.6813346594572067, "num_tokens": 13625834.0, "step": 16938 }, { "epoch": 4.486228813559322, "grad_norm": 1.5970630645751953, "learning_rate": 7.757018008474578e-06, "loss": 0.7608, "mean_token_accuracy": 0.795474112033844, "num_tokens": 13627847.0, "step": 16940 }, { "epoch": 4.486758474576272, "grad_norm": 2.094426393508911, "learning_rate": 7.756753177966103e-06, "loss": 1.3262, "mean_token_accuracy": 0.7069802805781364, "num_tokens": 13629389.0, "step": 16942 }, { "epoch": 4.487288135593221, "grad_norm": 2.5579891204833984, "learning_rate": 7.756488347457628e-06, "loss": 1.2981, "mean_token_accuracy": 0.7143615782260895, "num_tokens": 13631042.0, "step": 16944 }, { "epoch": 4.4878177966101696, "grad_norm": 2.055856466293335, "learning_rate": 7.756223516949153e-06, "loss": 1.3601, "mean_token_accuracy": 0.6901553645730019, "num_tokens": 13632580.0, "step": 16946 }, { "epoch": 4.4883474576271185, "grad_norm": 1.8460392951965332, "learning_rate": 7.75595868644068e-06, "loss": 1.5785, "mean_token_accuracy": 0.6777089089155197, "num_tokens": 13634047.0, "step": 16948 }, { "epoch": 4.4888771186440675, "grad_norm": 1.7566956281661987, "learning_rate": 7.755693855932204e-06, "loss": 1.1588, "mean_token_accuracy": 0.7162242084741592, "num_tokens": 13635847.0, "step": 16950 }, { "epoch": 4.489406779661017, "grad_norm": 1.4736453294754028, "learning_rate": 7.755429025423729e-06, "loss": 1.0818, "mean_token_accuracy": 0.7309005260467529, "num_tokens": 13637306.0, "step": 16952 }, { "epoch": 4.489936440677966, "grad_norm": 1.988791823387146, "learning_rate": 7.755164194915254e-06, "loss": 1.3688, "mean_token_accuracy": 0.6786766201257706, "num_tokens": 13638917.0, "step": 16954 }, { "epoch": 4.490466101694915, "grad_norm": 2.2566542625427246, "learning_rate": 7.75489936440678e-06, "loss": 1.4825, "mean_token_accuracy": 0.6965561360120773, "num_tokens": 13640500.0, "step": 16956 }, { "epoch": 4.490995762711864, "grad_norm": 1.887439489364624, "learning_rate": 7.754634533898306e-06, "loss": 1.2206, "mean_token_accuracy": 0.7145659253001213, "num_tokens": 13642155.0, "step": 16958 }, { "epoch": 4.491525423728813, "grad_norm": 1.3689138889312744, "learning_rate": 7.75436970338983e-06, "loss": 0.8748, "mean_token_accuracy": 0.7783660888671875, "num_tokens": 13643840.0, "step": 16960 }, { "epoch": 4.492055084745763, "grad_norm": 2.149855375289917, "learning_rate": 7.754104872881355e-06, "loss": 1.7697, "mean_token_accuracy": 0.6019943132996559, "num_tokens": 13645451.0, "step": 16962 }, { "epoch": 4.492584745762712, "grad_norm": 1.8585257530212402, "learning_rate": 7.753840042372882e-06, "loss": 1.6598, "mean_token_accuracy": 0.6388176754117012, "num_tokens": 13647157.0, "step": 16964 }, { "epoch": 4.493114406779661, "grad_norm": 2.218543767929077, "learning_rate": 7.753575211864407e-06, "loss": 1.2257, "mean_token_accuracy": 0.7069820165634155, "num_tokens": 13648753.0, "step": 16966 }, { "epoch": 4.49364406779661, "grad_norm": 2.034590721130371, "learning_rate": 7.753310381355934e-06, "loss": 1.5728, "mean_token_accuracy": 0.659363154321909, "num_tokens": 13650528.0, "step": 16968 }, { "epoch": 4.49417372881356, "grad_norm": 1.7163869142532349, "learning_rate": 7.753045550847459e-06, "loss": 0.9667, "mean_token_accuracy": 0.7581044510006905, "num_tokens": 13651965.0, "step": 16970 }, { "epoch": 4.494703389830509, "grad_norm": 2.1761837005615234, "learning_rate": 7.752780720338983e-06, "loss": 1.132, "mean_token_accuracy": 0.7272340431809425, "num_tokens": 13653334.0, "step": 16972 }, { "epoch": 4.495233050847458, "grad_norm": 1.9076261520385742, "learning_rate": 7.752515889830508e-06, "loss": 0.9054, "mean_token_accuracy": 0.7743149325251579, "num_tokens": 13654776.0, "step": 16974 }, { "epoch": 4.495762711864407, "grad_norm": 2.2089755535125732, "learning_rate": 7.752251059322035e-06, "loss": 1.6775, "mean_token_accuracy": 0.6550085321068764, "num_tokens": 13656375.0, "step": 16976 }, { "epoch": 4.4962923728813555, "grad_norm": 1.7190135717391968, "learning_rate": 7.75198622881356e-06, "loss": 0.9576, "mean_token_accuracy": 0.745451495051384, "num_tokens": 13658110.0, "step": 16978 }, { "epoch": 4.496822033898305, "grad_norm": 2.0629959106445312, "learning_rate": 7.751721398305085e-06, "loss": 1.3161, "mean_token_accuracy": 0.7098546475172043, "num_tokens": 13659885.0, "step": 16980 }, { "epoch": 4.497351694915254, "grad_norm": 2.507681369781494, "learning_rate": 7.75145656779661e-06, "loss": 1.3124, "mean_token_accuracy": 0.7056190744042397, "num_tokens": 13661233.0, "step": 16982 }, { "epoch": 4.497881355932203, "grad_norm": 1.9065961837768555, "learning_rate": 7.751191737288136e-06, "loss": 0.9621, "mean_token_accuracy": 0.7558972164988518, "num_tokens": 13662775.0, "step": 16984 }, { "epoch": 4.498411016949152, "grad_norm": 1.9834706783294678, "learning_rate": 7.750926906779661e-06, "loss": 0.9156, "mean_token_accuracy": 0.7617192193865776, "num_tokens": 13664445.0, "step": 16986 }, { "epoch": 4.498940677966102, "grad_norm": 2.1346447467803955, "learning_rate": 7.750662076271186e-06, "loss": 1.3684, "mean_token_accuracy": 0.7024679630994797, "num_tokens": 13666265.0, "step": 16988 }, { "epoch": 4.499470338983051, "grad_norm": 1.8746401071548462, "learning_rate": 7.750397245762713e-06, "loss": 0.9854, "mean_token_accuracy": 0.7631069198250771, "num_tokens": 13667757.0, "step": 16990 }, { "epoch": 4.5, "grad_norm": 2.6713945865631104, "learning_rate": 7.750132415254238e-06, "loss": 1.468, "mean_token_accuracy": 0.6656271815299988, "num_tokens": 13669221.0, "step": 16992 }, { "epoch": 4.500529661016949, "grad_norm": 2.345111608505249, "learning_rate": 7.749867584745764e-06, "loss": 1.5636, "mean_token_accuracy": 0.6624579876661301, "num_tokens": 13670682.0, "step": 16994 }, { "epoch": 4.501059322033898, "grad_norm": 1.9312514066696167, "learning_rate": 7.74960275423729e-06, "loss": 1.4202, "mean_token_accuracy": 0.669117771089077, "num_tokens": 13672342.0, "step": 16996 }, { "epoch": 4.501588983050848, "grad_norm": 2.4062845706939697, "learning_rate": 7.749337923728814e-06, "loss": 1.0096, "mean_token_accuracy": 0.7400438115000725, "num_tokens": 13673827.0, "step": 16998 }, { "epoch": 4.502118644067797, "grad_norm": 1.455326795578003, "learning_rate": 7.74907309322034e-06, "loss": 1.0124, "step": 17000 }, { "epoch": 4.502118644067797, "eval_loss": 1.312840223312378, "eval_mean_token_accuracy": 0.700804006163176, "eval_num_tokens": 13675499.0, "eval_runtime": 48.2561, "eval_samples_per_second": 6.383, "eval_steps_per_second": 6.383, "step": 17000 }, { "epoch": 4.502648305084746, "grad_norm": 2.0746495723724365, "learning_rate": 7.748808262711866e-06, "loss": 1.1882, "mean_token_accuracy": 0.7375262379646301, "num_tokens": 13677172.0, "step": 17002 }, { "epoch": 4.503177966101695, "grad_norm": 2.405921459197998, "learning_rate": 7.74854343220339e-06, "loss": 1.3944, "mean_token_accuracy": 0.7191842570900917, "num_tokens": 13678592.0, "step": 17004 }, { "epoch": 4.503707627118644, "grad_norm": 2.28694486618042, "learning_rate": 7.748278601694916e-06, "loss": 1.1878, "mean_token_accuracy": 0.7272952273488045, "num_tokens": 13680120.0, "step": 17006 }, { "epoch": 4.504237288135593, "grad_norm": 2.0216403007507324, "learning_rate": 7.74801377118644e-06, "loss": 1.6548, "mean_token_accuracy": 0.6175948902964592, "num_tokens": 13681852.0, "step": 17008 }, { "epoch": 4.504766949152542, "grad_norm": 1.5876189470291138, "learning_rate": 7.747748940677967e-06, "loss": 1.2108, "mean_token_accuracy": 0.7113446816802025, "num_tokens": 13683836.0, "step": 17010 }, { "epoch": 4.505296610169491, "grad_norm": 2.512082576751709, "learning_rate": 7.747484110169492e-06, "loss": 1.1452, "mean_token_accuracy": 0.7355258390307426, "num_tokens": 13685221.0, "step": 17012 }, { "epoch": 4.50582627118644, "grad_norm": 1.8304951190948486, "learning_rate": 7.747219279661017e-06, "loss": 0.803, "mean_token_accuracy": 0.7866894975304604, "num_tokens": 13686730.0, "step": 17014 }, { "epoch": 4.50635593220339, "grad_norm": 1.8283641338348389, "learning_rate": 7.746954449152542e-06, "loss": 1.1697, "mean_token_accuracy": 0.7158475965261459, "num_tokens": 13688232.0, "step": 17016 }, { "epoch": 4.506885593220339, "grad_norm": 2.290553092956543, "learning_rate": 7.746689618644069e-06, "loss": 1.4159, "mean_token_accuracy": 0.6744732856750488, "num_tokens": 13689702.0, "step": 17018 }, { "epoch": 4.507415254237288, "grad_norm": 2.0318784713745117, "learning_rate": 7.746424788135594e-06, "loss": 0.9935, "mean_token_accuracy": 0.7677080854773521, "num_tokens": 13691276.0, "step": 17020 }, { "epoch": 4.507944915254237, "grad_norm": 1.7004573345184326, "learning_rate": 7.74615995762712e-06, "loss": 1.153, "mean_token_accuracy": 0.7158637717366219, "num_tokens": 13692875.0, "step": 17022 }, { "epoch": 4.508474576271187, "grad_norm": 2.382861375808716, "learning_rate": 7.745895127118645e-06, "loss": 1.3182, "mean_token_accuracy": 0.7115743532776833, "num_tokens": 13694312.0, "step": 17024 }, { "epoch": 4.509004237288136, "grad_norm": 2.581878900527954, "learning_rate": 7.74563029661017e-06, "loss": 0.9925, "mean_token_accuracy": 0.7548309117555618, "num_tokens": 13695881.0, "step": 17026 }, { "epoch": 4.509533898305085, "grad_norm": 1.820070743560791, "learning_rate": 7.745365466101695e-06, "loss": 1.1636, "mean_token_accuracy": 0.7184578999876976, "num_tokens": 13697522.0, "step": 17028 }, { "epoch": 4.510063559322034, "grad_norm": 2.6134278774261475, "learning_rate": 7.745100635593222e-06, "loss": 1.1444, "mean_token_accuracy": 0.7293220907449722, "num_tokens": 13698839.0, "step": 17030 }, { "epoch": 4.510593220338983, "grad_norm": 2.051151990890503, "learning_rate": 7.744835805084747e-06, "loss": 1.3987, "mean_token_accuracy": 0.6632097437977791, "num_tokens": 13700559.0, "step": 17032 }, { "epoch": 4.5111228813559325, "grad_norm": 2.062453508377075, "learning_rate": 7.744570974576272e-06, "loss": 1.2973, "mean_token_accuracy": 0.7119703181087971, "num_tokens": 13702124.0, "step": 17034 }, { "epoch": 4.5116525423728815, "grad_norm": 2.1149919033050537, "learning_rate": 7.744306144067796e-06, "loss": 1.1131, "mean_token_accuracy": 0.7391120418906212, "num_tokens": 13703676.0, "step": 17036 }, { "epoch": 4.5121822033898304, "grad_norm": 1.7937067747116089, "learning_rate": 7.744041313559323e-06, "loss": 1.1409, "mean_token_accuracy": 0.7414699867367744, "num_tokens": 13705335.0, "step": 17038 }, { "epoch": 4.512711864406779, "grad_norm": 2.1714653968811035, "learning_rate": 7.743776483050848e-06, "loss": 1.4407, "mean_token_accuracy": 0.6666285321116447, "num_tokens": 13706900.0, "step": 17040 }, { "epoch": 4.513241525423728, "grad_norm": 2.4476816654205322, "learning_rate": 7.743511652542373e-06, "loss": 1.261, "mean_token_accuracy": 0.700676292181015, "num_tokens": 13708818.0, "step": 17042 }, { "epoch": 4.513771186440678, "grad_norm": 1.8221893310546875, "learning_rate": 7.743246822033898e-06, "loss": 0.9189, "mean_token_accuracy": 0.7835552245378494, "num_tokens": 13710104.0, "step": 17044 }, { "epoch": 4.514300847457627, "grad_norm": 1.7618714570999146, "learning_rate": 7.742981991525425e-06, "loss": 0.8899, "mean_token_accuracy": 0.7691029012203217, "num_tokens": 13711802.0, "step": 17046 }, { "epoch": 4.514830508474576, "grad_norm": 1.992408275604248, "learning_rate": 7.74271716101695e-06, "loss": 1.1243, "mean_token_accuracy": 0.7474936544895172, "num_tokens": 13713346.0, "step": 17048 }, { "epoch": 4.515360169491525, "grad_norm": 1.6997575759887695, "learning_rate": 7.742452330508476e-06, "loss": 1.4498, "mean_token_accuracy": 0.6706904768943787, "num_tokens": 13714850.0, "step": 17050 }, { "epoch": 4.515889830508475, "grad_norm": 2.2123327255249023, "learning_rate": 7.742187500000001e-06, "loss": 1.2687, "mean_token_accuracy": 0.7230985462665558, "num_tokens": 13716608.0, "step": 17052 }, { "epoch": 4.516419491525424, "grad_norm": 1.9435796737670898, "learning_rate": 7.741922669491526e-06, "loss": 1.1643, "mean_token_accuracy": 0.7631499022245407, "num_tokens": 13718282.0, "step": 17054 }, { "epoch": 4.516949152542373, "grad_norm": 2.0010318756103516, "learning_rate": 7.741657838983051e-06, "loss": 1.087, "mean_token_accuracy": 0.7232454121112823, "num_tokens": 13719979.0, "step": 17056 }, { "epoch": 4.517478813559322, "grad_norm": 2.3376426696777344, "learning_rate": 7.741393008474577e-06, "loss": 1.4729, "mean_token_accuracy": 0.6537570431828499, "num_tokens": 13721666.0, "step": 17058 }, { "epoch": 4.518008474576272, "grad_norm": 2.172016143798828, "learning_rate": 7.741128177966102e-06, "loss": 1.3222, "mean_token_accuracy": 0.6851529143750668, "num_tokens": 13723366.0, "step": 17060 }, { "epoch": 4.518538135593221, "grad_norm": 1.695336103439331, "learning_rate": 7.740863347457627e-06, "loss": 0.7049, "mean_token_accuracy": 0.8101237863302231, "num_tokens": 13724977.0, "step": 17062 }, { "epoch": 4.5190677966101696, "grad_norm": 1.9940102100372314, "learning_rate": 7.740598516949152e-06, "loss": 0.7928, "mean_token_accuracy": 0.7980623245239258, "num_tokens": 13726527.0, "step": 17064 }, { "epoch": 4.5195974576271185, "grad_norm": 1.7963701486587524, "learning_rate": 7.740333686440679e-06, "loss": 1.1342, "mean_token_accuracy": 0.7216291204094887, "num_tokens": 13728161.0, "step": 17066 }, { "epoch": 4.5201271186440675, "grad_norm": 1.7376266717910767, "learning_rate": 7.740068855932204e-06, "loss": 1.2453, "mean_token_accuracy": 0.7267116978764534, "num_tokens": 13729782.0, "step": 17068 }, { "epoch": 4.520656779661017, "grad_norm": 2.3191778659820557, "learning_rate": 7.739804025423729e-06, "loss": 1.5125, "mean_token_accuracy": 0.6795322075486183, "num_tokens": 13731283.0, "step": 17070 }, { "epoch": 4.521186440677966, "grad_norm": 2.3642172813415527, "learning_rate": 7.739539194915255e-06, "loss": 1.2975, "mean_token_accuracy": 0.6934971809387207, "num_tokens": 13732964.0, "step": 17072 }, { "epoch": 4.521716101694915, "grad_norm": 1.5165565013885498, "learning_rate": 7.73927436440678e-06, "loss": 0.6324, "mean_token_accuracy": 0.8427828326821327, "num_tokens": 13734440.0, "step": 17074 }, { "epoch": 4.522245762711864, "grad_norm": 1.8996258974075317, "learning_rate": 7.739009533898307e-06, "loss": 1.1892, "mean_token_accuracy": 0.7100182473659515, "num_tokens": 13736122.0, "step": 17076 }, { "epoch": 4.522775423728813, "grad_norm": 2.182694435119629, "learning_rate": 7.738744703389832e-06, "loss": 1.3381, "mean_token_accuracy": 0.6965287029743195, "num_tokens": 13737645.0, "step": 17078 }, { "epoch": 4.523305084745763, "grad_norm": 3.030076503753662, "learning_rate": 7.738479872881357e-06, "loss": 1.5204, "mean_token_accuracy": 0.6728099808096886, "num_tokens": 13738955.0, "step": 17080 }, { "epoch": 4.523834745762712, "grad_norm": 2.288515567779541, "learning_rate": 7.738215042372882e-06, "loss": 1.1959, "mean_token_accuracy": 0.7196374759078026, "num_tokens": 13740433.0, "step": 17082 }, { "epoch": 4.524364406779661, "grad_norm": 2.2174649238586426, "learning_rate": 7.737950211864408e-06, "loss": 1.3581, "mean_token_accuracy": 0.6870243698358536, "num_tokens": 13741987.0, "step": 17084 }, { "epoch": 4.52489406779661, "grad_norm": 2.140695571899414, "learning_rate": 7.737685381355933e-06, "loss": 1.6109, "mean_token_accuracy": 0.6428023800253868, "num_tokens": 13743789.0, "step": 17086 }, { "epoch": 4.52542372881356, "grad_norm": 2.1016223430633545, "learning_rate": 7.737420550847458e-06, "loss": 1.4098, "mean_token_accuracy": 0.6850830093026161, "num_tokens": 13745755.0, "step": 17088 }, { "epoch": 4.525953389830509, "grad_norm": 1.9941855669021606, "learning_rate": 7.737155720338983e-06, "loss": 1.3299, "mean_token_accuracy": 0.699819765985012, "num_tokens": 13748245.0, "step": 17090 }, { "epoch": 4.526483050847458, "grad_norm": 1.821205973625183, "learning_rate": 7.73689088983051e-06, "loss": 1.0017, "mean_token_accuracy": 0.7615417838096619, "num_tokens": 13749799.0, "step": 17092 }, { "epoch": 4.527012711864407, "grad_norm": 1.9249768257141113, "learning_rate": 7.736626059322035e-06, "loss": 1.5105, "mean_token_accuracy": 0.6697528511285782, "num_tokens": 13751413.0, "step": 17094 }, { "epoch": 4.527542372881356, "grad_norm": 2.064195394515991, "learning_rate": 7.73636122881356e-06, "loss": 1.1827, "mean_token_accuracy": 0.7325239032506943, "num_tokens": 13752927.0, "step": 17096 }, { "epoch": 4.528072033898305, "grad_norm": 2.1427161693573, "learning_rate": 7.736096398305085e-06, "loss": 1.5359, "mean_token_accuracy": 0.6389504596590996, "num_tokens": 13754747.0, "step": 17098 }, { "epoch": 4.528601694915254, "grad_norm": 1.4683951139450073, "learning_rate": 7.735831567796611e-06, "loss": 0.9953, "mean_token_accuracy": 0.7553291022777557, "num_tokens": 13756510.0, "step": 17100 }, { "epoch": 4.529131355932203, "grad_norm": 1.9296324253082275, "learning_rate": 7.735566737288136e-06, "loss": 0.9489, "mean_token_accuracy": 0.7494922056794167, "num_tokens": 13757845.0, "step": 17102 }, { "epoch": 4.529661016949152, "grad_norm": 2.005213737487793, "learning_rate": 7.735301906779663e-06, "loss": 1.4979, "mean_token_accuracy": 0.6534536778926849, "num_tokens": 13759401.0, "step": 17104 }, { "epoch": 4.530190677966102, "grad_norm": 2.2043728828430176, "learning_rate": 7.735037076271188e-06, "loss": 1.4947, "mean_token_accuracy": 0.6414967551827431, "num_tokens": 13761078.0, "step": 17106 }, { "epoch": 4.530720338983051, "grad_norm": 2.125669240951538, "learning_rate": 7.734772245762713e-06, "loss": 1.2137, "mean_token_accuracy": 0.7132459729909897, "num_tokens": 13762667.0, "step": 17108 }, { "epoch": 4.53125, "grad_norm": 2.0328903198242188, "learning_rate": 7.734507415254237e-06, "loss": 0.8906, "mean_token_accuracy": 0.7617114335298538, "num_tokens": 13764166.0, "step": 17110 }, { "epoch": 4.531779661016949, "grad_norm": 2.382103681564331, "learning_rate": 7.734242584745764e-06, "loss": 1.7915, "mean_token_accuracy": 0.619282953441143, "num_tokens": 13765613.0, "step": 17112 }, { "epoch": 4.532309322033898, "grad_norm": 2.1388347148895264, "learning_rate": 7.733977754237289e-06, "loss": 1.5004, "mean_token_accuracy": 0.6849503368139267, "num_tokens": 13767138.0, "step": 17114 }, { "epoch": 4.532838983050848, "grad_norm": 2.336620330810547, "learning_rate": 7.733712923728814e-06, "loss": 1.5031, "mean_token_accuracy": 0.6673547700047493, "num_tokens": 13768894.0, "step": 17116 }, { "epoch": 4.533368644067797, "grad_norm": 2.2540299892425537, "learning_rate": 7.733448093220339e-06, "loss": 1.4954, "mean_token_accuracy": 0.6570976786315441, "num_tokens": 13770506.0, "step": 17118 }, { "epoch": 4.533898305084746, "grad_norm": 2.012415647506714, "learning_rate": 7.733183262711866e-06, "loss": 1.5549, "mean_token_accuracy": 0.6432496085762978, "num_tokens": 13772239.0, "step": 17120 }, { "epoch": 4.534427966101695, "grad_norm": 2.132826089859009, "learning_rate": 7.73291843220339e-06, "loss": 1.6482, "mean_token_accuracy": 0.647962860763073, "num_tokens": 13773882.0, "step": 17122 }, { "epoch": 4.534957627118644, "grad_norm": 1.940346598625183, "learning_rate": 7.732653601694915e-06, "loss": 1.1264, "mean_token_accuracy": 0.7267385348677635, "num_tokens": 13775732.0, "step": 17124 }, { "epoch": 4.535487288135593, "grad_norm": 1.5913975238800049, "learning_rate": 7.73238877118644e-06, "loss": 1.1922, "mean_token_accuracy": 0.7187716439366341, "num_tokens": 13777416.0, "step": 17126 }, { "epoch": 4.536016949152542, "grad_norm": 2.1528921127319336, "learning_rate": 7.732123940677967e-06, "loss": 0.9378, "mean_token_accuracy": 0.7551415786147118, "num_tokens": 13778856.0, "step": 17128 }, { "epoch": 4.536546610169491, "grad_norm": 1.7795023918151855, "learning_rate": 7.731859110169492e-06, "loss": 1.2158, "mean_token_accuracy": 0.7009658813476562, "num_tokens": 13780772.0, "step": 17130 }, { "epoch": 4.53707627118644, "grad_norm": 1.791176199913025, "learning_rate": 7.731594279661018e-06, "loss": 1.2315, "mean_token_accuracy": 0.7024255953729153, "num_tokens": 13782573.0, "step": 17132 }, { "epoch": 4.53760593220339, "grad_norm": 1.813767671585083, "learning_rate": 7.731329449152543e-06, "loss": 1.073, "mean_token_accuracy": 0.7428345009684563, "num_tokens": 13784246.0, "step": 17134 }, { "epoch": 4.538135593220339, "grad_norm": 1.888228178024292, "learning_rate": 7.731064618644068e-06, "loss": 1.0872, "mean_token_accuracy": 0.7318376377224922, "num_tokens": 13785805.0, "step": 17136 }, { "epoch": 4.538665254237288, "grad_norm": 1.8401966094970703, "learning_rate": 7.730799788135593e-06, "loss": 1.363, "mean_token_accuracy": 0.724015012383461, "num_tokens": 13787337.0, "step": 17138 }, { "epoch": 4.539194915254237, "grad_norm": 2.136979341506958, "learning_rate": 7.73053495762712e-06, "loss": 1.0468, "mean_token_accuracy": 0.7569656148552895, "num_tokens": 13788800.0, "step": 17140 }, { "epoch": 4.539724576271187, "grad_norm": 2.065126895904541, "learning_rate": 7.730270127118645e-06, "loss": 1.3006, "mean_token_accuracy": 0.6896363645792007, "num_tokens": 13790621.0, "step": 17142 }, { "epoch": 4.540254237288136, "grad_norm": 2.3328163623809814, "learning_rate": 7.73000529661017e-06, "loss": 1.0858, "mean_token_accuracy": 0.7483485266566277, "num_tokens": 13792028.0, "step": 17144 }, { "epoch": 4.540783898305085, "grad_norm": 2.1529691219329834, "learning_rate": 7.729740466101695e-06, "loss": 1.4144, "mean_token_accuracy": 0.6757649853825569, "num_tokens": 13793829.0, "step": 17146 }, { "epoch": 4.541313559322034, "grad_norm": 2.29994797706604, "learning_rate": 7.729475635593221e-06, "loss": 1.3612, "mean_token_accuracy": 0.6875647380948067, "num_tokens": 13795626.0, "step": 17148 }, { "epoch": 4.541843220338983, "grad_norm": 1.6395796537399292, "learning_rate": 7.729210805084746e-06, "loss": 0.9104, "mean_token_accuracy": 0.7481856420636177, "num_tokens": 13798342.0, "step": 17150 }, { "epoch": 4.5423728813559325, "grad_norm": 1.8033959865570068, "learning_rate": 7.728945974576271e-06, "loss": 1.246, "mean_token_accuracy": 0.7149986326694489, "num_tokens": 13799978.0, "step": 17152 }, { "epoch": 4.5429025423728815, "grad_norm": 1.986359715461731, "learning_rate": 7.728681144067798e-06, "loss": 0.945, "mean_token_accuracy": 0.7688761651515961, "num_tokens": 13801385.0, "step": 17154 }, { "epoch": 4.5434322033898304, "grad_norm": 2.019564390182495, "learning_rate": 7.728416313559323e-06, "loss": 1.3002, "mean_token_accuracy": 0.6918476074934006, "num_tokens": 13802987.0, "step": 17156 }, { "epoch": 4.543961864406779, "grad_norm": 2.1471333503723145, "learning_rate": 7.72815148305085e-06, "loss": 1.1533, "mean_token_accuracy": 0.7305890917778015, "num_tokens": 13804510.0, "step": 17158 }, { "epoch": 4.544491525423728, "grad_norm": 1.9369611740112305, "learning_rate": 7.727886652542374e-06, "loss": 1.2166, "mean_token_accuracy": 0.728722870349884, "num_tokens": 13806348.0, "step": 17160 }, { "epoch": 4.545021186440678, "grad_norm": 1.9059525728225708, "learning_rate": 7.7276218220339e-06, "loss": 0.8468, "mean_token_accuracy": 0.7861173003911972, "num_tokens": 13807746.0, "step": 17162 }, { "epoch": 4.545550847457627, "grad_norm": 2.3180909156799316, "learning_rate": 7.727356991525424e-06, "loss": 0.8474, "mean_token_accuracy": 0.7890107557177544, "num_tokens": 13809119.0, "step": 17164 }, { "epoch": 4.546080508474576, "grad_norm": 1.8486392498016357, "learning_rate": 7.72709216101695e-06, "loss": 0.7577, "mean_token_accuracy": 0.7972248792648315, "num_tokens": 13810824.0, "step": 17166 }, { "epoch": 4.546610169491525, "grad_norm": 2.3551862239837646, "learning_rate": 7.726827330508476e-06, "loss": 1.4796, "mean_token_accuracy": 0.6730351448059082, "num_tokens": 13812253.0, "step": 17168 }, { "epoch": 4.547139830508475, "grad_norm": 1.8399111032485962, "learning_rate": 7.7265625e-06, "loss": 0.9588, "mean_token_accuracy": 0.7519035339355469, "num_tokens": 13813737.0, "step": 17170 }, { "epoch": 4.547669491525424, "grad_norm": 2.327072858810425, "learning_rate": 7.726297669491526e-06, "loss": 1.8382, "mean_token_accuracy": 0.6018907502293587, "num_tokens": 13815398.0, "step": 17172 }, { "epoch": 4.548199152542373, "grad_norm": 1.7339890003204346, "learning_rate": 7.726032838983052e-06, "loss": 0.8527, "mean_token_accuracy": 0.7891146540641785, "num_tokens": 13817028.0, "step": 17174 }, { "epoch": 4.548728813559322, "grad_norm": 2.337193250656128, "learning_rate": 7.725768008474577e-06, "loss": 1.1187, "mean_token_accuracy": 0.716734878718853, "num_tokens": 13818565.0, "step": 17176 }, { "epoch": 4.549258474576272, "grad_norm": 2.2300233840942383, "learning_rate": 7.725503177966102e-06, "loss": 1.391, "mean_token_accuracy": 0.7112375646829605, "num_tokens": 13819920.0, "step": 17178 }, { "epoch": 4.549788135593221, "grad_norm": 2.2436861991882324, "learning_rate": 7.725238347457627e-06, "loss": 0.9775, "mean_token_accuracy": 0.7589482516050339, "num_tokens": 13821432.0, "step": 17180 }, { "epoch": 4.5503177966101696, "grad_norm": 2.046908378601074, "learning_rate": 7.724973516949154e-06, "loss": 1.8322, "mean_token_accuracy": 0.6307505294680595, "num_tokens": 13823042.0, "step": 17182 }, { "epoch": 4.5508474576271185, "grad_norm": 2.3794212341308594, "learning_rate": 7.724708686440679e-06, "loss": 1.4626, "mean_token_accuracy": 0.6657344549894333, "num_tokens": 13824546.0, "step": 17184 }, { "epoch": 4.5513771186440675, "grad_norm": 2.0462865829467773, "learning_rate": 7.724443855932205e-06, "loss": 1.1859, "mean_token_accuracy": 0.7274282723665237, "num_tokens": 13826270.0, "step": 17186 }, { "epoch": 4.551906779661017, "grad_norm": 2.107405424118042, "learning_rate": 7.72417902542373e-06, "loss": 1.3583, "mean_token_accuracy": 0.6925728768110275, "num_tokens": 13827721.0, "step": 17188 }, { "epoch": 4.552436440677966, "grad_norm": 2.0466389656066895, "learning_rate": 7.723914194915255e-06, "loss": 1.4851, "mean_token_accuracy": 0.6711291745305061, "num_tokens": 13829204.0, "step": 17190 }, { "epoch": 4.552966101694915, "grad_norm": 2.321974277496338, "learning_rate": 7.72364936440678e-06, "loss": 1.3333, "mean_token_accuracy": 0.6970295310020447, "num_tokens": 13830775.0, "step": 17192 }, { "epoch": 4.553495762711864, "grad_norm": 2.1257097721099854, "learning_rate": 7.723384533898307e-06, "loss": 1.2684, "mean_token_accuracy": 0.6861740685999393, "num_tokens": 13832505.0, "step": 17194 }, { "epoch": 4.554025423728813, "grad_norm": 2.105281352996826, "learning_rate": 7.723119703389831e-06, "loss": 0.8056, "mean_token_accuracy": 0.7801389768719673, "num_tokens": 13834339.0, "step": 17196 }, { "epoch": 4.554555084745763, "grad_norm": 1.3879152536392212, "learning_rate": 7.722854872881356e-06, "loss": 1.0949, "mean_token_accuracy": 0.7401143535971642, "num_tokens": 13836857.0, "step": 17198 }, { "epoch": 4.555084745762712, "grad_norm": 2.0308449268341064, "learning_rate": 7.722590042372881e-06, "loss": 0.969, "mean_token_accuracy": 0.7743916809558868, "num_tokens": 13838372.0, "step": 17200 }, { "epoch": 4.555614406779661, "grad_norm": 2.1678361892700195, "learning_rate": 7.722325211864408e-06, "loss": 1.477, "mean_token_accuracy": 0.6735772788524628, "num_tokens": 13840066.0, "step": 17202 }, { "epoch": 4.55614406779661, "grad_norm": 2.0553207397460938, "learning_rate": 7.722060381355933e-06, "loss": 1.5917, "mean_token_accuracy": 0.6374626383185387, "num_tokens": 13841918.0, "step": 17204 }, { "epoch": 4.55667372881356, "grad_norm": 1.7626769542694092, "learning_rate": 7.721795550847458e-06, "loss": 1.1255, "mean_token_accuracy": 0.7420052662491798, "num_tokens": 13843415.0, "step": 17206 }, { "epoch": 4.557203389830509, "grad_norm": 2.023627519607544, "learning_rate": 7.721530720338983e-06, "loss": 1.0558, "mean_token_accuracy": 0.7313177287578583, "num_tokens": 13845069.0, "step": 17208 }, { "epoch": 4.557733050847458, "grad_norm": 2.1713759899139404, "learning_rate": 7.72126588983051e-06, "loss": 1.3505, "mean_token_accuracy": 0.691073440015316, "num_tokens": 13846652.0, "step": 17210 }, { "epoch": 4.558262711864407, "grad_norm": 2.4236488342285156, "learning_rate": 7.721001059322034e-06, "loss": 1.1974, "mean_token_accuracy": 0.7197625935077667, "num_tokens": 13848171.0, "step": 17212 }, { "epoch": 4.558792372881356, "grad_norm": 2.3402493000030518, "learning_rate": 7.720736228813561e-06, "loss": 1.6234, "mean_token_accuracy": 0.6491322964429855, "num_tokens": 13849731.0, "step": 17214 }, { "epoch": 4.559322033898305, "grad_norm": 2.0988402366638184, "learning_rate": 7.720471398305084e-06, "loss": 0.8809, "mean_token_accuracy": 0.7891687601804733, "num_tokens": 13851042.0, "step": 17216 }, { "epoch": 4.559851694915254, "grad_norm": 1.8674447536468506, "learning_rate": 7.72020656779661e-06, "loss": 1.1019, "mean_token_accuracy": 0.7558583468198776, "num_tokens": 13852457.0, "step": 17218 }, { "epoch": 4.560381355932203, "grad_norm": 1.8022140264511108, "learning_rate": 7.719941737288136e-06, "loss": 1.2447, "mean_token_accuracy": 0.7089028656482697, "num_tokens": 13854331.0, "step": 17220 }, { "epoch": 4.560911016949152, "grad_norm": 2.2213313579559326, "learning_rate": 7.719676906779662e-06, "loss": 1.0226, "mean_token_accuracy": 0.7581937909126282, "num_tokens": 13855721.0, "step": 17222 }, { "epoch": 4.561440677966102, "grad_norm": 2.2813847064971924, "learning_rate": 7.719412076271187e-06, "loss": 1.439, "mean_token_accuracy": 0.6778987646102905, "num_tokens": 13857166.0, "step": 17224 }, { "epoch": 4.561970338983051, "grad_norm": 2.0927579402923584, "learning_rate": 7.719147245762712e-06, "loss": 1.6162, "mean_token_accuracy": 0.6448197662830353, "num_tokens": 13858837.0, "step": 17226 }, { "epoch": 4.5625, "grad_norm": 1.8782647848129272, "learning_rate": 7.718882415254237e-06, "loss": 1.5539, "mean_token_accuracy": 0.6471459418535233, "num_tokens": 13860735.0, "step": 17228 }, { "epoch": 4.563029661016949, "grad_norm": 1.8198895454406738, "learning_rate": 7.718617584745764e-06, "loss": 1.2502, "mean_token_accuracy": 0.7171987369656563, "num_tokens": 13862453.0, "step": 17230 }, { "epoch": 4.563559322033898, "grad_norm": 1.6982098817825317, "learning_rate": 7.718352754237289e-06, "loss": 1.1627, "mean_token_accuracy": 0.733331024646759, "num_tokens": 13864049.0, "step": 17232 }, { "epoch": 4.564088983050848, "grad_norm": 1.7464574575424194, "learning_rate": 7.718087923728814e-06, "loss": 0.8729, "mean_token_accuracy": 0.7767012789845467, "num_tokens": 13865428.0, "step": 17234 }, { "epoch": 4.564618644067797, "grad_norm": 2.5726633071899414, "learning_rate": 7.717823093220339e-06, "loss": 1.5789, "mean_token_accuracy": 0.6617047004401684, "num_tokens": 13866876.0, "step": 17236 }, { "epoch": 4.565148305084746, "grad_norm": 1.4612281322479248, "learning_rate": 7.717558262711865e-06, "loss": 1.0673, "mean_token_accuracy": 0.7465871497988701, "num_tokens": 13868698.0, "step": 17238 }, { "epoch": 4.565677966101695, "grad_norm": 2.028865337371826, "learning_rate": 7.717293432203392e-06, "loss": 1.4847, "mean_token_accuracy": 0.6861062049865723, "num_tokens": 13870182.0, "step": 17240 }, { "epoch": 4.566207627118644, "grad_norm": 2.022561550140381, "learning_rate": 7.717028601694917e-06, "loss": 1.6717, "mean_token_accuracy": 0.6522846966981888, "num_tokens": 13871736.0, "step": 17242 }, { "epoch": 4.566737288135593, "grad_norm": 1.7980180978775024, "learning_rate": 7.716763771186442e-06, "loss": 1.4323, "mean_token_accuracy": 0.680472768843174, "num_tokens": 13873266.0, "step": 17244 }, { "epoch": 4.567266949152542, "grad_norm": 2.118978261947632, "learning_rate": 7.716498940677967e-06, "loss": 1.3333, "mean_token_accuracy": 0.685979351401329, "num_tokens": 13874878.0, "step": 17246 }, { "epoch": 4.567796610169491, "grad_norm": 2.1966967582702637, "learning_rate": 7.716234110169493e-06, "loss": 1.2429, "mean_token_accuracy": 0.7170390263199806, "num_tokens": 13876441.0, "step": 17248 }, { "epoch": 4.56832627118644, "grad_norm": 1.9226471185684204, "learning_rate": 7.715969279661018e-06, "loss": 1.497, "step": 17250 }, { "epoch": 4.56832627118644, "eval_loss": 1.313908576965332, "eval_mean_token_accuracy": 0.7013708019217888, "eval_num_tokens": 13878705.0, "eval_runtime": 48.1618, "eval_samples_per_second": 6.395, "eval_steps_per_second": 6.395, "step": 17250 }, { "epoch": 4.56885593220339, "grad_norm": 1.873353362083435, "learning_rate": 7.715704449152543e-06, "loss": 1.1659, "mean_token_accuracy": 0.7057113461196423, "num_tokens": 13880538.0, "step": 17252 }, { "epoch": 4.569385593220339, "grad_norm": 2.072505235671997, "learning_rate": 7.715439618644068e-06, "loss": 1.339, "mean_token_accuracy": 0.6852636449038982, "num_tokens": 13882205.0, "step": 17254 }, { "epoch": 4.569915254237288, "grad_norm": 2.74827241897583, "learning_rate": 7.715174788135595e-06, "loss": 1.3585, "mean_token_accuracy": 0.6732310131192207, "num_tokens": 13883710.0, "step": 17256 }, { "epoch": 4.570444915254237, "grad_norm": 1.7591331005096436, "learning_rate": 7.71490995762712e-06, "loss": 1.5223, "mean_token_accuracy": 0.6637616008520126, "num_tokens": 13885528.0, "step": 17258 }, { "epoch": 4.570974576271187, "grad_norm": 1.9137176275253296, "learning_rate": 7.714645127118644e-06, "loss": 1.1737, "mean_token_accuracy": 0.742184467613697, "num_tokens": 13887090.0, "step": 17260 }, { "epoch": 4.571504237288136, "grad_norm": 2.3131539821624756, "learning_rate": 7.71438029661017e-06, "loss": 1.1916, "mean_token_accuracy": 0.7022694572806358, "num_tokens": 13888673.0, "step": 17262 }, { "epoch": 4.572033898305085, "grad_norm": 2.48522686958313, "learning_rate": 7.714115466101696e-06, "loss": 1.6559, "mean_token_accuracy": 0.6403626687824726, "num_tokens": 13890376.0, "step": 17264 }, { "epoch": 4.572563559322034, "grad_norm": 2.0099568367004395, "learning_rate": 7.713850635593221e-06, "loss": 1.3885, "mean_token_accuracy": 0.674856886267662, "num_tokens": 13891978.0, "step": 17266 }, { "epoch": 4.573093220338983, "grad_norm": 2.060579538345337, "learning_rate": 7.713585805084748e-06, "loss": 1.2905, "mean_token_accuracy": 0.7090070471167564, "num_tokens": 13893560.0, "step": 17268 }, { "epoch": 4.5736228813559325, "grad_norm": 1.905734658241272, "learning_rate": 7.71332097457627e-06, "loss": 1.1619, "mean_token_accuracy": 0.7246450632810593, "num_tokens": 13894891.0, "step": 17270 }, { "epoch": 4.5741525423728815, "grad_norm": 1.7567113637924194, "learning_rate": 7.713056144067797e-06, "loss": 1.1105, "mean_token_accuracy": 0.7407716289162636, "num_tokens": 13896431.0, "step": 17272 }, { "epoch": 4.5746822033898304, "grad_norm": 2.099721908569336, "learning_rate": 7.712791313559322e-06, "loss": 1.4963, "mean_token_accuracy": 0.6577972695231438, "num_tokens": 13897984.0, "step": 17274 }, { "epoch": 4.575211864406779, "grad_norm": 2.1080222129821777, "learning_rate": 7.712526483050849e-06, "loss": 1.7185, "mean_token_accuracy": 0.6189439296722412, "num_tokens": 13899592.0, "step": 17276 }, { "epoch": 4.575741525423728, "grad_norm": 1.824618935585022, "learning_rate": 7.712261652542374e-06, "loss": 1.127, "mean_token_accuracy": 0.7333934232592583, "num_tokens": 13901375.0, "step": 17278 }, { "epoch": 4.576271186440678, "grad_norm": 2.4349217414855957, "learning_rate": 7.711996822033899e-06, "loss": 0.6954, "mean_token_accuracy": 0.8082966431975365, "num_tokens": 13902821.0, "step": 17280 }, { "epoch": 4.576800847457627, "grad_norm": 2.1310572624206543, "learning_rate": 7.711731991525424e-06, "loss": 0.9935, "mean_token_accuracy": 0.7599732279777527, "num_tokens": 13904220.0, "step": 17282 }, { "epoch": 4.577330508474576, "grad_norm": 1.7397432327270508, "learning_rate": 7.71146716101695e-06, "loss": 1.1623, "mean_token_accuracy": 0.7192524895071983, "num_tokens": 13906014.0, "step": 17284 }, { "epoch": 4.577860169491525, "grad_norm": 2.367262601852417, "learning_rate": 7.711202330508475e-06, "loss": 1.0829, "mean_token_accuracy": 0.7393245175480843, "num_tokens": 13907778.0, "step": 17286 }, { "epoch": 4.578389830508475, "grad_norm": 2.1430046558380127, "learning_rate": 7.7109375e-06, "loss": 1.2396, "mean_token_accuracy": 0.7172266095876694, "num_tokens": 13909206.0, "step": 17288 }, { "epoch": 4.578919491525424, "grad_norm": 1.5379395484924316, "learning_rate": 7.710672669491525e-06, "loss": 1.2556, "mean_token_accuracy": 0.6920193806290627, "num_tokens": 13911230.0, "step": 17290 }, { "epoch": 4.579449152542373, "grad_norm": 2.1713321208953857, "learning_rate": 7.710407838983052e-06, "loss": 0.9902, "mean_token_accuracy": 0.7745823413133621, "num_tokens": 13912589.0, "step": 17292 }, { "epoch": 4.579978813559322, "grad_norm": 1.8523311614990234, "learning_rate": 7.710143008474577e-06, "loss": 1.3183, "mean_token_accuracy": 0.6722506359219551, "num_tokens": 13914166.0, "step": 17294 }, { "epoch": 4.580508474576272, "grad_norm": 2.2268149852752686, "learning_rate": 7.709878177966103e-06, "loss": 1.0081, "mean_token_accuracy": 0.7452508583664894, "num_tokens": 13915535.0, "step": 17296 }, { "epoch": 4.581038135593221, "grad_norm": 1.8620452880859375, "learning_rate": 7.709613347457627e-06, "loss": 1.1261, "mean_token_accuracy": 0.7465591728687286, "num_tokens": 13917035.0, "step": 17298 }, { "epoch": 4.5815677966101696, "grad_norm": 1.6729117631912231, "learning_rate": 7.709348516949153e-06, "loss": 0.9655, "mean_token_accuracy": 0.7900535017251968, "num_tokens": 13918741.0, "step": 17300 }, { "epoch": 4.5820974576271185, "grad_norm": 2.208385705947876, "learning_rate": 7.709083686440678e-06, "loss": 1.1136, "mean_token_accuracy": 0.7351015135645866, "num_tokens": 13920450.0, "step": 17302 }, { "epoch": 4.5826271186440675, "grad_norm": 1.8104928731918335, "learning_rate": 7.708818855932205e-06, "loss": 1.2302, "mean_token_accuracy": 0.7010773308575153, "num_tokens": 13922149.0, "step": 17304 }, { "epoch": 4.583156779661017, "grad_norm": 1.992007851600647, "learning_rate": 7.70855402542373e-06, "loss": 1.1383, "mean_token_accuracy": 0.7352014854550362, "num_tokens": 13923614.0, "step": 17306 }, { "epoch": 4.583686440677966, "grad_norm": 1.9435409307479858, "learning_rate": 7.708289194915255e-06, "loss": 1.2872, "mean_token_accuracy": 0.7217016294598579, "num_tokens": 13925346.0, "step": 17308 }, { "epoch": 4.584216101694915, "grad_norm": 2.143007516860962, "learning_rate": 7.70802436440678e-06, "loss": 1.8994, "mean_token_accuracy": 0.6008256897330284, "num_tokens": 13927289.0, "step": 17310 }, { "epoch": 4.584745762711864, "grad_norm": 2.378098726272583, "learning_rate": 7.707759533898306e-06, "loss": 1.5383, "mean_token_accuracy": 0.6626998111605644, "num_tokens": 13928645.0, "step": 17312 }, { "epoch": 4.585275423728813, "grad_norm": 1.9671909809112549, "learning_rate": 7.707494703389831e-06, "loss": 0.8901, "mean_token_accuracy": 0.7720593586564064, "num_tokens": 13930316.0, "step": 17314 }, { "epoch": 4.585805084745763, "grad_norm": 2.091946840286255, "learning_rate": 7.707229872881356e-06, "loss": 1.1462, "mean_token_accuracy": 0.7459873929619789, "num_tokens": 13931815.0, "step": 17316 }, { "epoch": 4.586334745762712, "grad_norm": 2.3444936275482178, "learning_rate": 7.706965042372881e-06, "loss": 1.2848, "mean_token_accuracy": 0.7080305516719818, "num_tokens": 13933025.0, "step": 17318 }, { "epoch": 4.586864406779661, "grad_norm": 1.9128222465515137, "learning_rate": 7.706700211864408e-06, "loss": 1.0221, "mean_token_accuracy": 0.7435923740267754, "num_tokens": 13935602.0, "step": 17320 }, { "epoch": 4.58739406779661, "grad_norm": 1.5746175050735474, "learning_rate": 7.706435381355934e-06, "loss": 0.7662, "mean_token_accuracy": 0.796063095331192, "num_tokens": 13937089.0, "step": 17322 }, { "epoch": 4.58792372881356, "grad_norm": 1.9002715349197388, "learning_rate": 7.706170550847457e-06, "loss": 1.3705, "mean_token_accuracy": 0.7161506041884422, "num_tokens": 13938784.0, "step": 17324 }, { "epoch": 4.588453389830509, "grad_norm": 2.0855629444122314, "learning_rate": 7.705905720338984e-06, "loss": 1.2024, "mean_token_accuracy": 0.720206581056118, "num_tokens": 13940354.0, "step": 17326 }, { "epoch": 4.588983050847458, "grad_norm": 1.9919545650482178, "learning_rate": 7.705640889830509e-06, "loss": 0.8851, "mean_token_accuracy": 0.8013226613402367, "num_tokens": 13941493.0, "step": 17328 }, { "epoch": 4.589512711864407, "grad_norm": 2.0546834468841553, "learning_rate": 7.705376059322036e-06, "loss": 1.5106, "mean_token_accuracy": 0.6910203248262405, "num_tokens": 13942988.0, "step": 17330 }, { "epoch": 4.590042372881356, "grad_norm": 2.181952476501465, "learning_rate": 7.70511122881356e-06, "loss": 1.568, "mean_token_accuracy": 0.6324388086795807, "num_tokens": 13944649.0, "step": 17332 }, { "epoch": 4.590572033898305, "grad_norm": 2.3389511108398438, "learning_rate": 7.704846398305085e-06, "loss": 1.8496, "mean_token_accuracy": 0.6121114306151867, "num_tokens": 13946415.0, "step": 17334 }, { "epoch": 4.591101694915254, "grad_norm": 2.01827073097229, "learning_rate": 7.70458156779661e-06, "loss": 1.2025, "mean_token_accuracy": 0.720834530889988, "num_tokens": 13947809.0, "step": 17336 }, { "epoch": 4.591631355932203, "grad_norm": 2.1397621631622314, "learning_rate": 7.704316737288137e-06, "loss": 0.9274, "mean_token_accuracy": 0.7716900855302811, "num_tokens": 13949651.0, "step": 17338 }, { "epoch": 4.592161016949152, "grad_norm": 2.187762498855591, "learning_rate": 7.704051906779662e-06, "loss": 1.4153, "mean_token_accuracy": 0.6646321266889572, "num_tokens": 13951291.0, "step": 17340 }, { "epoch": 4.592690677966102, "grad_norm": 2.071403980255127, "learning_rate": 7.703787076271187e-06, "loss": 0.9374, "mean_token_accuracy": 0.7575028985738754, "num_tokens": 13952850.0, "step": 17342 }, { "epoch": 4.593220338983051, "grad_norm": 2.065680980682373, "learning_rate": 7.703522245762712e-06, "loss": 1.0455, "mean_token_accuracy": 0.7552703469991684, "num_tokens": 13954387.0, "step": 17344 }, { "epoch": 4.59375, "grad_norm": 1.984702467918396, "learning_rate": 7.703257415254238e-06, "loss": 1.2742, "mean_token_accuracy": 0.712988369166851, "num_tokens": 13956020.0, "step": 17346 }, { "epoch": 4.594279661016949, "grad_norm": 1.6550602912902832, "learning_rate": 7.702992584745763e-06, "loss": 1.0227, "mean_token_accuracy": 0.7511446997523308, "num_tokens": 13957641.0, "step": 17348 }, { "epoch": 4.594809322033898, "grad_norm": 2.3250792026519775, "learning_rate": 7.70272775423729e-06, "loss": 1.3017, "mean_token_accuracy": 0.6853704527020454, "num_tokens": 13958968.0, "step": 17350 }, { "epoch": 4.595338983050848, "grad_norm": 1.9834544658660889, "learning_rate": 7.702462923728813e-06, "loss": 1.5717, "mean_token_accuracy": 0.6570432782173157, "num_tokens": 13960482.0, "step": 17352 }, { "epoch": 4.595868644067797, "grad_norm": 1.7425081729888916, "learning_rate": 7.70219809322034e-06, "loss": 0.9212, "mean_token_accuracy": 0.772241584956646, "num_tokens": 13962126.0, "step": 17354 }, { "epoch": 4.596398305084746, "grad_norm": 1.4596600532531738, "learning_rate": 7.701933262711865e-06, "loss": 1.0074, "mean_token_accuracy": 0.7748575955629349, "num_tokens": 13963595.0, "step": 17356 }, { "epoch": 4.596927966101695, "grad_norm": 2.2179548740386963, "learning_rate": 7.701668432203391e-06, "loss": 1.418, "mean_token_accuracy": 0.6975885033607483, "num_tokens": 13965221.0, "step": 17358 }, { "epoch": 4.597457627118644, "grad_norm": 1.781574010848999, "learning_rate": 7.701403601694916e-06, "loss": 1.5995, "mean_token_accuracy": 0.6500667557120323, "num_tokens": 13966948.0, "step": 17360 }, { "epoch": 4.597987288135593, "grad_norm": 2.168743371963501, "learning_rate": 7.701138771186441e-06, "loss": 1.0741, "mean_token_accuracy": 0.7397991269826889, "num_tokens": 13968490.0, "step": 17362 }, { "epoch": 4.598516949152542, "grad_norm": 2.2883589267730713, "learning_rate": 7.700873940677966e-06, "loss": 1.509, "mean_token_accuracy": 0.6613461300730705, "num_tokens": 13970000.0, "step": 17364 }, { "epoch": 4.599046610169491, "grad_norm": 2.140385389328003, "learning_rate": 7.700609110169493e-06, "loss": 1.5138, "mean_token_accuracy": 0.6904298663139343, "num_tokens": 13971343.0, "step": 17366 }, { "epoch": 4.59957627118644, "grad_norm": 1.977434754371643, "learning_rate": 7.700344279661018e-06, "loss": 1.4684, "mean_token_accuracy": 0.6699635535478592, "num_tokens": 13972712.0, "step": 17368 }, { "epoch": 4.60010593220339, "grad_norm": 2.1097640991210938, "learning_rate": 7.700079449152543e-06, "loss": 1.156, "mean_token_accuracy": 0.7331311255693436, "num_tokens": 13974211.0, "step": 17370 }, { "epoch": 4.600635593220339, "grad_norm": 1.9314825534820557, "learning_rate": 7.699814618644068e-06, "loss": 1.1961, "mean_token_accuracy": 0.730481281876564, "num_tokens": 13975791.0, "step": 17372 }, { "epoch": 4.601165254237288, "grad_norm": 1.7234011888504028, "learning_rate": 7.699549788135594e-06, "loss": 1.4395, "mean_token_accuracy": 0.6828079670667648, "num_tokens": 13977613.0, "step": 17374 }, { "epoch": 4.601694915254237, "grad_norm": 1.8567898273468018, "learning_rate": 7.699284957627119e-06, "loss": 0.9859, "mean_token_accuracy": 0.7796743512153625, "num_tokens": 13979639.0, "step": 17376 }, { "epoch": 4.602224576271187, "grad_norm": 2.3240134716033936, "learning_rate": 7.699020127118644e-06, "loss": 1.66, "mean_token_accuracy": 0.6292417272925377, "num_tokens": 13981019.0, "step": 17378 }, { "epoch": 4.602754237288136, "grad_norm": 2.017704963684082, "learning_rate": 7.698755296610169e-06, "loss": 1.1058, "mean_token_accuracy": 0.7637705281376839, "num_tokens": 13982556.0, "step": 17380 }, { "epoch": 4.603283898305085, "grad_norm": 1.9086965322494507, "learning_rate": 7.698490466101696e-06, "loss": 0.9923, "mean_token_accuracy": 0.7697148323059082, "num_tokens": 13984127.0, "step": 17382 }, { "epoch": 4.603813559322034, "grad_norm": 3.564455032348633, "learning_rate": 7.69822563559322e-06, "loss": 1.3377, "mean_token_accuracy": 0.7043707519769669, "num_tokens": 13985578.0, "step": 17384 }, { "epoch": 4.604343220338983, "grad_norm": 1.8897322416305542, "learning_rate": 7.697960805084747e-06, "loss": 1.1256, "mean_token_accuracy": 0.7533597648143768, "num_tokens": 13987185.0, "step": 17386 }, { "epoch": 4.6048728813559325, "grad_norm": 1.8158185482025146, "learning_rate": 7.697695974576272e-06, "loss": 0.7658, "mean_token_accuracy": 0.7882359474897385, "num_tokens": 13988620.0, "step": 17388 }, { "epoch": 4.6054025423728815, "grad_norm": 1.285927176475525, "learning_rate": 7.697431144067797e-06, "loss": 1.0036, "mean_token_accuracy": 0.7342507615685463, "num_tokens": 13990992.0, "step": 17390 }, { "epoch": 4.6059322033898304, "grad_norm": 1.6809879541397095, "learning_rate": 7.697166313559322e-06, "loss": 0.7928, "mean_token_accuracy": 0.8022274300456047, "num_tokens": 13992376.0, "step": 17392 }, { "epoch": 4.606461864406779, "grad_norm": 2.023285388946533, "learning_rate": 7.696901483050849e-06, "loss": 1.4404, "mean_token_accuracy": 0.66655183583498, "num_tokens": 13994129.0, "step": 17394 }, { "epoch": 4.606991525423728, "grad_norm": 1.7830158472061157, "learning_rate": 7.696636652542374e-06, "loss": 1.2838, "mean_token_accuracy": 0.698475070297718, "num_tokens": 13996051.0, "step": 17396 }, { "epoch": 4.607521186440678, "grad_norm": 1.8782669305801392, "learning_rate": 7.696371822033898e-06, "loss": 0.9593, "mean_token_accuracy": 0.7670782133936882, "num_tokens": 13997633.0, "step": 17398 }, { "epoch": 4.608050847457627, "grad_norm": 2.1921024322509766, "learning_rate": 7.696106991525423e-06, "loss": 1.0198, "mean_token_accuracy": 0.7703921496868134, "num_tokens": 13999247.0, "step": 17400 }, { "epoch": 4.608580508474576, "grad_norm": 2.863070487976074, "learning_rate": 7.69584216101695e-06, "loss": 1.4796, "mean_token_accuracy": 0.672541119158268, "num_tokens": 14000821.0, "step": 17402 }, { "epoch": 4.609110169491525, "grad_norm": 2.1356420516967773, "learning_rate": 7.695577330508475e-06, "loss": 1.1246, "mean_token_accuracy": 0.7542910873889923, "num_tokens": 14002304.0, "step": 17404 }, { "epoch": 4.609639830508475, "grad_norm": 2.435821294784546, "learning_rate": 7.6953125e-06, "loss": 1.4924, "mean_token_accuracy": 0.6748212650418282, "num_tokens": 14004120.0, "step": 17406 }, { "epoch": 4.610169491525424, "grad_norm": 2.1689822673797607, "learning_rate": 7.695047669491526e-06, "loss": 1.2207, "mean_token_accuracy": 0.7087509445846081, "num_tokens": 14005653.0, "step": 17408 }, { "epoch": 4.610699152542373, "grad_norm": 2.2322540283203125, "learning_rate": 7.694782838983051e-06, "loss": 1.4641, "mean_token_accuracy": 0.6873485147953033, "num_tokens": 14007185.0, "step": 17410 }, { "epoch": 4.611228813559322, "grad_norm": 1.7751721143722534, "learning_rate": 7.694518008474578e-06, "loss": 1.1551, "mean_token_accuracy": 0.7145761847496033, "num_tokens": 14008691.0, "step": 17412 }, { "epoch": 4.611758474576272, "grad_norm": 2.3214497566223145, "learning_rate": 7.694253177966103e-06, "loss": 1.129, "mean_token_accuracy": 0.7302683591842651, "num_tokens": 14010218.0, "step": 17414 }, { "epoch": 4.612288135593221, "grad_norm": 2.094067335128784, "learning_rate": 7.693988347457628e-06, "loss": 1.2027, "mean_token_accuracy": 0.7243611961603165, "num_tokens": 14011828.0, "step": 17416 }, { "epoch": 4.6128177966101696, "grad_norm": 2.237511157989502, "learning_rate": 7.693723516949153e-06, "loss": 1.1995, "mean_token_accuracy": 0.7329409420490265, "num_tokens": 14013272.0, "step": 17418 }, { "epoch": 4.6133474576271185, "grad_norm": 2.103501796722412, "learning_rate": 7.69345868644068e-06, "loss": 1.5721, "mean_token_accuracy": 0.6508540399372578, "num_tokens": 14015074.0, "step": 17420 }, { "epoch": 4.6138771186440675, "grad_norm": 2.0005741119384766, "learning_rate": 7.693193855932204e-06, "loss": 0.9382, "mean_token_accuracy": 0.7832792028784752, "num_tokens": 14016385.0, "step": 17422 }, { "epoch": 4.614406779661017, "grad_norm": 1.7871949672698975, "learning_rate": 7.69292902542373e-06, "loss": 1.1819, "mean_token_accuracy": 0.7352855652570724, "num_tokens": 14018104.0, "step": 17424 }, { "epoch": 4.614936440677966, "grad_norm": 1.754892349243164, "learning_rate": 7.692664194915254e-06, "loss": 1.3697, "mean_token_accuracy": 0.7138980776071548, "num_tokens": 14019737.0, "step": 17426 }, { "epoch": 4.615466101694915, "grad_norm": 1.729358434677124, "learning_rate": 7.692399364406781e-06, "loss": 1.0795, "mean_token_accuracy": 0.7362094596028328, "num_tokens": 14021298.0, "step": 17428 }, { "epoch": 4.615995762711864, "grad_norm": 2.2840466499328613, "learning_rate": 7.692134533898306e-06, "loss": 1.6805, "mean_token_accuracy": 0.6325479000806808, "num_tokens": 14023120.0, "step": 17430 }, { "epoch": 4.616525423728813, "grad_norm": 1.6521767377853394, "learning_rate": 7.69186970338983e-06, "loss": 1.0805, "mean_token_accuracy": 0.739860787987709, "num_tokens": 14024654.0, "step": 17432 }, { "epoch": 4.617055084745763, "grad_norm": 2.303936243057251, "learning_rate": 7.691604872881356e-06, "loss": 1.5944, "mean_token_accuracy": 0.6363794803619385, "num_tokens": 14026159.0, "step": 17434 }, { "epoch": 4.617584745762712, "grad_norm": 2.106611728668213, "learning_rate": 7.691340042372882e-06, "loss": 1.4779, "mean_token_accuracy": 0.6804652437567711, "num_tokens": 14027681.0, "step": 17436 }, { "epoch": 4.618114406779661, "grad_norm": 2.1497130393981934, "learning_rate": 7.691075211864407e-06, "loss": 1.1947, "mean_token_accuracy": 0.736432820558548, "num_tokens": 14029191.0, "step": 17438 }, { "epoch": 4.61864406779661, "grad_norm": 1.8112374544143677, "learning_rate": 7.690810381355934e-06, "loss": 1.0859, "mean_token_accuracy": 0.7458006367087364, "num_tokens": 14030573.0, "step": 17440 }, { "epoch": 4.61917372881356, "grad_norm": 1.9417071342468262, "learning_rate": 7.690545550847459e-06, "loss": 1.1313, "mean_token_accuracy": 0.7421653121709824, "num_tokens": 14032318.0, "step": 17442 }, { "epoch": 4.619703389830509, "grad_norm": 2.100167989730835, "learning_rate": 7.690280720338984e-06, "loss": 0.9945, "mean_token_accuracy": 0.7353026494383812, "num_tokens": 14033735.0, "step": 17444 }, { "epoch": 4.620233050847458, "grad_norm": 1.85967218875885, "learning_rate": 7.690015889830509e-06, "loss": 1.6238, "mean_token_accuracy": 0.6586802937090397, "num_tokens": 14036184.0, "step": 17446 }, { "epoch": 4.620762711864407, "grad_norm": 2.157566785812378, "learning_rate": 7.689751059322035e-06, "loss": 1.0211, "mean_token_accuracy": 0.7418418675661087, "num_tokens": 14037719.0, "step": 17448 }, { "epoch": 4.621292372881356, "grad_norm": 1.8782353401184082, "learning_rate": 7.68948622881356e-06, "loss": 1.2517, "mean_token_accuracy": 0.7134739309549332, "num_tokens": 14039364.0, "step": 17450 }, { "epoch": 4.621822033898305, "grad_norm": 1.9002474546432495, "learning_rate": 7.689221398305085e-06, "loss": 1.1838, "mean_token_accuracy": 0.728517085313797, "num_tokens": 14041015.0, "step": 17452 }, { "epoch": 4.622351694915254, "grad_norm": 1.8404262065887451, "learning_rate": 7.68895656779661e-06, "loss": 1.1401, "mean_token_accuracy": 0.7431835308670998, "num_tokens": 14042575.0, "step": 17454 }, { "epoch": 4.622881355932203, "grad_norm": 1.6662052869796753, "learning_rate": 7.688691737288137e-06, "loss": 1.2718, "mean_token_accuracy": 0.7184990495443344, "num_tokens": 14044411.0, "step": 17456 }, { "epoch": 4.623411016949152, "grad_norm": 1.781197428703308, "learning_rate": 7.688426906779662e-06, "loss": 1.2043, "mean_token_accuracy": 0.7400550842285156, "num_tokens": 14045903.0, "step": 17458 }, { "epoch": 4.623940677966102, "grad_norm": 1.5287749767303467, "learning_rate": 7.688162076271186e-06, "loss": 1.3046, "mean_token_accuracy": 0.7096343263983727, "num_tokens": 14047662.0, "step": 17460 }, { "epoch": 4.624470338983051, "grad_norm": 1.771808385848999, "learning_rate": 7.687897245762711e-06, "loss": 1.294, "mean_token_accuracy": 0.7276392132043839, "num_tokens": 14049439.0, "step": 17462 }, { "epoch": 4.625, "grad_norm": 2.6662588119506836, "learning_rate": 7.687632415254238e-06, "loss": 1.2524, "mean_token_accuracy": 0.7293512746691704, "num_tokens": 14050627.0, "step": 17464 }, { "epoch": 4.625529661016949, "grad_norm": 1.952392578125, "learning_rate": 7.687367584745763e-06, "loss": 1.2518, "mean_token_accuracy": 0.7298529148101807, "num_tokens": 14052144.0, "step": 17466 }, { "epoch": 4.626059322033898, "grad_norm": 1.725142240524292, "learning_rate": 7.68710275423729e-06, "loss": 1.2907, "mean_token_accuracy": 0.7152023985981941, "num_tokens": 14053845.0, "step": 17468 }, { "epoch": 4.626588983050848, "grad_norm": 2.2929515838623047, "learning_rate": 7.686837923728815e-06, "loss": 1.3042, "mean_token_accuracy": 0.7041998282074928, "num_tokens": 14055207.0, "step": 17470 }, { "epoch": 4.627118644067797, "grad_norm": 2.4938409328460693, "learning_rate": 7.68657309322034e-06, "loss": 1.0314, "mean_token_accuracy": 0.747696191072464, "num_tokens": 14056502.0, "step": 17472 }, { "epoch": 4.627648305084746, "grad_norm": 1.947996973991394, "learning_rate": 7.686308262711864e-06, "loss": 1.236, "mean_token_accuracy": 0.7419099099934101, "num_tokens": 14058276.0, "step": 17474 }, { "epoch": 4.628177966101695, "grad_norm": 1.8262619972229004, "learning_rate": 7.686043432203391e-06, "loss": 1.3659, "mean_token_accuracy": 0.6953795403242111, "num_tokens": 14059822.0, "step": 17476 }, { "epoch": 4.628707627118644, "grad_norm": 1.7301627397537231, "learning_rate": 7.685778601694916e-06, "loss": 0.9062, "mean_token_accuracy": 0.775553971529007, "num_tokens": 14061492.0, "step": 17478 }, { "epoch": 4.629237288135593, "grad_norm": 1.8642754554748535, "learning_rate": 7.685513771186441e-06, "loss": 1.009, "mean_token_accuracy": 0.7564626485109329, "num_tokens": 14063362.0, "step": 17480 }, { "epoch": 4.629766949152542, "grad_norm": 2.111407995223999, "learning_rate": 7.685248940677966e-06, "loss": 1.323, "mean_token_accuracy": 0.715978316962719, "num_tokens": 14065005.0, "step": 17482 }, { "epoch": 4.630296610169491, "grad_norm": 1.8715976476669312, "learning_rate": 7.684984110169492e-06, "loss": 0.7574, "mean_token_accuracy": 0.8257614448666573, "num_tokens": 14066255.0, "step": 17484 }, { "epoch": 4.63082627118644, "grad_norm": 1.8315664529800415, "learning_rate": 7.684719279661017e-06, "loss": 0.8865, "mean_token_accuracy": 0.7846743986010551, "num_tokens": 14067829.0, "step": 17486 }, { "epoch": 4.63135593220339, "grad_norm": 1.8587062358856201, "learning_rate": 7.684454449152542e-06, "loss": 1.3432, "mean_token_accuracy": 0.6736918725073338, "num_tokens": 14069607.0, "step": 17488 }, { "epoch": 4.631885593220339, "grad_norm": 2.1686556339263916, "learning_rate": 7.684189618644067e-06, "loss": 1.1857, "mean_token_accuracy": 0.7258108854293823, "num_tokens": 14071339.0, "step": 17490 }, { "epoch": 4.632415254237288, "grad_norm": 2.310253620147705, "learning_rate": 7.683924788135594e-06, "loss": 1.2179, "mean_token_accuracy": 0.7460156232118607, "num_tokens": 14072710.0, "step": 17492 }, { "epoch": 4.632944915254237, "grad_norm": 2.1763195991516113, "learning_rate": 7.68365995762712e-06, "loss": 1.5028, "mean_token_accuracy": 0.6737349554896355, "num_tokens": 14074329.0, "step": 17494 }, { "epoch": 4.633474576271187, "grad_norm": 2.2441904544830322, "learning_rate": 7.683395127118645e-06, "loss": 1.3155, "mean_token_accuracy": 0.7010372579097748, "num_tokens": 14075840.0, "step": 17496 }, { "epoch": 4.634004237288136, "grad_norm": 2.1720221042633057, "learning_rate": 7.68313029661017e-06, "loss": 1.2905, "mean_token_accuracy": 0.6936571821570396, "num_tokens": 14077207.0, "step": 17498 }, { "epoch": 4.634533898305085, "grad_norm": 2.0787463188171387, "learning_rate": 7.682865466101695e-06, "loss": 1.3088, "step": 17500 }, { "epoch": 4.634533898305085, "eval_loss": 1.3119981288909912, "eval_mean_token_accuracy": 0.7016178529177394, "eval_num_tokens": 14078786.0, "eval_runtime": 48.2877, "eval_samples_per_second": 6.378, "eval_steps_per_second": 6.378, "step": 17500 }, { "epoch": 4.635063559322034, "grad_norm": 2.1883385181427, "learning_rate": 7.682600635593222e-06, "loss": 1.5442, "mean_token_accuracy": 0.674480777233839, "num_tokens": 14080321.0, "step": 17502 }, { "epoch": 4.635593220338983, "grad_norm": 2.17160701751709, "learning_rate": 7.682335805084747e-06, "loss": 1.6215, "mean_token_accuracy": 0.6500027105212212, "num_tokens": 14081809.0, "step": 17504 }, { "epoch": 4.6361228813559325, "grad_norm": 1.9806230068206787, "learning_rate": 7.682070974576272e-06, "loss": 1.3588, "mean_token_accuracy": 0.6892109140753746, "num_tokens": 14083490.0, "step": 17506 }, { "epoch": 4.6366525423728815, "grad_norm": 2.855771064758301, "learning_rate": 7.681806144067797e-06, "loss": 1.2995, "mean_token_accuracy": 0.6986358314752579, "num_tokens": 14085034.0, "step": 17508 }, { "epoch": 4.6371822033898304, "grad_norm": 2.365074396133423, "learning_rate": 7.681541313559323e-06, "loss": 1.4893, "mean_token_accuracy": 0.6841061040759087, "num_tokens": 14086685.0, "step": 17510 }, { "epoch": 4.637711864406779, "grad_norm": 2.0690503120422363, "learning_rate": 7.681276483050848e-06, "loss": 0.938, "mean_token_accuracy": 0.7889842242002487, "num_tokens": 14088216.0, "step": 17512 }, { "epoch": 4.638241525423728, "grad_norm": 1.8281267881393433, "learning_rate": 7.681011652542373e-06, "loss": 1.0848, "mean_token_accuracy": 0.7412643507122993, "num_tokens": 14089936.0, "step": 17514 }, { "epoch": 4.638771186440678, "grad_norm": 2.083977460861206, "learning_rate": 7.680746822033898e-06, "loss": 0.9716, "mean_token_accuracy": 0.7766664326190948, "num_tokens": 14091171.0, "step": 17516 }, { "epoch": 4.639300847457627, "grad_norm": 2.362339735031128, "learning_rate": 7.680481991525425e-06, "loss": 1.3338, "mean_token_accuracy": 0.7000928744673729, "num_tokens": 14092726.0, "step": 17518 }, { "epoch": 4.639830508474576, "grad_norm": 2.070847749710083, "learning_rate": 7.68021716101695e-06, "loss": 1.2799, "mean_token_accuracy": 0.7247824594378471, "num_tokens": 14094128.0, "step": 17520 }, { "epoch": 4.640360169491525, "grad_norm": 2.234938144683838, "learning_rate": 7.679952330508476e-06, "loss": 1.3307, "mean_token_accuracy": 0.7071085646748543, "num_tokens": 14095617.0, "step": 17522 }, { "epoch": 4.640889830508475, "grad_norm": 2.915990114212036, "learning_rate": 7.679687500000001e-06, "loss": 1.41, "mean_token_accuracy": 0.6813563853502274, "num_tokens": 14096916.0, "step": 17524 }, { "epoch": 4.641419491525424, "grad_norm": 2.1032116413116455, "learning_rate": 7.679422669491526e-06, "loss": 1.1309, "mean_token_accuracy": 0.7363097667694092, "num_tokens": 14098359.0, "step": 17526 }, { "epoch": 4.641949152542373, "grad_norm": 1.965141773223877, "learning_rate": 7.679157838983051e-06, "loss": 0.8598, "mean_token_accuracy": 0.7823142409324646, "num_tokens": 14100035.0, "step": 17528 }, { "epoch": 4.642478813559322, "grad_norm": 2.394399642944336, "learning_rate": 7.678893008474578e-06, "loss": 1.0923, "mean_token_accuracy": 0.7530773282051086, "num_tokens": 14101265.0, "step": 17530 }, { "epoch": 4.643008474576272, "grad_norm": 2.0697338581085205, "learning_rate": 7.678628177966103e-06, "loss": 1.2868, "mean_token_accuracy": 0.7223824672400951, "num_tokens": 14102853.0, "step": 17532 }, { "epoch": 4.643538135593221, "grad_norm": 2.5672953128814697, "learning_rate": 7.678363347457628e-06, "loss": 1.2581, "mean_token_accuracy": 0.7055361270904541, "num_tokens": 14104175.0, "step": 17534 }, { "epoch": 4.6440677966101696, "grad_norm": 2.3088266849517822, "learning_rate": 7.678098516949152e-06, "loss": 1.3509, "mean_token_accuracy": 0.6712957732379436, "num_tokens": 14105604.0, "step": 17536 }, { "epoch": 4.6445974576271185, "grad_norm": 2.049412488937378, "learning_rate": 7.677833686440679e-06, "loss": 1.3379, "mean_token_accuracy": 0.7283502593636513, "num_tokens": 14107262.0, "step": 17538 }, { "epoch": 4.6451271186440675, "grad_norm": 2.2993199825286865, "learning_rate": 7.677568855932204e-06, "loss": 1.0972, "mean_token_accuracy": 0.7255324870347977, "num_tokens": 14108800.0, "step": 17540 }, { "epoch": 4.645656779661017, "grad_norm": 2.391826868057251, "learning_rate": 7.677304025423729e-06, "loss": 1.3824, "mean_token_accuracy": 0.6887888386845589, "num_tokens": 14110586.0, "step": 17542 }, { "epoch": 4.646186440677966, "grad_norm": 1.7197909355163574, "learning_rate": 7.677039194915254e-06, "loss": 0.8291, "mean_token_accuracy": 0.7891651019454002, "num_tokens": 14112084.0, "step": 17544 }, { "epoch": 4.646716101694915, "grad_norm": 2.14058256149292, "learning_rate": 7.67677436440678e-06, "loss": 1.0184, "mean_token_accuracy": 0.7498796805739403, "num_tokens": 14113691.0, "step": 17546 }, { "epoch": 4.647245762711864, "grad_norm": 2.246389389038086, "learning_rate": 7.676509533898305e-06, "loss": 0.9371, "mean_token_accuracy": 0.7799345999956131, "num_tokens": 14115057.0, "step": 17548 }, { "epoch": 4.647775423728813, "grad_norm": 2.014554262161255, "learning_rate": 7.676244703389832e-06, "loss": 1.4149, "mean_token_accuracy": 0.7072106227278709, "num_tokens": 14116518.0, "step": 17550 }, { "epoch": 4.648305084745763, "grad_norm": 2.579061985015869, "learning_rate": 7.675979872881357e-06, "loss": 1.2898, "mean_token_accuracy": 0.7081420049071312, "num_tokens": 14117870.0, "step": 17552 }, { "epoch": 4.648834745762712, "grad_norm": 1.9403139352798462, "learning_rate": 7.675715042372882e-06, "loss": 1.1669, "mean_token_accuracy": 0.726419247686863, "num_tokens": 14119460.0, "step": 17554 }, { "epoch": 4.649364406779661, "grad_norm": 1.7572532892227173, "learning_rate": 7.675450211864407e-06, "loss": 1.1113, "mean_token_accuracy": 0.7326792255043983, "num_tokens": 14121258.0, "step": 17556 }, { "epoch": 4.64989406779661, "grad_norm": 1.9375720024108887, "learning_rate": 7.675185381355933e-06, "loss": 1.268, "mean_token_accuracy": 0.7089560851454735, "num_tokens": 14122904.0, "step": 17558 }, { "epoch": 4.65042372881356, "grad_norm": 2.3249142169952393, "learning_rate": 7.674920550847458e-06, "loss": 1.5679, "mean_token_accuracy": 0.6706712767481804, "num_tokens": 14124613.0, "step": 17560 }, { "epoch": 4.650953389830509, "grad_norm": 2.386348009109497, "learning_rate": 7.674655720338983e-06, "loss": 1.248, "mean_token_accuracy": 0.7138031832873821, "num_tokens": 14126201.0, "step": 17562 }, { "epoch": 4.651483050847458, "grad_norm": 2.0472445487976074, "learning_rate": 7.674390889830508e-06, "loss": 1.0265, "mean_token_accuracy": 0.7569416463375092, "num_tokens": 14127991.0, "step": 17564 }, { "epoch": 4.652012711864407, "grad_norm": 1.990756869316101, "learning_rate": 7.674126059322035e-06, "loss": 0.8344, "mean_token_accuracy": 0.7719254270195961, "num_tokens": 14129624.0, "step": 17566 }, { "epoch": 4.652542372881356, "grad_norm": 1.6704440116882324, "learning_rate": 7.67386122881356e-06, "loss": 1.2319, "mean_token_accuracy": 0.7264972403645515, "num_tokens": 14131184.0, "step": 17568 }, { "epoch": 4.653072033898305, "grad_norm": 1.826170563697815, "learning_rate": 7.673596398305085e-06, "loss": 1.6313, "mean_token_accuracy": 0.6460883691906929, "num_tokens": 14132847.0, "step": 17570 }, { "epoch": 4.653601694915254, "grad_norm": 1.7338320016860962, "learning_rate": 7.67333156779661e-06, "loss": 1.3237, "mean_token_accuracy": 0.6751040071249008, "num_tokens": 14134824.0, "step": 17572 }, { "epoch": 4.654131355932203, "grad_norm": 1.724990725517273, "learning_rate": 7.673066737288136e-06, "loss": 1.3105, "mean_token_accuracy": 0.711068905889988, "num_tokens": 14136497.0, "step": 17574 }, { "epoch": 4.654661016949152, "grad_norm": 1.7731760740280151, "learning_rate": 7.672801906779663e-06, "loss": 0.9034, "mean_token_accuracy": 0.7834422886371613, "num_tokens": 14138078.0, "step": 17576 }, { "epoch": 4.655190677966102, "grad_norm": 2.2530126571655273, "learning_rate": 7.672537076271188e-06, "loss": 1.3541, "mean_token_accuracy": 0.6921888813376427, "num_tokens": 14139514.0, "step": 17578 }, { "epoch": 4.655720338983051, "grad_norm": 2.129133701324463, "learning_rate": 7.672272245762713e-06, "loss": 1.1169, "mean_token_accuracy": 0.7186135500669479, "num_tokens": 14140990.0, "step": 17580 }, { "epoch": 4.65625, "grad_norm": 1.6518610715866089, "learning_rate": 7.672007415254238e-06, "loss": 0.9292, "mean_token_accuracy": 0.759665310382843, "num_tokens": 14142519.0, "step": 17582 }, { "epoch": 4.656779661016949, "grad_norm": 1.9082164764404297, "learning_rate": 7.671742584745764e-06, "loss": 1.3745, "mean_token_accuracy": 0.6796051934361458, "num_tokens": 14144307.0, "step": 17584 }, { "epoch": 4.657309322033898, "grad_norm": 1.7364342212677002, "learning_rate": 7.67147775423729e-06, "loss": 0.8028, "mean_token_accuracy": 0.796486459672451, "num_tokens": 14145744.0, "step": 17586 }, { "epoch": 4.657838983050848, "grad_norm": 2.1339075565338135, "learning_rate": 7.671212923728814e-06, "loss": 0.9956, "mean_token_accuracy": 0.7634727656841278, "num_tokens": 14147125.0, "step": 17588 }, { "epoch": 4.658368644067797, "grad_norm": 1.86223304271698, "learning_rate": 7.670948093220339e-06, "loss": 1.4965, "mean_token_accuracy": 0.6897448524832726, "num_tokens": 14148904.0, "step": 17590 }, { "epoch": 4.658898305084746, "grad_norm": 1.7811634540557861, "learning_rate": 7.670683262711866e-06, "loss": 1.5807, "mean_token_accuracy": 0.6615370213985443, "num_tokens": 14150669.0, "step": 17592 }, { "epoch": 4.659427966101695, "grad_norm": 2.543989419937134, "learning_rate": 7.67041843220339e-06, "loss": 1.4116, "mean_token_accuracy": 0.6632841601967812, "num_tokens": 14152340.0, "step": 17594 }, { "epoch": 4.659957627118644, "grad_norm": 2.150076389312744, "learning_rate": 7.670153601694916e-06, "loss": 1.6244, "mean_token_accuracy": 0.6409964635968208, "num_tokens": 14154195.0, "step": 17596 }, { "epoch": 4.660487288135593, "grad_norm": 1.6854091882705688, "learning_rate": 7.66988877118644e-06, "loss": 1.0789, "mean_token_accuracy": 0.7201984152197838, "num_tokens": 14155828.0, "step": 17598 }, { "epoch": 4.661016949152542, "grad_norm": 2.249481678009033, "learning_rate": 7.669623940677967e-06, "loss": 1.1342, "mean_token_accuracy": 0.7272208854556084, "num_tokens": 14157304.0, "step": 17600 }, { "epoch": 4.661546610169491, "grad_norm": 2.077576160430908, "learning_rate": 7.669359110169492e-06, "loss": 0.9675, "mean_token_accuracy": 0.771377295255661, "num_tokens": 14158742.0, "step": 17602 }, { "epoch": 4.66207627118644, "grad_norm": 1.3418859243392944, "learning_rate": 7.669094279661019e-06, "loss": 1.1859, "mean_token_accuracy": 0.7289635837078094, "num_tokens": 14160525.0, "step": 17604 }, { "epoch": 4.66260593220339, "grad_norm": 2.0389246940612793, "learning_rate": 7.668829449152544e-06, "loss": 1.1483, "mean_token_accuracy": 0.718223437666893, "num_tokens": 14161971.0, "step": 17606 }, { "epoch": 4.663135593220339, "grad_norm": 1.986933708190918, "learning_rate": 7.668564618644069e-06, "loss": 1.1146, "mean_token_accuracy": 0.7349165380001068, "num_tokens": 14163767.0, "step": 17608 }, { "epoch": 4.663665254237288, "grad_norm": 1.8734395503997803, "learning_rate": 7.668299788135593e-06, "loss": 1.3173, "mean_token_accuracy": 0.699894092977047, "num_tokens": 14165550.0, "step": 17610 }, { "epoch": 4.664194915254237, "grad_norm": 2.028874397277832, "learning_rate": 7.66803495762712e-06, "loss": 1.3285, "mean_token_accuracy": 0.7019972577691078, "num_tokens": 14167180.0, "step": 17612 }, { "epoch": 4.664724576271187, "grad_norm": 1.562439203262329, "learning_rate": 7.667770127118645e-06, "loss": 1.1158, "mean_token_accuracy": 0.7386883869767189, "num_tokens": 14169090.0, "step": 17614 }, { "epoch": 4.665254237288136, "grad_norm": 1.912683367729187, "learning_rate": 7.66750529661017e-06, "loss": 1.0593, "mean_token_accuracy": 0.7439427077770233, "num_tokens": 14170565.0, "step": 17616 }, { "epoch": 4.665783898305085, "grad_norm": 2.1856448650360107, "learning_rate": 7.667240466101695e-06, "loss": 1.3454, "mean_token_accuracy": 0.6851741597056389, "num_tokens": 14172372.0, "step": 17618 }, { "epoch": 4.666313559322034, "grad_norm": 1.9895585775375366, "learning_rate": 7.666975635593221e-06, "loss": 1.4783, "mean_token_accuracy": 0.6882437989115715, "num_tokens": 14173920.0, "step": 17620 }, { "epoch": 4.666843220338983, "grad_norm": 1.812343955039978, "learning_rate": 7.666710805084746e-06, "loss": 1.1594, "mean_token_accuracy": 0.7149251326918602, "num_tokens": 14175599.0, "step": 17622 }, { "epoch": 4.6673728813559325, "grad_norm": 2.072873592376709, "learning_rate": 7.666445974576271e-06, "loss": 1.4063, "mean_token_accuracy": 0.6879020668566227, "num_tokens": 14177087.0, "step": 17624 }, { "epoch": 4.6679025423728815, "grad_norm": 2.2202181816101074, "learning_rate": 7.666181144067796e-06, "loss": 0.9365, "mean_token_accuracy": 0.7706108838319778, "num_tokens": 14178451.0, "step": 17626 }, { "epoch": 4.6684322033898304, "grad_norm": 1.8268779516220093, "learning_rate": 7.665916313559323e-06, "loss": 1.4989, "mean_token_accuracy": 0.6735965982079506, "num_tokens": 14180213.0, "step": 17628 }, { "epoch": 4.668961864406779, "grad_norm": 1.86625337600708, "learning_rate": 7.665651483050848e-06, "loss": 1.1593, "mean_token_accuracy": 0.7189957723021507, "num_tokens": 14181957.0, "step": 17630 }, { "epoch": 4.669491525423728, "grad_norm": 2.048600435256958, "learning_rate": 7.665386652542374e-06, "loss": 1.2179, "mean_token_accuracy": 0.7063331231474876, "num_tokens": 14183568.0, "step": 17632 }, { "epoch": 4.670021186440678, "grad_norm": 1.9881904125213623, "learning_rate": 7.6651218220339e-06, "loss": 1.411, "mean_token_accuracy": 0.6822425350546837, "num_tokens": 14185109.0, "step": 17634 }, { "epoch": 4.670550847457627, "grad_norm": 2.245673894882202, "learning_rate": 7.664856991525424e-06, "loss": 1.0902, "mean_token_accuracy": 0.7402197048068047, "num_tokens": 14186696.0, "step": 17636 }, { "epoch": 4.671080508474576, "grad_norm": 1.9582228660583496, "learning_rate": 7.66459216101695e-06, "loss": 1.3544, "mean_token_accuracy": 0.6830802485346794, "num_tokens": 14188210.0, "step": 17638 }, { "epoch": 4.671610169491525, "grad_norm": 2.2419891357421875, "learning_rate": 7.664327330508476e-06, "loss": 1.6456, "mean_token_accuracy": 0.6314448043704033, "num_tokens": 14189912.0, "step": 17640 }, { "epoch": 4.672139830508475, "grad_norm": 2.0992136001586914, "learning_rate": 7.6640625e-06, "loss": 1.2627, "mean_token_accuracy": 0.6994223445653915, "num_tokens": 14191576.0, "step": 17642 }, { "epoch": 4.672669491525424, "grad_norm": 2.488611936569214, "learning_rate": 7.663797669491526e-06, "loss": 1.2755, "mean_token_accuracy": 0.6963343843817711, "num_tokens": 14192842.0, "step": 17644 }, { "epoch": 4.673199152542373, "grad_norm": 2.3464643955230713, "learning_rate": 7.66353283898305e-06, "loss": 1.4165, "mean_token_accuracy": 0.66302540153265, "num_tokens": 14194325.0, "step": 17646 }, { "epoch": 4.673728813559322, "grad_norm": 1.7707040309906006, "learning_rate": 7.663268008474577e-06, "loss": 1.4325, "mean_token_accuracy": 0.673743836581707, "num_tokens": 14196125.0, "step": 17648 }, { "epoch": 4.674258474576272, "grad_norm": 2.069389581680298, "learning_rate": 7.663003177966102e-06, "loss": 1.17, "mean_token_accuracy": 0.7246793732047081, "num_tokens": 14197665.0, "step": 17650 }, { "epoch": 4.674788135593221, "grad_norm": 2.3473587036132812, "learning_rate": 7.662738347457627e-06, "loss": 1.4058, "mean_token_accuracy": 0.7003251984715462, "num_tokens": 14199196.0, "step": 17652 }, { "epoch": 4.6753177966101696, "grad_norm": 2.303511381149292, "learning_rate": 7.662473516949152e-06, "loss": 1.4078, "mean_token_accuracy": 0.6865461766719818, "num_tokens": 14200622.0, "step": 17654 }, { "epoch": 4.6758474576271185, "grad_norm": 1.9326728582382202, "learning_rate": 7.662208686440679e-06, "loss": 1.1932, "mean_token_accuracy": 0.7284692749381065, "num_tokens": 14202394.0, "step": 17656 }, { "epoch": 4.6763771186440675, "grad_norm": 2.320258140563965, "learning_rate": 7.661943855932204e-06, "loss": 1.4708, "mean_token_accuracy": 0.6771086826920509, "num_tokens": 14204029.0, "step": 17658 }, { "epoch": 4.676906779661017, "grad_norm": 1.9328930377960205, "learning_rate": 7.66167902542373e-06, "loss": 1.1304, "mean_token_accuracy": 0.7310052514076233, "num_tokens": 14205642.0, "step": 17660 }, { "epoch": 4.677436440677966, "grad_norm": 2.1087429523468018, "learning_rate": 7.661414194915255e-06, "loss": 1.2599, "mean_token_accuracy": 0.720089003443718, "num_tokens": 14207149.0, "step": 17662 }, { "epoch": 4.677966101694915, "grad_norm": 1.815454363822937, "learning_rate": 7.66114936440678e-06, "loss": 1.1698, "mean_token_accuracy": 0.7243719100952148, "num_tokens": 14208576.0, "step": 17664 }, { "epoch": 4.678495762711864, "grad_norm": 2.0749969482421875, "learning_rate": 7.660884533898307e-06, "loss": 1.1278, "mean_token_accuracy": 0.7485819309949875, "num_tokens": 14210202.0, "step": 17666 }, { "epoch": 4.679025423728813, "grad_norm": 1.6991653442382812, "learning_rate": 7.660619703389832e-06, "loss": 0.6307, "mean_token_accuracy": 0.8315404206514359, "num_tokens": 14211923.0, "step": 17668 }, { "epoch": 4.679555084745763, "grad_norm": 2.4386041164398193, "learning_rate": 7.660354872881357e-06, "loss": 1.52, "mean_token_accuracy": 0.6714247316122055, "num_tokens": 14213618.0, "step": 17670 }, { "epoch": 4.680084745762712, "grad_norm": 2.108372688293457, "learning_rate": 7.660090042372882e-06, "loss": 1.1664, "mean_token_accuracy": 0.7060892656445503, "num_tokens": 14215189.0, "step": 17672 }, { "epoch": 4.680614406779661, "grad_norm": 2.0268847942352295, "learning_rate": 7.659825211864408e-06, "loss": 1.2436, "mean_token_accuracy": 0.707107424736023, "num_tokens": 14216858.0, "step": 17674 }, { "epoch": 4.68114406779661, "grad_norm": 1.814599871635437, "learning_rate": 7.659560381355933e-06, "loss": 1.0807, "mean_token_accuracy": 0.7327573671936989, "num_tokens": 14218351.0, "step": 17676 }, { "epoch": 4.68167372881356, "grad_norm": 1.7783398628234863, "learning_rate": 7.659295550847458e-06, "loss": 1.3271, "mean_token_accuracy": 0.6886169835925102, "num_tokens": 14219964.0, "step": 17678 }, { "epoch": 4.682203389830509, "grad_norm": 2.0558176040649414, "learning_rate": 7.659030720338983e-06, "loss": 1.2646, "mean_token_accuracy": 0.7298371642827988, "num_tokens": 14221382.0, "step": 17680 }, { "epoch": 4.682733050847458, "grad_norm": 2.36834979057312, "learning_rate": 7.65876588983051e-06, "loss": 1.1422, "mean_token_accuracy": 0.7515207976102829, "num_tokens": 14222900.0, "step": 17682 }, { "epoch": 4.683262711864407, "grad_norm": 1.8174362182617188, "learning_rate": 7.658501059322034e-06, "loss": 0.9716, "mean_token_accuracy": 0.7652236670255661, "num_tokens": 14224555.0, "step": 17684 }, { "epoch": 4.683792372881356, "grad_norm": 1.9877605438232422, "learning_rate": 7.658236228813561e-06, "loss": 1.2401, "mean_token_accuracy": 0.7034750953316689, "num_tokens": 14226191.0, "step": 17686 }, { "epoch": 4.684322033898305, "grad_norm": 1.6958434581756592, "learning_rate": 7.657971398305086e-06, "loss": 1.306, "mean_token_accuracy": 0.7147959396243095, "num_tokens": 14227789.0, "step": 17688 }, { "epoch": 4.684851694915254, "grad_norm": 2.124600648880005, "learning_rate": 7.657706567796611e-06, "loss": 1.2275, "mean_token_accuracy": 0.7151727676391602, "num_tokens": 14229156.0, "step": 17690 }, { "epoch": 4.685381355932203, "grad_norm": 1.7172695398330688, "learning_rate": 7.657441737288136e-06, "loss": 1.2895, "mean_token_accuracy": 0.7149837017059326, "num_tokens": 14230695.0, "step": 17692 }, { "epoch": 4.685911016949152, "grad_norm": 1.7202036380767822, "learning_rate": 7.657176906779663e-06, "loss": 0.9611, "mean_token_accuracy": 0.7631253004074097, "num_tokens": 14232556.0, "step": 17694 }, { "epoch": 4.686440677966102, "grad_norm": 2.2530031204223633, "learning_rate": 7.656912076271187e-06, "loss": 1.2541, "mean_token_accuracy": 0.7049214467406273, "num_tokens": 14233928.0, "step": 17696 }, { "epoch": 4.686970338983051, "grad_norm": 1.9216902256011963, "learning_rate": 7.656647245762712e-06, "loss": 1.4898, "mean_token_accuracy": 0.6700681820511818, "num_tokens": 14235690.0, "step": 17698 }, { "epoch": 4.6875, "grad_norm": 1.9903212785720825, "learning_rate": 7.656382415254237e-06, "loss": 1.3112, "mean_token_accuracy": 0.7038761749863625, "num_tokens": 14237082.0, "step": 17700 }, { "epoch": 4.688029661016949, "grad_norm": 1.9697190523147583, "learning_rate": 7.656117584745764e-06, "loss": 1.3817, "mean_token_accuracy": 0.6598492562770844, "num_tokens": 14238758.0, "step": 17702 }, { "epoch": 4.688559322033898, "grad_norm": 2.326627016067505, "learning_rate": 7.655852754237289e-06, "loss": 1.1585, "mean_token_accuracy": 0.7275978028774261, "num_tokens": 14240193.0, "step": 17704 }, { "epoch": 4.689088983050848, "grad_norm": 2.0135350227355957, "learning_rate": 7.655587923728814e-06, "loss": 0.777, "mean_token_accuracy": 0.7961162626743317, "num_tokens": 14241710.0, "step": 17706 }, { "epoch": 4.689618644067797, "grad_norm": 2.2386868000030518, "learning_rate": 7.655323093220339e-06, "loss": 1.5333, "mean_token_accuracy": 0.6172235161066055, "num_tokens": 14243391.0, "step": 17708 }, { "epoch": 4.690148305084746, "grad_norm": 2.298510789871216, "learning_rate": 7.655058262711865e-06, "loss": 0.9254, "mean_token_accuracy": 0.7676657736301422, "num_tokens": 14244903.0, "step": 17710 }, { "epoch": 4.690677966101695, "grad_norm": 2.3891358375549316, "learning_rate": 7.65479343220339e-06, "loss": 1.3727, "mean_token_accuracy": 0.7276382967829704, "num_tokens": 14246367.0, "step": 17712 }, { "epoch": 4.691207627118644, "grad_norm": 1.9775341749191284, "learning_rate": 7.654528601694917e-06, "loss": 1.1774, "mean_token_accuracy": 0.7411560490727425, "num_tokens": 14248043.0, "step": 17714 }, { "epoch": 4.691737288135593, "grad_norm": 1.8850586414337158, "learning_rate": 7.65426377118644e-06, "loss": 1.0881, "mean_token_accuracy": 0.7482290267944336, "num_tokens": 14249687.0, "step": 17716 }, { "epoch": 4.692266949152542, "grad_norm": 2.05479097366333, "learning_rate": 7.653998940677967e-06, "loss": 1.5436, "mean_token_accuracy": 0.6648216098546982, "num_tokens": 14251035.0, "step": 17718 }, { "epoch": 4.692796610169491, "grad_norm": 1.9897089004516602, "learning_rate": 7.653734110169492e-06, "loss": 0.9607, "mean_token_accuracy": 0.7697803750634193, "num_tokens": 14252403.0, "step": 17720 }, { "epoch": 4.69332627118644, "grad_norm": 1.9379913806915283, "learning_rate": 7.653469279661018e-06, "loss": 1.2833, "mean_token_accuracy": 0.7125543355941772, "num_tokens": 14253973.0, "step": 17722 }, { "epoch": 4.69385593220339, "grad_norm": 1.9032479524612427, "learning_rate": 7.653204449152543e-06, "loss": 1.2755, "mean_token_accuracy": 0.7175554409623146, "num_tokens": 14255311.0, "step": 17724 }, { "epoch": 4.694385593220339, "grad_norm": 1.985707402229309, "learning_rate": 7.652939618644068e-06, "loss": 1.3482, "mean_token_accuracy": 0.7057598456740379, "num_tokens": 14256842.0, "step": 17726 }, { "epoch": 4.694915254237288, "grad_norm": 2.185697078704834, "learning_rate": 7.652674788135593e-06, "loss": 1.5157, "mean_token_accuracy": 0.6487066298723221, "num_tokens": 14258402.0, "step": 17728 }, { "epoch": 4.695444915254237, "grad_norm": 1.6534478664398193, "learning_rate": 7.65240995762712e-06, "loss": 1.0938, "mean_token_accuracy": 0.7360690459609032, "num_tokens": 14260042.0, "step": 17730 }, { "epoch": 4.695974576271187, "grad_norm": 2.014838218688965, "learning_rate": 7.652145127118645e-06, "loss": 1.1771, "mean_token_accuracy": 0.7483387961983681, "num_tokens": 14261600.0, "step": 17732 }, { "epoch": 4.696504237288136, "grad_norm": 2.0106282234191895, "learning_rate": 7.65188029661017e-06, "loss": 1.4054, "mean_token_accuracy": 0.6794084198772907, "num_tokens": 14263223.0, "step": 17734 }, { "epoch": 4.697033898305085, "grad_norm": 2.099489450454712, "learning_rate": 7.651615466101694e-06, "loss": 1.4125, "mean_token_accuracy": 0.674213245511055, "num_tokens": 14264707.0, "step": 17736 }, { "epoch": 4.697563559322034, "grad_norm": 1.995477318763733, "learning_rate": 7.651350635593221e-06, "loss": 1.494, "mean_token_accuracy": 0.6820612698793411, "num_tokens": 14266169.0, "step": 17738 }, { "epoch": 4.698093220338983, "grad_norm": 1.7404190301895142, "learning_rate": 7.651085805084746e-06, "loss": 1.2914, "mean_token_accuracy": 0.7146836891770363, "num_tokens": 14267937.0, "step": 17740 }, { "epoch": 4.6986228813559325, "grad_norm": 1.6761531829833984, "learning_rate": 7.650820974576273e-06, "loss": 1.1188, "mean_token_accuracy": 0.7306918576359749, "num_tokens": 14269455.0, "step": 17742 }, { "epoch": 4.6991525423728815, "grad_norm": 1.8009182214736938, "learning_rate": 7.650556144067798e-06, "loss": 0.8403, "mean_token_accuracy": 0.7674421742558479, "num_tokens": 14270792.0, "step": 17744 }, { "epoch": 4.6996822033898304, "grad_norm": 2.0590782165527344, "learning_rate": 7.650291313559323e-06, "loss": 1.7345, "mean_token_accuracy": 0.6204392984509468, "num_tokens": 14272647.0, "step": 17746 }, { "epoch": 4.700211864406779, "grad_norm": 1.9633444547653198, "learning_rate": 7.650026483050849e-06, "loss": 0.8745, "mean_token_accuracy": 0.7744278311729431, "num_tokens": 14274182.0, "step": 17748 }, { "epoch": 4.700741525423728, "grad_norm": 2.202190399169922, "learning_rate": 7.649761652542374e-06, "loss": 1.3586, "step": 17750 }, { "epoch": 4.700741525423728, "eval_loss": 1.3120408058166504, "eval_mean_token_accuracy": 0.7015347666554637, "eval_num_tokens": 14276006.0, "eval_runtime": 48.2988, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 17750 }, { "epoch": 4.701271186440678, "grad_norm": 1.9491220712661743, "learning_rate": 7.649496822033899e-06, "loss": 1.1104, "mean_token_accuracy": 0.729184877127409, "num_tokens": 14277512.0, "step": 17752 }, { "epoch": 4.701800847457627, "grad_norm": 1.5656559467315674, "learning_rate": 7.649231991525424e-06, "loss": 0.9088, "mean_token_accuracy": 0.7648718804121017, "num_tokens": 14279344.0, "step": 17754 }, { "epoch": 4.702330508474576, "grad_norm": 1.8726248741149902, "learning_rate": 7.64896716101695e-06, "loss": 0.7856, "mean_token_accuracy": 0.7932452335953712, "num_tokens": 14280810.0, "step": 17756 }, { "epoch": 4.702860169491525, "grad_norm": 1.6017173528671265, "learning_rate": 7.648702330508475e-06, "loss": 1.1503, "mean_token_accuracy": 0.7451022788882256, "num_tokens": 14282110.0, "step": 17758 }, { "epoch": 4.703389830508475, "grad_norm": 1.8099571466445923, "learning_rate": 7.6484375e-06, "loss": 1.1396, "mean_token_accuracy": 0.7562893778085709, "num_tokens": 14283587.0, "step": 17760 }, { "epoch": 4.703919491525424, "grad_norm": 1.7660987377166748, "learning_rate": 7.648172669491525e-06, "loss": 1.3487, "mean_token_accuracy": 0.6872302927076817, "num_tokens": 14285178.0, "step": 17762 }, { "epoch": 4.704449152542373, "grad_norm": 2.1101739406585693, "learning_rate": 7.647907838983052e-06, "loss": 1.3427, "mean_token_accuracy": 0.6770785748958588, "num_tokens": 14287744.0, "step": 17764 }, { "epoch": 4.704978813559322, "grad_norm": 2.1358978748321533, "learning_rate": 7.647643008474577e-06, "loss": 1.0745, "mean_token_accuracy": 0.7643268629908562, "num_tokens": 14289044.0, "step": 17766 }, { "epoch": 4.705508474576272, "grad_norm": 2.1440012454986572, "learning_rate": 7.647378177966104e-06, "loss": 1.2163, "mean_token_accuracy": 0.7064433246850967, "num_tokens": 14290722.0, "step": 17768 }, { "epoch": 4.706038135593221, "grad_norm": 2.021044969558716, "learning_rate": 7.647113347457627e-06, "loss": 1.2502, "mean_token_accuracy": 0.7005647718906403, "num_tokens": 14292438.0, "step": 17770 }, { "epoch": 4.7065677966101696, "grad_norm": 2.1118063926696777, "learning_rate": 7.646848516949153e-06, "loss": 1.1492, "mean_token_accuracy": 0.7315978705883026, "num_tokens": 14294038.0, "step": 17772 }, { "epoch": 4.7070974576271185, "grad_norm": 1.96442711353302, "learning_rate": 7.646583686440678e-06, "loss": 1.5913, "mean_token_accuracy": 0.6362531408667564, "num_tokens": 14295864.0, "step": 17774 }, { "epoch": 4.7076271186440675, "grad_norm": 2.220252275466919, "learning_rate": 7.646318855932205e-06, "loss": 1.7213, "mean_token_accuracy": 0.613340076059103, "num_tokens": 14297619.0, "step": 17776 }, { "epoch": 4.708156779661017, "grad_norm": 2.1018428802490234, "learning_rate": 7.64605402542373e-06, "loss": 0.9927, "mean_token_accuracy": 0.7409751787781715, "num_tokens": 14299084.0, "step": 17778 }, { "epoch": 4.708686440677966, "grad_norm": 1.7690623998641968, "learning_rate": 7.645789194915255e-06, "loss": 1.3203, "mean_token_accuracy": 0.7201320305466652, "num_tokens": 14300646.0, "step": 17780 }, { "epoch": 4.709216101694915, "grad_norm": 2.0627024173736572, "learning_rate": 7.64552436440678e-06, "loss": 0.9607, "mean_token_accuracy": 0.7557245716452599, "num_tokens": 14302261.0, "step": 17782 }, { "epoch": 4.709745762711864, "grad_norm": 2.077303409576416, "learning_rate": 7.645259533898306e-06, "loss": 1.811, "mean_token_accuracy": 0.5900291875004768, "num_tokens": 14304056.0, "step": 17784 }, { "epoch": 4.710275423728813, "grad_norm": 1.9192063808441162, "learning_rate": 7.644994703389831e-06, "loss": 0.8414, "mean_token_accuracy": 0.7484259828925133, "num_tokens": 14305629.0, "step": 17786 }, { "epoch": 4.710805084745763, "grad_norm": 1.7794266939163208, "learning_rate": 7.644729872881356e-06, "loss": 1.0423, "mean_token_accuracy": 0.7577505260705948, "num_tokens": 14307209.0, "step": 17788 }, { "epoch": 4.711334745762712, "grad_norm": 2.1943812370300293, "learning_rate": 7.644465042372881e-06, "loss": 1.1552, "mean_token_accuracy": 0.7295388653874397, "num_tokens": 14308975.0, "step": 17790 }, { "epoch": 4.711864406779661, "grad_norm": 2.3223214149475098, "learning_rate": 7.644200211864408e-06, "loss": 1.2526, "mean_token_accuracy": 0.7172909900546074, "num_tokens": 14310488.0, "step": 17792 }, { "epoch": 4.71239406779661, "grad_norm": 1.9419517517089844, "learning_rate": 7.643935381355933e-06, "loss": 1.0474, "mean_token_accuracy": 0.7507921010255814, "num_tokens": 14312036.0, "step": 17794 }, { "epoch": 4.71292372881356, "grad_norm": 1.5342607498168945, "learning_rate": 7.64367055084746e-06, "loss": 1.0216, "mean_token_accuracy": 0.7677598744630814, "num_tokens": 14313602.0, "step": 17796 }, { "epoch": 4.713453389830509, "grad_norm": 2.2000479698181152, "learning_rate": 7.643405720338983e-06, "loss": 1.4322, "mean_token_accuracy": 0.6733951196074486, "num_tokens": 14315042.0, "step": 17798 }, { "epoch": 4.713983050847458, "grad_norm": 1.775970697402954, "learning_rate": 7.64314088983051e-06, "loss": 1.013, "mean_token_accuracy": 0.7603904828429222, "num_tokens": 14316666.0, "step": 17800 }, { "epoch": 4.714512711864407, "grad_norm": 2.031726837158203, "learning_rate": 7.642876059322034e-06, "loss": 1.3575, "mean_token_accuracy": 0.6920246779918671, "num_tokens": 14318249.0, "step": 17802 }, { "epoch": 4.715042372881356, "grad_norm": 1.6653960943222046, "learning_rate": 7.64261122881356e-06, "loss": 1.0248, "mean_token_accuracy": 0.7563070207834244, "num_tokens": 14319757.0, "step": 17804 }, { "epoch": 4.715572033898305, "grad_norm": 1.9642131328582764, "learning_rate": 7.642346398305086e-06, "loss": 1.1346, "mean_token_accuracy": 0.7247886210680008, "num_tokens": 14321525.0, "step": 17806 }, { "epoch": 4.716101694915254, "grad_norm": 1.6873105764389038, "learning_rate": 7.64208156779661e-06, "loss": 1.1658, "mean_token_accuracy": 0.7388393208384514, "num_tokens": 14323243.0, "step": 17808 }, { "epoch": 4.716631355932203, "grad_norm": 1.7859554290771484, "learning_rate": 7.641816737288135e-06, "loss": 1.1042, "mean_token_accuracy": 0.7289600968360901, "num_tokens": 14324665.0, "step": 17810 }, { "epoch": 4.717161016949152, "grad_norm": 1.4609451293945312, "learning_rate": 7.641551906779662e-06, "loss": 0.9565, "mean_token_accuracy": 0.7574492022395134, "num_tokens": 14326365.0, "step": 17812 }, { "epoch": 4.717690677966102, "grad_norm": 2.2653722763061523, "learning_rate": 7.641287076271187e-06, "loss": 1.186, "mean_token_accuracy": 0.7328604757785797, "num_tokens": 14327745.0, "step": 17814 }, { "epoch": 4.718220338983051, "grad_norm": 1.6715694665908813, "learning_rate": 7.641022245762712e-06, "loss": 1.3349, "mean_token_accuracy": 0.6966638937592506, "num_tokens": 14329448.0, "step": 17816 }, { "epoch": 4.71875, "grad_norm": 1.9695531129837036, "learning_rate": 7.640757415254237e-06, "loss": 1.0453, "mean_token_accuracy": 0.7400127723813057, "num_tokens": 14330980.0, "step": 17818 }, { "epoch": 4.719279661016949, "grad_norm": 2.1941494941711426, "learning_rate": 7.640492584745764e-06, "loss": 1.5593, "mean_token_accuracy": 0.6512388437986374, "num_tokens": 14332779.0, "step": 17820 }, { "epoch": 4.719809322033898, "grad_norm": 2.5320208072662354, "learning_rate": 7.640227754237288e-06, "loss": 1.647, "mean_token_accuracy": 0.6568707600235939, "num_tokens": 14334107.0, "step": 17822 }, { "epoch": 4.720338983050848, "grad_norm": 2.328819513320923, "learning_rate": 7.639962923728813e-06, "loss": 1.0029, "mean_token_accuracy": 0.7733935713768005, "num_tokens": 14335521.0, "step": 17824 }, { "epoch": 4.720868644067797, "grad_norm": 2.1936588287353516, "learning_rate": 7.639698093220338e-06, "loss": 1.4947, "mean_token_accuracy": 0.6542865931987762, "num_tokens": 14337146.0, "step": 17826 }, { "epoch": 4.721398305084746, "grad_norm": 2.0035345554351807, "learning_rate": 7.639433262711865e-06, "loss": 1.575, "mean_token_accuracy": 0.6528713777661324, "num_tokens": 14338853.0, "step": 17828 }, { "epoch": 4.721927966101695, "grad_norm": 2.036919593811035, "learning_rate": 7.639168432203392e-06, "loss": 1.2846, "mean_token_accuracy": 0.7218509465456009, "num_tokens": 14340331.0, "step": 17830 }, { "epoch": 4.722457627118644, "grad_norm": 1.9554880857467651, "learning_rate": 7.638903601694916e-06, "loss": 1.2385, "mean_token_accuracy": 0.7015143930912018, "num_tokens": 14341973.0, "step": 17832 }, { "epoch": 4.722987288135593, "grad_norm": 2.030700922012329, "learning_rate": 7.638638771186441e-06, "loss": 1.2583, "mean_token_accuracy": 0.7284938022494316, "num_tokens": 14343637.0, "step": 17834 }, { "epoch": 4.723516949152542, "grad_norm": 1.7915130853652954, "learning_rate": 7.638373940677966e-06, "loss": 0.9867, "mean_token_accuracy": 0.7555208280682564, "num_tokens": 14345117.0, "step": 17836 }, { "epoch": 4.724046610169491, "grad_norm": 1.7661399841308594, "learning_rate": 7.638109110169493e-06, "loss": 0.9582, "mean_token_accuracy": 0.7470827102661133, "num_tokens": 14346893.0, "step": 17838 }, { "epoch": 4.72457627118644, "grad_norm": 1.749760627746582, "learning_rate": 7.637844279661018e-06, "loss": 1.0859, "mean_token_accuracy": 0.7330373302102089, "num_tokens": 14348192.0, "step": 17840 }, { "epoch": 4.72510593220339, "grad_norm": 1.9435856342315674, "learning_rate": 7.637579449152543e-06, "loss": 1.3961, "mean_token_accuracy": 0.6932710856199265, "num_tokens": 14349867.0, "step": 17842 }, { "epoch": 4.725635593220339, "grad_norm": 2.095102071762085, "learning_rate": 7.637314618644068e-06, "loss": 1.5432, "mean_token_accuracy": 0.6747578158974648, "num_tokens": 14351526.0, "step": 17844 }, { "epoch": 4.726165254237288, "grad_norm": 1.810150146484375, "learning_rate": 7.637049788135594e-06, "loss": 1.132, "mean_token_accuracy": 0.7555876448750496, "num_tokens": 14353245.0, "step": 17846 }, { "epoch": 4.726694915254237, "grad_norm": 2.391812324523926, "learning_rate": 7.63678495762712e-06, "loss": 1.4657, "mean_token_accuracy": 0.6813077554106712, "num_tokens": 14354717.0, "step": 17848 }, { "epoch": 4.727224576271187, "grad_norm": 1.8000396490097046, "learning_rate": 7.636520127118646e-06, "loss": 1.1548, "mean_token_accuracy": 0.7335281148552895, "num_tokens": 14356332.0, "step": 17850 }, { "epoch": 4.727754237288136, "grad_norm": 2.00889253616333, "learning_rate": 7.63625529661017e-06, "loss": 1.1448, "mean_token_accuracy": 0.7252161353826523, "num_tokens": 14358281.0, "step": 17852 }, { "epoch": 4.728283898305085, "grad_norm": 1.910090446472168, "learning_rate": 7.635990466101696e-06, "loss": 1.4094, "mean_token_accuracy": 0.6936012804508209, "num_tokens": 14360023.0, "step": 17854 }, { "epoch": 4.728813559322034, "grad_norm": 2.198429822921753, "learning_rate": 7.63572563559322e-06, "loss": 1.2683, "mean_token_accuracy": 0.7355326190590858, "num_tokens": 14361535.0, "step": 17856 }, { "epoch": 4.729343220338983, "grad_norm": 2.0188510417938232, "learning_rate": 7.635460805084747e-06, "loss": 0.9347, "mean_token_accuracy": 0.7748411521315575, "num_tokens": 14363015.0, "step": 17858 }, { "epoch": 4.7298728813559325, "grad_norm": 2.6291451454162598, "learning_rate": 7.635195974576272e-06, "loss": 1.288, "mean_token_accuracy": 0.6985624805092812, "num_tokens": 14364496.0, "step": 17860 }, { "epoch": 4.7304025423728815, "grad_norm": 1.7554727792739868, "learning_rate": 7.634931144067797e-06, "loss": 1.1519, "mean_token_accuracy": 0.7427983582019806, "num_tokens": 14365991.0, "step": 17862 }, { "epoch": 4.7309322033898304, "grad_norm": 1.9184887409210205, "learning_rate": 7.634666313559322e-06, "loss": 1.6353, "mean_token_accuracy": 0.6524771824479103, "num_tokens": 14367685.0, "step": 17864 }, { "epoch": 4.731461864406779, "grad_norm": 2.2388858795166016, "learning_rate": 7.634401483050849e-06, "loss": 1.5843, "mean_token_accuracy": 0.6740669459104538, "num_tokens": 14369025.0, "step": 17866 }, { "epoch": 4.731991525423728, "grad_norm": 2.4663689136505127, "learning_rate": 7.634136652542374e-06, "loss": 1.515, "mean_token_accuracy": 0.6701760441064835, "num_tokens": 14370292.0, "step": 17868 }, { "epoch": 4.732521186440678, "grad_norm": 1.8018044233322144, "learning_rate": 7.633871822033899e-06, "loss": 1.2803, "mean_token_accuracy": 0.7188556268811226, "num_tokens": 14372116.0, "step": 17870 }, { "epoch": 4.733050847457627, "grad_norm": 1.8738603591918945, "learning_rate": 7.633606991525424e-06, "loss": 1.4126, "mean_token_accuracy": 0.669635221362114, "num_tokens": 14373832.0, "step": 17872 }, { "epoch": 4.733580508474576, "grad_norm": 2.2378814220428467, "learning_rate": 7.63334216101695e-06, "loss": 1.4242, "mean_token_accuracy": 0.6778949201107025, "num_tokens": 14375462.0, "step": 17874 }, { "epoch": 4.734110169491525, "grad_norm": 2.3650403022766113, "learning_rate": 7.633077330508475e-06, "loss": 1.3888, "mean_token_accuracy": 0.7084409296512604, "num_tokens": 14376814.0, "step": 17876 }, { "epoch": 4.734639830508475, "grad_norm": 2.1244237422943115, "learning_rate": 7.6328125e-06, "loss": 0.9781, "mean_token_accuracy": 0.7534870356321335, "num_tokens": 14378420.0, "step": 17878 }, { "epoch": 4.735169491525424, "grad_norm": 2.099860668182373, "learning_rate": 7.632547669491525e-06, "loss": 1.3744, "mean_token_accuracy": 0.6845361813902855, "num_tokens": 14379792.0, "step": 17880 }, { "epoch": 4.735699152542373, "grad_norm": 2.1051604747772217, "learning_rate": 7.632282838983052e-06, "loss": 0.807, "mean_token_accuracy": 0.7940650954842567, "num_tokens": 14381142.0, "step": 17882 }, { "epoch": 4.736228813559322, "grad_norm": 2.089736223220825, "learning_rate": 7.632018008474577e-06, "loss": 1.3318, "mean_token_accuracy": 0.7092660143971443, "num_tokens": 14382976.0, "step": 17884 }, { "epoch": 4.736758474576272, "grad_norm": 1.8602443933486938, "learning_rate": 7.631753177966103e-06, "loss": 1.2306, "mean_token_accuracy": 0.7174030691385269, "num_tokens": 14384382.0, "step": 17886 }, { "epoch": 4.737288135593221, "grad_norm": 2.0381853580474854, "learning_rate": 7.631488347457628e-06, "loss": 1.3251, "mean_token_accuracy": 0.6790157780051231, "num_tokens": 14386170.0, "step": 17888 }, { "epoch": 4.7378177966101696, "grad_norm": 1.9039419889450073, "learning_rate": 7.631223516949153e-06, "loss": 1.0275, "mean_token_accuracy": 0.7560817152261734, "num_tokens": 14387756.0, "step": 17890 }, { "epoch": 4.7383474576271185, "grad_norm": 2.3262581825256348, "learning_rate": 7.630958686440678e-06, "loss": 1.0902, "mean_token_accuracy": 0.7520337104797363, "num_tokens": 14389239.0, "step": 17892 }, { "epoch": 4.7388771186440675, "grad_norm": 2.149350881576538, "learning_rate": 7.630693855932205e-06, "loss": 1.3467, "mean_token_accuracy": 0.7115295343101025, "num_tokens": 14390723.0, "step": 17894 }, { "epoch": 4.739406779661017, "grad_norm": 1.8220288753509521, "learning_rate": 7.63042902542373e-06, "loss": 0.8291, "mean_token_accuracy": 0.798037014901638, "num_tokens": 14392425.0, "step": 17896 }, { "epoch": 4.739936440677966, "grad_norm": 2.2300963401794434, "learning_rate": 7.630164194915254e-06, "loss": 1.687, "mean_token_accuracy": 0.6615795940160751, "num_tokens": 14394213.0, "step": 17898 }, { "epoch": 4.740466101694915, "grad_norm": 2.1404712200164795, "learning_rate": 7.62989936440678e-06, "loss": 1.5421, "mean_token_accuracy": 0.6668664067983627, "num_tokens": 14395734.0, "step": 17900 }, { "epoch": 4.740995762711864, "grad_norm": 2.13741135597229, "learning_rate": 7.629634533898306e-06, "loss": 1.4386, "mean_token_accuracy": 0.6923268735408783, "num_tokens": 14397510.0, "step": 17902 }, { "epoch": 4.741525423728813, "grad_norm": 2.035391330718994, "learning_rate": 7.629369703389831e-06, "loss": 1.0839, "mean_token_accuracy": 0.7390725836157799, "num_tokens": 14399915.0, "step": 17904 }, { "epoch": 4.742055084745763, "grad_norm": 2.4003188610076904, "learning_rate": 7.629104872881357e-06, "loss": 1.724, "mean_token_accuracy": 0.6131894290447235, "num_tokens": 14401443.0, "step": 17906 }, { "epoch": 4.742584745762712, "grad_norm": 2.125916004180908, "learning_rate": 7.628840042372882e-06, "loss": 1.3219, "mean_token_accuracy": 0.7143352851271629, "num_tokens": 14403059.0, "step": 17908 }, { "epoch": 4.743114406779661, "grad_norm": 2.221130847930908, "learning_rate": 7.628575211864407e-06, "loss": 1.1587, "mean_token_accuracy": 0.7402889728546143, "num_tokens": 14404660.0, "step": 17910 }, { "epoch": 4.74364406779661, "grad_norm": 2.793241500854492, "learning_rate": 7.628310381355933e-06, "loss": 1.3258, "mean_token_accuracy": 0.6941701769828796, "num_tokens": 14406021.0, "step": 17912 }, { "epoch": 4.74417372881356, "grad_norm": 2.436461925506592, "learning_rate": 7.628045550847458e-06, "loss": 1.3723, "mean_token_accuracy": 0.6947334036231041, "num_tokens": 14407499.0, "step": 17914 }, { "epoch": 4.744703389830509, "grad_norm": 1.564910888671875, "learning_rate": 7.627780720338984e-06, "loss": 1.2277, "mean_token_accuracy": 0.6998973712325096, "num_tokens": 14409495.0, "step": 17916 }, { "epoch": 4.745233050847458, "grad_norm": 2.286315679550171, "learning_rate": 7.627515889830509e-06, "loss": 1.4651, "mean_token_accuracy": 0.688974104821682, "num_tokens": 14411020.0, "step": 17918 }, { "epoch": 4.745762711864407, "grad_norm": 1.6352204084396362, "learning_rate": 7.627251059322035e-06, "loss": 1.5555, "mean_token_accuracy": 0.6581882163882256, "num_tokens": 14412779.0, "step": 17920 }, { "epoch": 4.746292372881356, "grad_norm": 1.9948359727859497, "learning_rate": 7.6269862288135595e-06, "loss": 1.3706, "mean_token_accuracy": 0.691791832447052, "num_tokens": 14414248.0, "step": 17922 }, { "epoch": 4.746822033898305, "grad_norm": 2.024240016937256, "learning_rate": 7.626721398305086e-06, "loss": 1.422, "mean_token_accuracy": 0.6510392725467682, "num_tokens": 14416134.0, "step": 17924 }, { "epoch": 4.747351694915254, "grad_norm": 1.9736359119415283, "learning_rate": 7.626456567796611e-06, "loss": 0.977, "mean_token_accuracy": 0.7484864667057991, "num_tokens": 14417820.0, "step": 17926 }, { "epoch": 4.747881355932203, "grad_norm": 1.7565417289733887, "learning_rate": 7.626191737288137e-06, "loss": 1.1684, "mean_token_accuracy": 0.7210837304592133, "num_tokens": 14419351.0, "step": 17928 }, { "epoch": 4.748411016949152, "grad_norm": 1.860085368156433, "learning_rate": 7.625926906779662e-06, "loss": 1.2139, "mean_token_accuracy": 0.7045348137617111, "num_tokens": 14421077.0, "step": 17930 }, { "epoch": 4.748940677966102, "grad_norm": 2.1217565536499023, "learning_rate": 7.6256620762711875e-06, "loss": 0.8784, "mean_token_accuracy": 0.7761487364768982, "num_tokens": 14422667.0, "step": 17932 }, { "epoch": 4.749470338983051, "grad_norm": 1.9163099527359009, "learning_rate": 7.6253972457627124e-06, "loss": 1.5131, "mean_token_accuracy": 0.656744722276926, "num_tokens": 14424761.0, "step": 17934 }, { "epoch": 4.75, "grad_norm": 1.7930877208709717, "learning_rate": 7.625132415254238e-06, "loss": 1.2566, "mean_token_accuracy": 0.7084471061825752, "num_tokens": 14426330.0, "step": 17936 }, { "epoch": 4.750529661016949, "grad_norm": 1.784033179283142, "learning_rate": 7.624867584745763e-06, "loss": 0.8736, "mean_token_accuracy": 0.8191442787647247, "num_tokens": 14427817.0, "step": 17938 }, { "epoch": 4.751059322033898, "grad_norm": 2.895723581314087, "learning_rate": 7.624602754237289e-06, "loss": 1.276, "mean_token_accuracy": 0.7078101933002472, "num_tokens": 14429256.0, "step": 17940 }, { "epoch": 4.751588983050848, "grad_norm": 1.8576802015304565, "learning_rate": 7.624337923728814e-06, "loss": 1.1768, "mean_token_accuracy": 0.6996043920516968, "num_tokens": 14431026.0, "step": 17942 }, { "epoch": 4.752118644067797, "grad_norm": 2.2754857540130615, "learning_rate": 7.62407309322034e-06, "loss": 1.4683, "mean_token_accuracy": 0.6377832591533661, "num_tokens": 14433453.0, "step": 17944 }, { "epoch": 4.752648305084746, "grad_norm": 1.8301072120666504, "learning_rate": 7.6238082627118646e-06, "loss": 1.2632, "mean_token_accuracy": 0.70785803347826, "num_tokens": 14435196.0, "step": 17946 }, { "epoch": 4.753177966101695, "grad_norm": 2.349555492401123, "learning_rate": 7.623543432203391e-06, "loss": 1.3521, "mean_token_accuracy": 0.7097724452614784, "num_tokens": 14436923.0, "step": 17948 }, { "epoch": 4.753707627118644, "grad_norm": 1.704375147819519, "learning_rate": 7.623278601694915e-06, "loss": 0.8198, "mean_token_accuracy": 0.7904037460684776, "num_tokens": 14438350.0, "step": 17950 }, { "epoch": 4.754237288135593, "grad_norm": 1.7200132608413696, "learning_rate": 7.623013771186442e-06, "loss": 1.2549, "mean_token_accuracy": 0.7095483615994453, "num_tokens": 14440053.0, "step": 17952 }, { "epoch": 4.754766949152542, "grad_norm": 2.1136677265167236, "learning_rate": 7.622748940677967e-06, "loss": 1.6176, "mean_token_accuracy": 0.6639626733958721, "num_tokens": 14441692.0, "step": 17954 }, { "epoch": 4.755296610169491, "grad_norm": 2.3229105472564697, "learning_rate": 7.622484110169493e-06, "loss": 1.3154, "mean_token_accuracy": 0.6859867498278618, "num_tokens": 14443155.0, "step": 17956 }, { "epoch": 4.75582627118644, "grad_norm": 1.73702871799469, "learning_rate": 7.6222192796610175e-06, "loss": 0.9924, "mean_token_accuracy": 0.7452649474143982, "num_tokens": 14444748.0, "step": 17958 }, { "epoch": 4.75635593220339, "grad_norm": 2.2455766201019287, "learning_rate": 7.621954449152543e-06, "loss": 1.3636, "mean_token_accuracy": 0.6709157079458237, "num_tokens": 14446579.0, "step": 17960 }, { "epoch": 4.756885593220339, "grad_norm": 1.6597152948379517, "learning_rate": 7.621689618644068e-06, "loss": 1.1308, "mean_token_accuracy": 0.7218335643410683, "num_tokens": 14448227.0, "step": 17962 }, { "epoch": 4.757415254237288, "grad_norm": 2.191753625869751, "learning_rate": 7.621424788135594e-06, "loss": 1.1006, "mean_token_accuracy": 0.7303263545036316, "num_tokens": 14449913.0, "step": 17964 }, { "epoch": 4.757944915254237, "grad_norm": 1.8081953525543213, "learning_rate": 7.621159957627119e-06, "loss": 1.0161, "mean_token_accuracy": 0.7362566441297531, "num_tokens": 14451424.0, "step": 17966 }, { "epoch": 4.758474576271187, "grad_norm": 2.0544021129608154, "learning_rate": 7.620895127118645e-06, "loss": 1.4669, "mean_token_accuracy": 0.6797301843762398, "num_tokens": 14453042.0, "step": 17968 }, { "epoch": 4.759004237288136, "grad_norm": 2.63208270072937, "learning_rate": 7.62063029661017e-06, "loss": 1.5792, "mean_token_accuracy": 0.6685142070055008, "num_tokens": 14454678.0, "step": 17970 }, { "epoch": 4.759533898305085, "grad_norm": 2.0868687629699707, "learning_rate": 7.620365466101695e-06, "loss": 1.4634, "mean_token_accuracy": 0.6851171776652336, "num_tokens": 14456255.0, "step": 17972 }, { "epoch": 4.760063559322034, "grad_norm": 2.379591226577759, "learning_rate": 7.62010063559322e-06, "loss": 1.8152, "mean_token_accuracy": 0.6275557540357113, "num_tokens": 14457753.0, "step": 17974 }, { "epoch": 4.760593220338983, "grad_norm": 2.0371103286743164, "learning_rate": 7.619835805084746e-06, "loss": 1.3936, "mean_token_accuracy": 0.687621459364891, "num_tokens": 14459292.0, "step": 17976 }, { "epoch": 4.7611228813559325, "grad_norm": 1.6342864036560059, "learning_rate": 7.619570974576271e-06, "loss": 0.89, "mean_token_accuracy": 0.7544717639684677, "num_tokens": 14461458.0, "step": 17978 }, { "epoch": 4.7616525423728815, "grad_norm": 2.1032323837280273, "learning_rate": 7.619306144067798e-06, "loss": 1.3959, "mean_token_accuracy": 0.6743210256099701, "num_tokens": 14463188.0, "step": 17980 }, { "epoch": 4.7621822033898304, "grad_norm": 1.8509278297424316, "learning_rate": 7.619041313559323e-06, "loss": 1.215, "mean_token_accuracy": 0.7050511464476585, "num_tokens": 14464798.0, "step": 17982 }, { "epoch": 4.762711864406779, "grad_norm": 2.1373131275177, "learning_rate": 7.618776483050848e-06, "loss": 1.0579, "mean_token_accuracy": 0.7572291269898415, "num_tokens": 14466325.0, "step": 17984 }, { "epoch": 4.763241525423728, "grad_norm": 1.8784252405166626, "learning_rate": 7.618511652542373e-06, "loss": 1.2497, "mean_token_accuracy": 0.7002147063612938, "num_tokens": 14467916.0, "step": 17986 }, { "epoch": 4.763771186440678, "grad_norm": 1.7353216409683228, "learning_rate": 7.618246822033899e-06, "loss": 1.0269, "mean_token_accuracy": 0.7447110041975975, "num_tokens": 14469460.0, "step": 17988 }, { "epoch": 4.764300847457627, "grad_norm": 2.0601558685302734, "learning_rate": 7.617981991525424e-06, "loss": 1.4638, "mean_token_accuracy": 0.6680729240179062, "num_tokens": 14471194.0, "step": 17990 }, { "epoch": 4.764830508474576, "grad_norm": 1.7088794708251953, "learning_rate": 7.61771716101695e-06, "loss": 0.9666, "mean_token_accuracy": 0.7491686046123505, "num_tokens": 14472822.0, "step": 17992 }, { "epoch": 4.765360169491525, "grad_norm": 1.5960055589675903, "learning_rate": 7.617452330508475e-06, "loss": 1.0467, "mean_token_accuracy": 0.7477050721645355, "num_tokens": 14474578.0, "step": 17994 }, { "epoch": 4.765889830508475, "grad_norm": 2.0397584438323975, "learning_rate": 7.6171875000000005e-06, "loss": 0.9351, "mean_token_accuracy": 0.767380028963089, "num_tokens": 14476122.0, "step": 17996 }, { "epoch": 4.766419491525424, "grad_norm": 1.9070935249328613, "learning_rate": 7.616922669491526e-06, "loss": 1.1935, "mean_token_accuracy": 0.7014293894171715, "num_tokens": 14477742.0, "step": 17998 }, { "epoch": 4.766949152542373, "grad_norm": 1.8211811780929565, "learning_rate": 7.616657838983051e-06, "loss": 1.2631, "step": 18000 }, { "epoch": 4.766949152542373, "eval_loss": 1.3111602067947388, "eval_mean_token_accuracy": 0.7012268776049861, "eval_num_tokens": 14479784.0, "eval_runtime": 48.2713, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 18000 }, { "epoch": 4.767478813559322, "grad_norm": 2.056286096572876, "learning_rate": 7.616393008474578e-06, "loss": 1.0228, "mean_token_accuracy": 0.7394141592085361, "num_tokens": 14481559.0, "step": 18002 }, { "epoch": 4.768008474576272, "grad_norm": 1.6621599197387695, "learning_rate": 7.616128177966102e-06, "loss": 1.1679, "mean_token_accuracy": 0.729676865041256, "num_tokens": 14483375.0, "step": 18004 }, { "epoch": 4.768538135593221, "grad_norm": 2.18562912940979, "learning_rate": 7.6158633474576285e-06, "loss": 1.5224, "mean_token_accuracy": 0.6675764918327332, "num_tokens": 14484828.0, "step": 18006 }, { "epoch": 4.7690677966101696, "grad_norm": 1.9605846405029297, "learning_rate": 7.6155985169491535e-06, "loss": 1.2882, "mean_token_accuracy": 0.7146449089050293, "num_tokens": 14486430.0, "step": 18008 }, { "epoch": 4.7695974576271185, "grad_norm": 1.7997053861618042, "learning_rate": 7.615333686440679e-06, "loss": 0.9571, "mean_token_accuracy": 0.7573400884866714, "num_tokens": 14488273.0, "step": 18010 }, { "epoch": 4.7701271186440675, "grad_norm": 2.0144894123077393, "learning_rate": 7.615068855932204e-06, "loss": 1.4809, "mean_token_accuracy": 0.6730947494506836, "num_tokens": 14489886.0, "step": 18012 }, { "epoch": 4.770656779661017, "grad_norm": 2.166152000427246, "learning_rate": 7.61480402542373e-06, "loss": 1.3427, "mean_token_accuracy": 0.7106931991875172, "num_tokens": 14491184.0, "step": 18014 }, { "epoch": 4.771186440677966, "grad_norm": 1.914203405380249, "learning_rate": 7.614539194915255e-06, "loss": 0.9884, "mean_token_accuracy": 0.7594271451234818, "num_tokens": 14492761.0, "step": 18016 }, { "epoch": 4.771716101694915, "grad_norm": 2.2143616676330566, "learning_rate": 7.614274364406781e-06, "loss": 0.9231, "mean_token_accuracy": 0.7373817339539528, "num_tokens": 14494344.0, "step": 18018 }, { "epoch": 4.772245762711864, "grad_norm": 1.610270380973816, "learning_rate": 7.614009533898306e-06, "loss": 1.2955, "mean_token_accuracy": 0.694878600537777, "num_tokens": 14496079.0, "step": 18020 }, { "epoch": 4.772775423728813, "grad_norm": 2.006239891052246, "learning_rate": 7.613744703389831e-06, "loss": 1.0479, "mean_token_accuracy": 0.7434488981962204, "num_tokens": 14497636.0, "step": 18022 }, { "epoch": 4.773305084745763, "grad_norm": 2.1274237632751465, "learning_rate": 7.613479872881356e-06, "loss": 1.0367, "mean_token_accuracy": 0.7681155204772949, "num_tokens": 14498987.0, "step": 18024 }, { "epoch": 4.773834745762712, "grad_norm": 2.5036203861236572, "learning_rate": 7.613215042372882e-06, "loss": 0.9444, "mean_token_accuracy": 0.7669054865837097, "num_tokens": 14500526.0, "step": 18026 }, { "epoch": 4.774364406779661, "grad_norm": 2.0301015377044678, "learning_rate": 7.612950211864407e-06, "loss": 1.5856, "mean_token_accuracy": 0.6639882922172546, "num_tokens": 14502189.0, "step": 18028 }, { "epoch": 4.77489406779661, "grad_norm": 1.9816603660583496, "learning_rate": 7.612685381355933e-06, "loss": 1.7266, "mean_token_accuracy": 0.6096576116979122, "num_tokens": 14504031.0, "step": 18030 }, { "epoch": 4.77542372881356, "grad_norm": 2.0476865768432617, "learning_rate": 7.612420550847458e-06, "loss": 1.344, "mean_token_accuracy": 0.6975993812084198, "num_tokens": 14505577.0, "step": 18032 }, { "epoch": 4.775953389830509, "grad_norm": 1.8643152713775635, "learning_rate": 7.612155720338984e-06, "loss": 1.1042, "mean_token_accuracy": 0.7231585159897804, "num_tokens": 14507372.0, "step": 18034 }, { "epoch": 4.776483050847458, "grad_norm": 2.458705425262451, "learning_rate": 7.611890889830509e-06, "loss": 1.3286, "mean_token_accuracy": 0.7051686719059944, "num_tokens": 14508950.0, "step": 18036 }, { "epoch": 4.777012711864407, "grad_norm": 2.411088466644287, "learning_rate": 7.611626059322035e-06, "loss": 1.5226, "mean_token_accuracy": 0.6518940553069115, "num_tokens": 14510561.0, "step": 18038 }, { "epoch": 4.777542372881356, "grad_norm": 1.9204498529434204, "learning_rate": 7.61136122881356e-06, "loss": 0.7329, "mean_token_accuracy": 0.8081357181072235, "num_tokens": 14512266.0, "step": 18040 }, { "epoch": 4.778072033898305, "grad_norm": 2.1940319538116455, "learning_rate": 7.611096398305086e-06, "loss": 1.3972, "mean_token_accuracy": 0.6994920000433922, "num_tokens": 14513857.0, "step": 18042 }, { "epoch": 4.778601694915254, "grad_norm": 1.8291425704956055, "learning_rate": 7.610831567796611e-06, "loss": 1.3935, "mean_token_accuracy": 0.6983548700809479, "num_tokens": 14515674.0, "step": 18044 }, { "epoch": 4.779131355932203, "grad_norm": 2.1283180713653564, "learning_rate": 7.6105667372881364e-06, "loss": 1.1576, "mean_token_accuracy": 0.7369190081954002, "num_tokens": 14517273.0, "step": 18046 }, { "epoch": 4.779661016949152, "grad_norm": 1.9611624479293823, "learning_rate": 7.610301906779661e-06, "loss": 0.8055, "mean_token_accuracy": 0.8061487451195717, "num_tokens": 14518469.0, "step": 18048 }, { "epoch": 4.780190677966102, "grad_norm": 1.9308112859725952, "learning_rate": 7.610037076271187e-06, "loss": 1.485, "mean_token_accuracy": 0.6806196048855782, "num_tokens": 14519950.0, "step": 18050 }, { "epoch": 4.780720338983051, "grad_norm": 2.037790536880493, "learning_rate": 7.609772245762712e-06, "loss": 1.5028, "mean_token_accuracy": 0.6586366295814514, "num_tokens": 14521603.0, "step": 18052 }, { "epoch": 4.78125, "grad_norm": 2.340141773223877, "learning_rate": 7.609507415254238e-06, "loss": 1.6091, "mean_token_accuracy": 0.6266976669430733, "num_tokens": 14523382.0, "step": 18054 }, { "epoch": 4.781779661016949, "grad_norm": 1.8815966844558716, "learning_rate": 7.609242584745763e-06, "loss": 1.4525, "mean_token_accuracy": 0.6982754841446877, "num_tokens": 14524908.0, "step": 18056 }, { "epoch": 4.782309322033898, "grad_norm": 2.3484418392181396, "learning_rate": 7.6089777542372885e-06, "loss": 0.8714, "mean_token_accuracy": 0.7670067772269249, "num_tokens": 14526173.0, "step": 18058 }, { "epoch": 4.782838983050848, "grad_norm": 1.6083793640136719, "learning_rate": 7.6087129237288135e-06, "loss": 1.3161, "mean_token_accuracy": 0.6850389018654823, "num_tokens": 14528031.0, "step": 18060 }, { "epoch": 4.783368644067797, "grad_norm": 2.0213422775268555, "learning_rate": 7.60844809322034e-06, "loss": 1.5438, "mean_token_accuracy": 0.6454926058650017, "num_tokens": 14529761.0, "step": 18062 }, { "epoch": 4.783898305084746, "grad_norm": 1.999605655670166, "learning_rate": 7.608183262711864e-06, "loss": 1.5337, "mean_token_accuracy": 0.672205463051796, "num_tokens": 14531315.0, "step": 18064 }, { "epoch": 4.784427966101695, "grad_norm": 1.9241232872009277, "learning_rate": 7.607918432203391e-06, "loss": 1.0957, "mean_token_accuracy": 0.732657864689827, "num_tokens": 14532914.0, "step": 18066 }, { "epoch": 4.784957627118644, "grad_norm": 1.8225654363632202, "learning_rate": 7.607653601694916e-06, "loss": 1.3089, "mean_token_accuracy": 0.7162815555930138, "num_tokens": 14535011.0, "step": 18068 }, { "epoch": 4.785487288135593, "grad_norm": 1.6811496019363403, "learning_rate": 7.6073887711864415e-06, "loss": 1.2633, "mean_token_accuracy": 0.7081908285617828, "num_tokens": 14536588.0, "step": 18070 }, { "epoch": 4.786016949152542, "grad_norm": 1.3620624542236328, "learning_rate": 7.6071239406779664e-06, "loss": 1.221, "mean_token_accuracy": 0.688710480928421, "num_tokens": 14539270.0, "step": 18072 }, { "epoch": 4.786546610169491, "grad_norm": 1.931381344795227, "learning_rate": 7.606859110169492e-06, "loss": 1.2317, "mean_token_accuracy": 0.7261580899357796, "num_tokens": 14540605.0, "step": 18074 }, { "epoch": 4.78707627118644, "grad_norm": 2.2579493522644043, "learning_rate": 7.606594279661017e-06, "loss": 1.3835, "mean_token_accuracy": 0.6917965263128281, "num_tokens": 14542026.0, "step": 18076 }, { "epoch": 4.78760593220339, "grad_norm": 2.3465349674224854, "learning_rate": 7.606329449152543e-06, "loss": 1.2454, "mean_token_accuracy": 0.7345832958817482, "num_tokens": 14543307.0, "step": 18078 }, { "epoch": 4.788135593220339, "grad_norm": 1.8024386167526245, "learning_rate": 7.606064618644068e-06, "loss": 1.2733, "mean_token_accuracy": 0.7116650715470314, "num_tokens": 14544916.0, "step": 18080 }, { "epoch": 4.788665254237288, "grad_norm": 2.145432472229004, "learning_rate": 7.605799788135594e-06, "loss": 1.1708, "mean_token_accuracy": 0.7154199853539467, "num_tokens": 14546620.0, "step": 18082 }, { "epoch": 4.789194915254237, "grad_norm": 2.4085633754730225, "learning_rate": 7.605534957627119e-06, "loss": 1.5326, "mean_token_accuracy": 0.6769894137978554, "num_tokens": 14548194.0, "step": 18084 }, { "epoch": 4.789724576271187, "grad_norm": 1.968885898590088, "learning_rate": 7.605270127118644e-06, "loss": 1.2427, "mean_token_accuracy": 0.7151201292872429, "num_tokens": 14549875.0, "step": 18086 }, { "epoch": 4.790254237288136, "grad_norm": 1.8059300184249878, "learning_rate": 7.605005296610171e-06, "loss": 1.2998, "mean_token_accuracy": 0.6926003322005272, "num_tokens": 14551799.0, "step": 18088 }, { "epoch": 4.790783898305085, "grad_norm": 2.02714467048645, "learning_rate": 7.604740466101696e-06, "loss": 1.1745, "mean_token_accuracy": 0.7339491099119186, "num_tokens": 14553246.0, "step": 18090 }, { "epoch": 4.791313559322034, "grad_norm": 1.899324655532837, "learning_rate": 7.604475635593222e-06, "loss": 1.1399, "mean_token_accuracy": 0.7126662582159042, "num_tokens": 14554964.0, "step": 18092 }, { "epoch": 4.791843220338983, "grad_norm": 1.7565685510635376, "learning_rate": 7.604210805084747e-06, "loss": 1.2639, "mean_token_accuracy": 0.7162145599722862, "num_tokens": 14557164.0, "step": 18094 }, { "epoch": 4.7923728813559325, "grad_norm": 2.049791097640991, "learning_rate": 7.603945974576272e-06, "loss": 1.3661, "mean_token_accuracy": 0.6902588680386543, "num_tokens": 14558760.0, "step": 18096 }, { "epoch": 4.7929025423728815, "grad_norm": 1.8316097259521484, "learning_rate": 7.603681144067797e-06, "loss": 0.9445, "mean_token_accuracy": 0.7574376687407494, "num_tokens": 14560232.0, "step": 18098 }, { "epoch": 4.7934322033898304, "grad_norm": 1.9090393781661987, "learning_rate": 7.603416313559323e-06, "loss": 1.5121, "mean_token_accuracy": 0.6803287379443645, "num_tokens": 14561635.0, "step": 18100 }, { "epoch": 4.793961864406779, "grad_norm": 2.400268793106079, "learning_rate": 7.603151483050848e-06, "loss": 0.8288, "mean_token_accuracy": 0.7865170836448669, "num_tokens": 14563066.0, "step": 18102 }, { "epoch": 4.794491525423728, "grad_norm": 1.9555705785751343, "learning_rate": 7.602886652542374e-06, "loss": 1.1348, "mean_token_accuracy": 0.7216633185744286, "num_tokens": 14564813.0, "step": 18104 }, { "epoch": 4.795021186440678, "grad_norm": 1.9691355228424072, "learning_rate": 7.602621822033899e-06, "loss": 0.8763, "mean_token_accuracy": 0.7800363898277283, "num_tokens": 14566264.0, "step": 18106 }, { "epoch": 4.795550847457627, "grad_norm": 2.4634673595428467, "learning_rate": 7.6023569915254245e-06, "loss": 1.7675, "mean_token_accuracy": 0.6120478995144367, "num_tokens": 14568476.0, "step": 18108 }, { "epoch": 4.796080508474576, "grad_norm": 1.7969905138015747, "learning_rate": 7.602092161016949e-06, "loss": 1.7471, "mean_token_accuracy": 0.6446284912526608, "num_tokens": 14570327.0, "step": 18110 }, { "epoch": 4.796610169491525, "grad_norm": 1.9950006008148193, "learning_rate": 7.601827330508475e-06, "loss": 1.6211, "mean_token_accuracy": 0.6526079960167408, "num_tokens": 14571993.0, "step": 18112 }, { "epoch": 4.797139830508475, "grad_norm": 1.9157403707504272, "learning_rate": 7.6015625e-06, "loss": 1.531, "mean_token_accuracy": 0.648979052901268, "num_tokens": 14573645.0, "step": 18114 }, { "epoch": 4.797669491525424, "grad_norm": 1.9039498567581177, "learning_rate": 7.601297669491527e-06, "loss": 1.1636, "mean_token_accuracy": 0.7285817638039589, "num_tokens": 14575238.0, "step": 18116 }, { "epoch": 4.798199152542373, "grad_norm": 1.945951223373413, "learning_rate": 7.601032838983051e-06, "loss": 1.412, "mean_token_accuracy": 0.6716464534401894, "num_tokens": 14576979.0, "step": 18118 }, { "epoch": 4.798728813559322, "grad_norm": 2.1674790382385254, "learning_rate": 7.6007680084745774e-06, "loss": 1.3555, "mean_token_accuracy": 0.6711901277303696, "num_tokens": 14579106.0, "step": 18120 }, { "epoch": 4.799258474576272, "grad_norm": 2.1925342082977295, "learning_rate": 7.600503177966102e-06, "loss": 1.1883, "mean_token_accuracy": 0.7043908461928368, "num_tokens": 14580559.0, "step": 18122 }, { "epoch": 4.799788135593221, "grad_norm": 1.9380462169647217, "learning_rate": 7.600238347457628e-06, "loss": 0.957, "mean_token_accuracy": 0.7670851573348045, "num_tokens": 14581959.0, "step": 18124 }, { "epoch": 4.8003177966101696, "grad_norm": 1.9338918924331665, "learning_rate": 7.599973516949153e-06, "loss": 1.5239, "mean_token_accuracy": 0.6714916452765465, "num_tokens": 14583461.0, "step": 18126 }, { "epoch": 4.8008474576271185, "grad_norm": 2.261138916015625, "learning_rate": 7.599708686440679e-06, "loss": 1.9576, "mean_token_accuracy": 0.5935313925147057, "num_tokens": 14585183.0, "step": 18128 }, { "epoch": 4.8013771186440675, "grad_norm": 2.244656801223755, "learning_rate": 7.599443855932204e-06, "loss": 1.2701, "mean_token_accuracy": 0.695811852812767, "num_tokens": 14586576.0, "step": 18130 }, { "epoch": 4.801906779661017, "grad_norm": 1.9899460077285767, "learning_rate": 7.5991790254237296e-06, "loss": 1.0324, "mean_token_accuracy": 0.7412006705999374, "num_tokens": 14588329.0, "step": 18132 }, { "epoch": 4.802436440677966, "grad_norm": 2.217365026473999, "learning_rate": 7.5989141949152545e-06, "loss": 1.5363, "mean_token_accuracy": 0.6730644181370735, "num_tokens": 14589995.0, "step": 18134 }, { "epoch": 4.802966101694915, "grad_norm": 2.6219170093536377, "learning_rate": 7.59864936440678e-06, "loss": 1.319, "mean_token_accuracy": 0.7047749161720276, "num_tokens": 14591431.0, "step": 18136 }, { "epoch": 4.803495762711864, "grad_norm": 1.7217755317687988, "learning_rate": 7.598384533898305e-06, "loss": 0.9949, "mean_token_accuracy": 0.7512127161026001, "num_tokens": 14593210.0, "step": 18138 }, { "epoch": 4.804025423728813, "grad_norm": 2.398684024810791, "learning_rate": 7.598119703389831e-06, "loss": 1.2434, "mean_token_accuracy": 0.7010645791888237, "num_tokens": 14594626.0, "step": 18140 }, { "epoch": 4.804555084745763, "grad_norm": 1.7739131450653076, "learning_rate": 7.597854872881356e-06, "loss": 1.3763, "mean_token_accuracy": 0.6925265341997147, "num_tokens": 14596291.0, "step": 18142 }, { "epoch": 4.805084745762712, "grad_norm": 1.9016013145446777, "learning_rate": 7.5975900423728825e-06, "loss": 1.2107, "mean_token_accuracy": 0.7341361343860626, "num_tokens": 14597633.0, "step": 18144 }, { "epoch": 4.805614406779661, "grad_norm": 1.8769938945770264, "learning_rate": 7.597325211864407e-06, "loss": 0.951, "mean_token_accuracy": 0.7521531730890274, "num_tokens": 14599233.0, "step": 18146 }, { "epoch": 4.80614406779661, "grad_norm": 2.3476181030273438, "learning_rate": 7.597060381355933e-06, "loss": 1.8344, "mean_token_accuracy": 0.6300967559218407, "num_tokens": 14601136.0, "step": 18148 }, { "epoch": 4.80667372881356, "grad_norm": 2.749934434890747, "learning_rate": 7.596795550847458e-06, "loss": 1.5176, "mean_token_accuracy": 0.6752918288111687, "num_tokens": 14602518.0, "step": 18150 }, { "epoch": 4.807203389830509, "grad_norm": 1.7331912517547607, "learning_rate": 7.596530720338984e-06, "loss": 1.0629, "mean_token_accuracy": 0.7381481751799583, "num_tokens": 14604150.0, "step": 18152 }, { "epoch": 4.807733050847458, "grad_norm": 2.440777540206909, "learning_rate": 7.596265889830509e-06, "loss": 0.9897, "mean_token_accuracy": 0.7692070677876472, "num_tokens": 14605568.0, "step": 18154 }, { "epoch": 4.808262711864407, "grad_norm": 1.7764345407485962, "learning_rate": 7.596001059322035e-06, "loss": 1.0611, "mean_token_accuracy": 0.7273000478744507, "num_tokens": 14607276.0, "step": 18156 }, { "epoch": 4.808792372881356, "grad_norm": 2.0830087661743164, "learning_rate": 7.5957362288135596e-06, "loss": 1.1306, "mean_token_accuracy": 0.7365827336907387, "num_tokens": 14608733.0, "step": 18158 }, { "epoch": 4.809322033898305, "grad_norm": 1.693672776222229, "learning_rate": 7.595471398305085e-06, "loss": 1.4795, "mean_token_accuracy": 0.6645064502954483, "num_tokens": 14610384.0, "step": 18160 }, { "epoch": 4.809851694915254, "grad_norm": 1.8368982076644897, "learning_rate": 7.59520656779661e-06, "loss": 0.8154, "mean_token_accuracy": 0.7815229520201683, "num_tokens": 14611820.0, "step": 18162 }, { "epoch": 4.810381355932203, "grad_norm": 2.053617000579834, "learning_rate": 7.594941737288136e-06, "loss": 1.4194, "mean_token_accuracy": 0.6931599378585815, "num_tokens": 14613186.0, "step": 18164 }, { "epoch": 4.810911016949152, "grad_norm": 2.0914695262908936, "learning_rate": 7.594676906779662e-06, "loss": 1.1718, "mean_token_accuracy": 0.7307462394237518, "num_tokens": 14614805.0, "step": 18166 }, { "epoch": 4.811440677966102, "grad_norm": 1.849969744682312, "learning_rate": 7.594412076271187e-06, "loss": 0.9762, "mean_token_accuracy": 0.7598583325743675, "num_tokens": 14616414.0, "step": 18168 }, { "epoch": 4.811970338983051, "grad_norm": 2.3381807804107666, "learning_rate": 7.594147245762713e-06, "loss": 1.3912, "mean_token_accuracy": 0.6871628984808922, "num_tokens": 14617950.0, "step": 18170 }, { "epoch": 4.8125, "grad_norm": 1.9223811626434326, "learning_rate": 7.5938824152542375e-06, "loss": 1.1047, "mean_token_accuracy": 0.7256883531808853, "num_tokens": 14619780.0, "step": 18172 }, { "epoch": 4.813029661016949, "grad_norm": 1.9305427074432373, "learning_rate": 7.593617584745764e-06, "loss": 1.0821, "mean_token_accuracy": 0.7335260957479477, "num_tokens": 14621534.0, "step": 18174 }, { "epoch": 4.813559322033898, "grad_norm": 1.9227250814437866, "learning_rate": 7.593352754237289e-06, "loss": 1.3859, "mean_token_accuracy": 0.6681314408779144, "num_tokens": 14623128.0, "step": 18176 }, { "epoch": 4.814088983050848, "grad_norm": 2.149502754211426, "learning_rate": 7.593087923728815e-06, "loss": 1.5863, "mean_token_accuracy": 0.6553583666682243, "num_tokens": 14624992.0, "step": 18178 }, { "epoch": 4.814618644067797, "grad_norm": 1.7046473026275635, "learning_rate": 7.59282309322034e-06, "loss": 0.8588, "mean_token_accuracy": 0.7692881971597672, "num_tokens": 14626554.0, "step": 18180 }, { "epoch": 4.815148305084746, "grad_norm": 1.9975183010101318, "learning_rate": 7.5925582627118655e-06, "loss": 0.7564, "mean_token_accuracy": 0.812075212597847, "num_tokens": 14628088.0, "step": 18182 }, { "epoch": 4.815677966101695, "grad_norm": 1.702404260635376, "learning_rate": 7.5922934322033904e-06, "loss": 1.2374, "mean_token_accuracy": 0.712442547082901, "num_tokens": 14629780.0, "step": 18184 }, { "epoch": 4.816207627118644, "grad_norm": 1.498796820640564, "learning_rate": 7.592028601694916e-06, "loss": 0.8395, "mean_token_accuracy": 0.7750298380851746, "num_tokens": 14631526.0, "step": 18186 }, { "epoch": 4.816737288135593, "grad_norm": 1.6390008926391602, "learning_rate": 7.591763771186441e-06, "loss": 1.0706, "mean_token_accuracy": 0.7586249262094498, "num_tokens": 14633038.0, "step": 18188 }, { "epoch": 4.817266949152542, "grad_norm": 1.8684136867523193, "learning_rate": 7.591498940677967e-06, "loss": 1.6737, "mean_token_accuracy": 0.6236805319786072, "num_tokens": 14634971.0, "step": 18190 }, { "epoch": 4.817796610169491, "grad_norm": 2.1316025257110596, "learning_rate": 7.591234110169492e-06, "loss": 0.9831, "mean_token_accuracy": 0.7727326601743698, "num_tokens": 14636493.0, "step": 18192 }, { "epoch": 4.81832627118644, "grad_norm": 1.886620044708252, "learning_rate": 7.590969279661018e-06, "loss": 1.0057, "mean_token_accuracy": 0.7646086663007736, "num_tokens": 14638028.0, "step": 18194 }, { "epoch": 4.81885593220339, "grad_norm": 1.993476390838623, "learning_rate": 7.5907044491525425e-06, "loss": 1.2328, "mean_token_accuracy": 0.7296050563454628, "num_tokens": 14639560.0, "step": 18196 }, { "epoch": 4.819385593220339, "grad_norm": 1.7960071563720703, "learning_rate": 7.590439618644069e-06, "loss": 1.2228, "mean_token_accuracy": 0.7202013283967972, "num_tokens": 14641124.0, "step": 18198 }, { "epoch": 4.819915254237288, "grad_norm": 2.172757148742676, "learning_rate": 7.590174788135593e-06, "loss": 1.4413, "mean_token_accuracy": 0.6876311302185059, "num_tokens": 14642679.0, "step": 18200 }, { "epoch": 4.820444915254237, "grad_norm": 1.9354453086853027, "learning_rate": 7.58990995762712e-06, "loss": 0.9523, "mean_token_accuracy": 0.771923303604126, "num_tokens": 14643959.0, "step": 18202 }, { "epoch": 4.820974576271187, "grad_norm": 1.9253714084625244, "learning_rate": 7.589645127118645e-06, "loss": 1.1117, "mean_token_accuracy": 0.7227670252323151, "num_tokens": 14645609.0, "step": 18204 }, { "epoch": 4.821504237288136, "grad_norm": 1.917169451713562, "learning_rate": 7.5893802966101706e-06, "loss": 1.23, "mean_token_accuracy": 0.7083358205854893, "num_tokens": 14647269.0, "step": 18206 }, { "epoch": 4.822033898305085, "grad_norm": 1.6339352130889893, "learning_rate": 7.5891154661016955e-06, "loss": 1.2917, "mean_token_accuracy": 0.7060593217611313, "num_tokens": 14649227.0, "step": 18208 }, { "epoch": 4.822563559322034, "grad_norm": 1.713729977607727, "learning_rate": 7.588850635593221e-06, "loss": 0.9867, "mean_token_accuracy": 0.7864281237125397, "num_tokens": 14650596.0, "step": 18210 }, { "epoch": 4.823093220338983, "grad_norm": 2.1461548805236816, "learning_rate": 7.588585805084746e-06, "loss": 1.1324, "mean_token_accuracy": 0.7308762669563293, "num_tokens": 14651983.0, "step": 18212 }, { "epoch": 4.8236228813559325, "grad_norm": 2.1807827949523926, "learning_rate": 7.588320974576272e-06, "loss": 1.199, "mean_token_accuracy": 0.7425984665751457, "num_tokens": 14653294.0, "step": 18214 }, { "epoch": 4.8241525423728815, "grad_norm": 1.8443328142166138, "learning_rate": 7.588056144067797e-06, "loss": 0.7381, "mean_token_accuracy": 0.8165941312909126, "num_tokens": 14654857.0, "step": 18216 }, { "epoch": 4.8246822033898304, "grad_norm": 1.6095257997512817, "learning_rate": 7.587791313559323e-06, "loss": 1.3176, "mean_token_accuracy": 0.6941590011119843, "num_tokens": 14656539.0, "step": 18218 }, { "epoch": 4.825211864406779, "grad_norm": 2.2330167293548584, "learning_rate": 7.587526483050848e-06, "loss": 1.2525, "mean_token_accuracy": 0.7183952406048775, "num_tokens": 14658289.0, "step": 18220 }, { "epoch": 4.825741525423728, "grad_norm": 1.4569741487503052, "learning_rate": 7.587261652542373e-06, "loss": 0.8295, "mean_token_accuracy": 0.7813859954476357, "num_tokens": 14659959.0, "step": 18222 }, { "epoch": 4.826271186440678, "grad_norm": 2.0188517570495605, "learning_rate": 7.586996822033898e-06, "loss": 1.1827, "mean_token_accuracy": 0.7191349118947983, "num_tokens": 14661485.0, "step": 18224 }, { "epoch": 4.826800847457627, "grad_norm": 1.776989221572876, "learning_rate": 7.586731991525424e-06, "loss": 0.964, "mean_token_accuracy": 0.7597154974937439, "num_tokens": 14663042.0, "step": 18226 }, { "epoch": 4.827330508474576, "grad_norm": 1.9433047771453857, "learning_rate": 7.586467161016949e-06, "loss": 1.2288, "mean_token_accuracy": 0.7114380970597267, "num_tokens": 14664393.0, "step": 18228 }, { "epoch": 4.827860169491525, "grad_norm": 2.1889402866363525, "learning_rate": 7.586202330508476e-06, "loss": 1.3031, "mean_token_accuracy": 0.7102471739053726, "num_tokens": 14665878.0, "step": 18230 }, { "epoch": 4.828389830508475, "grad_norm": 1.546329140663147, "learning_rate": 7.585937500000001e-06, "loss": 1.2015, "mean_token_accuracy": 0.7275485098361969, "num_tokens": 14668929.0, "step": 18232 }, { "epoch": 4.828919491525424, "grad_norm": 1.9276609420776367, "learning_rate": 7.585672669491526e-06, "loss": 1.3167, "mean_token_accuracy": 0.6943082585930824, "num_tokens": 14670341.0, "step": 18234 }, { "epoch": 4.829449152542373, "grad_norm": 2.078829050064087, "learning_rate": 7.585407838983051e-06, "loss": 1.5003, "mean_token_accuracy": 0.6689930185675621, "num_tokens": 14671980.0, "step": 18236 }, { "epoch": 4.829978813559322, "grad_norm": 2.175401449203491, "learning_rate": 7.585143008474577e-06, "loss": 1.4706, "mean_token_accuracy": 0.6816320568323135, "num_tokens": 14673578.0, "step": 18238 }, { "epoch": 4.830508474576272, "grad_norm": 1.600450038909912, "learning_rate": 7.584878177966102e-06, "loss": 1.3412, "mean_token_accuracy": 0.6898021847009659, "num_tokens": 14675468.0, "step": 18240 }, { "epoch": 4.831038135593221, "grad_norm": 2.237488269805908, "learning_rate": 7.584613347457628e-06, "loss": 1.1051, "mean_token_accuracy": 0.7345109358429909, "num_tokens": 14677078.0, "step": 18242 }, { "epoch": 4.8315677966101696, "grad_norm": 2.0670053958892822, "learning_rate": 7.584348516949153e-06, "loss": 1.1164, "mean_token_accuracy": 0.7389127910137177, "num_tokens": 14678562.0, "step": 18244 }, { "epoch": 4.8320974576271185, "grad_norm": 2.3333451747894287, "learning_rate": 7.5840836864406785e-06, "loss": 1.6318, "mean_token_accuracy": 0.6528953127563, "num_tokens": 14680325.0, "step": 18246 }, { "epoch": 4.8326271186440675, "grad_norm": 1.9576094150543213, "learning_rate": 7.583818855932203e-06, "loss": 1.1769, "mean_token_accuracy": 0.7254653871059418, "num_tokens": 14681916.0, "step": 18248 }, { "epoch": 4.833156779661017, "grad_norm": 2.112858533859253, "learning_rate": 7.583554025423729e-06, "loss": 1.7511, "step": 18250 }, { "epoch": 4.833156779661017, "eval_loss": 1.3094936609268188, "eval_mean_token_accuracy": 0.7020379390035357, "eval_num_tokens": 14683692.0, "eval_runtime": 48.1673, "eval_samples_per_second": 6.394, "eval_steps_per_second": 6.394, "step": 18250 }, { "epoch": 4.833686440677966, "grad_norm": 2.324362277984619, "learning_rate": 7.583289194915256e-06, "loss": 1.1513, "mean_token_accuracy": 0.682102994993329, "num_tokens": 14685218.0, "step": 18252 }, { "epoch": 4.834216101694915, "grad_norm": 1.776600956916809, "learning_rate": 7.58302436440678e-06, "loss": 1.4323, "mean_token_accuracy": 0.6814322546124458, "num_tokens": 14686919.0, "step": 18254 }, { "epoch": 4.834745762711864, "grad_norm": 2.210568428039551, "learning_rate": 7.5827595338983065e-06, "loss": 1.5542, "mean_token_accuracy": 0.6608236208558083, "num_tokens": 14688374.0, "step": 18256 }, { "epoch": 4.835275423728813, "grad_norm": 1.748955249786377, "learning_rate": 7.5824947033898314e-06, "loss": 1.0728, "mean_token_accuracy": 0.7469783797860146, "num_tokens": 14689862.0, "step": 18258 }, { "epoch": 4.835805084745763, "grad_norm": 2.03436541557312, "learning_rate": 7.582229872881357e-06, "loss": 1.5328, "mean_token_accuracy": 0.6587233394384384, "num_tokens": 14691416.0, "step": 18260 }, { "epoch": 4.836334745762712, "grad_norm": 2.0273125171661377, "learning_rate": 7.581965042372882e-06, "loss": 1.1083, "mean_token_accuracy": 0.741014838218689, "num_tokens": 14693009.0, "step": 18262 }, { "epoch": 4.836864406779661, "grad_norm": 1.8788366317749023, "learning_rate": 7.581700211864408e-06, "loss": 1.4939, "mean_token_accuracy": 0.6950267478823662, "num_tokens": 14694535.0, "step": 18264 }, { "epoch": 4.83739406779661, "grad_norm": 1.8101015090942383, "learning_rate": 7.581435381355933e-06, "loss": 1.2955, "mean_token_accuracy": 0.7290013432502747, "num_tokens": 14696114.0, "step": 18266 }, { "epoch": 4.83792372881356, "grad_norm": 2.1994190216064453, "learning_rate": 7.581170550847459e-06, "loss": 1.2795, "mean_token_accuracy": 0.7081175670027733, "num_tokens": 14697401.0, "step": 18268 }, { "epoch": 4.838453389830509, "grad_norm": 1.7293037176132202, "learning_rate": 7.5809057203389836e-06, "loss": 1.438, "mean_token_accuracy": 0.6839058995246887, "num_tokens": 14699078.0, "step": 18270 }, { "epoch": 4.838983050847458, "grad_norm": 2.2759275436401367, "learning_rate": 7.580640889830509e-06, "loss": 1.3105, "mean_token_accuracy": 0.7072156220674515, "num_tokens": 14700616.0, "step": 18272 }, { "epoch": 4.839512711864407, "grad_norm": 1.8818771839141846, "learning_rate": 7.580376059322034e-06, "loss": 1.0611, "mean_token_accuracy": 0.7546878978610039, "num_tokens": 14702094.0, "step": 18274 }, { "epoch": 4.840042372881356, "grad_norm": 2.3964407444000244, "learning_rate": 7.58011122881356e-06, "loss": 1.5748, "mean_token_accuracy": 0.692549891769886, "num_tokens": 14703713.0, "step": 18276 }, { "epoch": 4.840572033898305, "grad_norm": 2.028031826019287, "learning_rate": 7.579846398305085e-06, "loss": 1.4626, "mean_token_accuracy": 0.6695613488554955, "num_tokens": 14705277.0, "step": 18278 }, { "epoch": 4.841101694915254, "grad_norm": 1.8071672916412354, "learning_rate": 7.579581567796611e-06, "loss": 1.1758, "mean_token_accuracy": 0.7153198644518852, "num_tokens": 14706770.0, "step": 18280 }, { "epoch": 4.841631355932203, "grad_norm": 1.881518840789795, "learning_rate": 7.579316737288136e-06, "loss": 1.0688, "mean_token_accuracy": 0.753436915576458, "num_tokens": 14708328.0, "step": 18282 }, { "epoch": 4.842161016949152, "grad_norm": 2.4670488834381104, "learning_rate": 7.579051906779662e-06, "loss": 1.2533, "mean_token_accuracy": 0.7048792913556099, "num_tokens": 14709916.0, "step": 18284 }, { "epoch": 4.842690677966102, "grad_norm": 2.1570112705230713, "learning_rate": 7.578787076271187e-06, "loss": 1.2762, "mean_token_accuracy": 0.7243528366088867, "num_tokens": 14711479.0, "step": 18286 }, { "epoch": 4.843220338983051, "grad_norm": 2.0426597595214844, "learning_rate": 7.578522245762713e-06, "loss": 1.1318, "mean_token_accuracy": 0.7353633865714073, "num_tokens": 14712880.0, "step": 18288 }, { "epoch": 4.84375, "grad_norm": 1.9227955341339111, "learning_rate": 7.578257415254238e-06, "loss": 1.1239, "mean_token_accuracy": 0.7348752617835999, "num_tokens": 14714482.0, "step": 18290 }, { "epoch": 4.844279661016949, "grad_norm": 1.9313501119613647, "learning_rate": 7.577992584745764e-06, "loss": 1.1588, "mean_token_accuracy": 0.745086058974266, "num_tokens": 14716001.0, "step": 18292 }, { "epoch": 4.844809322033898, "grad_norm": 2.242143392562866, "learning_rate": 7.577727754237289e-06, "loss": 1.2827, "mean_token_accuracy": 0.6828100979328156, "num_tokens": 14717591.0, "step": 18294 }, { "epoch": 4.845338983050848, "grad_norm": 1.9804545640945435, "learning_rate": 7.577462923728814e-06, "loss": 1.376, "mean_token_accuracy": 0.678441010415554, "num_tokens": 14719338.0, "step": 18296 }, { "epoch": 4.845868644067797, "grad_norm": 2.219712495803833, "learning_rate": 7.577198093220339e-06, "loss": 1.2287, "mean_token_accuracy": 0.7308406606316566, "num_tokens": 14720731.0, "step": 18298 }, { "epoch": 4.846398305084746, "grad_norm": 1.8167768716812134, "learning_rate": 7.576933262711865e-06, "loss": 1.2047, "mean_token_accuracy": 0.7097875624895096, "num_tokens": 14722019.0, "step": 18300 }, { "epoch": 4.846927966101695, "grad_norm": 1.925342321395874, "learning_rate": 7.57666843220339e-06, "loss": 0.871, "mean_token_accuracy": 0.7864757105708122, "num_tokens": 14723315.0, "step": 18302 }, { "epoch": 4.847457627118644, "grad_norm": 2.10213565826416, "learning_rate": 7.576403601694916e-06, "loss": 1.5501, "mean_token_accuracy": 0.6610097587108612, "num_tokens": 14725919.0, "step": 18304 }, { "epoch": 4.847987288135593, "grad_norm": 1.7675058841705322, "learning_rate": 7.576138771186441e-06, "loss": 1.098, "mean_token_accuracy": 0.7422961890697479, "num_tokens": 14727297.0, "step": 18306 }, { "epoch": 4.848516949152542, "grad_norm": 1.7940984964370728, "learning_rate": 7.5758739406779665e-06, "loss": 1.4386, "mean_token_accuracy": 0.6656352579593658, "num_tokens": 14728792.0, "step": 18308 }, { "epoch": 4.849046610169491, "grad_norm": 2.0086939334869385, "learning_rate": 7.5756091101694915e-06, "loss": 1.583, "mean_token_accuracy": 0.6398263424634933, "num_tokens": 14730319.0, "step": 18310 }, { "epoch": 4.84957627118644, "grad_norm": 1.7472093105316162, "learning_rate": 7.575344279661018e-06, "loss": 1.3296, "mean_token_accuracy": 0.6798365116119385, "num_tokens": 14732160.0, "step": 18312 }, { "epoch": 4.85010593220339, "grad_norm": 1.9626647233963013, "learning_rate": 7.575079449152542e-06, "loss": 1.4841, "mean_token_accuracy": 0.6598305702209473, "num_tokens": 14733631.0, "step": 18314 }, { "epoch": 4.850635593220339, "grad_norm": 1.916367769241333, "learning_rate": 7.574814618644069e-06, "loss": 1.4112, "mean_token_accuracy": 0.7104670628905296, "num_tokens": 14735269.0, "step": 18316 }, { "epoch": 4.851165254237288, "grad_norm": 2.0591847896575928, "learning_rate": 7.574549788135594e-06, "loss": 1.3296, "mean_token_accuracy": 0.7111651599407196, "num_tokens": 14736829.0, "step": 18318 }, { "epoch": 4.851694915254237, "grad_norm": 1.7085319757461548, "learning_rate": 7.5742849576271195e-06, "loss": 0.9645, "mean_token_accuracy": 0.7599562853574753, "num_tokens": 14738410.0, "step": 18320 }, { "epoch": 4.852224576271187, "grad_norm": 2.1538240909576416, "learning_rate": 7.574020127118644e-06, "loss": 1.6007, "mean_token_accuracy": 0.650884173810482, "num_tokens": 14740063.0, "step": 18322 }, { "epoch": 4.852754237288136, "grad_norm": 2.186044692993164, "learning_rate": 7.57375529661017e-06, "loss": 1.344, "mean_token_accuracy": 0.682045005261898, "num_tokens": 14741515.0, "step": 18324 }, { "epoch": 4.853283898305085, "grad_norm": 2.1861047744750977, "learning_rate": 7.573490466101695e-06, "loss": 0.963, "mean_token_accuracy": 0.7498294711112976, "num_tokens": 14743131.0, "step": 18326 }, { "epoch": 4.853813559322034, "grad_norm": 1.9540125131607056, "learning_rate": 7.573225635593221e-06, "loss": 1.1305, "mean_token_accuracy": 0.7296000719070435, "num_tokens": 14744791.0, "step": 18328 }, { "epoch": 4.854343220338983, "grad_norm": 2.04693341255188, "learning_rate": 7.572960805084746e-06, "loss": 1.402, "mean_token_accuracy": 0.7099490761756897, "num_tokens": 14746553.0, "step": 18330 }, { "epoch": 4.8548728813559325, "grad_norm": 1.8642034530639648, "learning_rate": 7.572695974576272e-06, "loss": 1.1215, "mean_token_accuracy": 0.7476358041167259, "num_tokens": 14748114.0, "step": 18332 }, { "epoch": 4.8554025423728815, "grad_norm": 2.341466188430786, "learning_rate": 7.572431144067797e-06, "loss": 0.925, "mean_token_accuracy": 0.74241953343153, "num_tokens": 14749401.0, "step": 18334 }, { "epoch": 4.8559322033898304, "grad_norm": 2.08990478515625, "learning_rate": 7.572166313559322e-06, "loss": 1.3604, "mean_token_accuracy": 0.6732218414545059, "num_tokens": 14751259.0, "step": 18336 }, { "epoch": 4.856461864406779, "grad_norm": 2.0624167919158936, "learning_rate": 7.571901483050849e-06, "loss": 1.5377, "mean_token_accuracy": 0.6607635095715523, "num_tokens": 14753022.0, "step": 18338 }, { "epoch": 4.856991525423728, "grad_norm": 2.439321279525757, "learning_rate": 7.571636652542374e-06, "loss": 1.4975, "mean_token_accuracy": 0.6673097983002663, "num_tokens": 14754241.0, "step": 18340 }, { "epoch": 4.857521186440678, "grad_norm": 2.1206929683685303, "learning_rate": 7.5713718220339e-06, "loss": 1.5432, "mean_token_accuracy": 0.6525043770670891, "num_tokens": 14755978.0, "step": 18342 }, { "epoch": 4.858050847457627, "grad_norm": 1.6277059316635132, "learning_rate": 7.5711069915254246e-06, "loss": 1.1494, "mean_token_accuracy": 0.7021589949727058, "num_tokens": 14758332.0, "step": 18344 }, { "epoch": 4.858580508474576, "grad_norm": 1.4897959232330322, "learning_rate": 7.57084216101695e-06, "loss": 1.0072, "mean_token_accuracy": 0.7229240834712982, "num_tokens": 14760396.0, "step": 18346 }, { "epoch": 4.859110169491525, "grad_norm": 2.0899782180786133, "learning_rate": 7.570577330508475e-06, "loss": 1.3715, "mean_token_accuracy": 0.7193533629179001, "num_tokens": 14761994.0, "step": 18348 }, { "epoch": 4.859639830508475, "grad_norm": 2.041684150695801, "learning_rate": 7.570312500000001e-06, "loss": 1.106, "mean_token_accuracy": 0.7353574335575104, "num_tokens": 14763741.0, "step": 18350 }, { "epoch": 4.860169491525424, "grad_norm": 2.2475359439849854, "learning_rate": 7.570047669491526e-06, "loss": 1.214, "mean_token_accuracy": 0.7173970937728882, "num_tokens": 14765481.0, "step": 18352 }, { "epoch": 4.860699152542373, "grad_norm": 1.5431212186813354, "learning_rate": 7.569782838983052e-06, "loss": 1.3586, "mean_token_accuracy": 0.7209880873560905, "num_tokens": 14766959.0, "step": 18354 }, { "epoch": 4.861228813559322, "grad_norm": 1.9014019966125488, "learning_rate": 7.569518008474577e-06, "loss": 1.5159, "mean_token_accuracy": 0.6540661826729774, "num_tokens": 14769037.0, "step": 18356 }, { "epoch": 4.861758474576272, "grad_norm": 2.7631072998046875, "learning_rate": 7.5692531779661025e-06, "loss": 2.0665, "mean_token_accuracy": 0.5718465521931648, "num_tokens": 14770716.0, "step": 18358 }, { "epoch": 4.862288135593221, "grad_norm": 2.0198440551757812, "learning_rate": 7.568988347457627e-06, "loss": 1.143, "mean_token_accuracy": 0.7270356193184853, "num_tokens": 14772235.0, "step": 18360 }, { "epoch": 4.8628177966101696, "grad_norm": 2.1167283058166504, "learning_rate": 7.568723516949153e-06, "loss": 1.3616, "mean_token_accuracy": 0.6959902793169022, "num_tokens": 14773722.0, "step": 18362 }, { "epoch": 4.8633474576271185, "grad_norm": 2.1266257762908936, "learning_rate": 7.568458686440678e-06, "loss": 1.6165, "mean_token_accuracy": 0.6433682627975941, "num_tokens": 14775377.0, "step": 18364 }, { "epoch": 4.8638771186440675, "grad_norm": 1.8386633396148682, "learning_rate": 7.568193855932205e-06, "loss": 0.6612, "mean_token_accuracy": 0.8189293965697289, "num_tokens": 14776746.0, "step": 18366 }, { "epoch": 4.864406779661017, "grad_norm": 2.21183180809021, "learning_rate": 7.567929025423729e-06, "loss": 1.183, "mean_token_accuracy": 0.7238944470882416, "num_tokens": 14778419.0, "step": 18368 }, { "epoch": 4.864936440677966, "grad_norm": 1.86949622631073, "learning_rate": 7.5676641949152554e-06, "loss": 1.1993, "mean_token_accuracy": 0.7417606636881828, "num_tokens": 14780182.0, "step": 18370 }, { "epoch": 4.865466101694915, "grad_norm": 2.3124985694885254, "learning_rate": 7.56739936440678e-06, "loss": 1.3384, "mean_token_accuracy": 0.689135268330574, "num_tokens": 14781579.0, "step": 18372 }, { "epoch": 4.865995762711864, "grad_norm": 1.6066023111343384, "learning_rate": 7.567134533898306e-06, "loss": 1.1982, "mean_token_accuracy": 0.7316106632351875, "num_tokens": 14783402.0, "step": 18374 }, { "epoch": 4.866525423728813, "grad_norm": 2.161966562271118, "learning_rate": 7.566869703389831e-06, "loss": 1.5105, "mean_token_accuracy": 0.655214361846447, "num_tokens": 14784982.0, "step": 18376 }, { "epoch": 4.867055084745763, "grad_norm": 2.5099382400512695, "learning_rate": 7.566604872881357e-06, "loss": 1.6036, "mean_token_accuracy": 0.6773617640137672, "num_tokens": 14786511.0, "step": 18378 }, { "epoch": 4.867584745762712, "grad_norm": 2.208834409713745, "learning_rate": 7.566340042372882e-06, "loss": 1.6176, "mean_token_accuracy": 0.6684985049068928, "num_tokens": 14788190.0, "step": 18380 }, { "epoch": 4.868114406779661, "grad_norm": 1.8724132776260376, "learning_rate": 7.5660752118644075e-06, "loss": 1.4493, "mean_token_accuracy": 0.6598822250962257, "num_tokens": 14790010.0, "step": 18382 }, { "epoch": 4.86864406779661, "grad_norm": 2.2432518005371094, "learning_rate": 7.5658103813559325e-06, "loss": 1.3957, "mean_token_accuracy": 0.6829236820340157, "num_tokens": 14791533.0, "step": 18384 }, { "epoch": 4.86917372881356, "grad_norm": 1.9680473804473877, "learning_rate": 7.565545550847458e-06, "loss": 1.157, "mean_token_accuracy": 0.7351093664765358, "num_tokens": 14792937.0, "step": 18386 }, { "epoch": 4.869703389830509, "grad_norm": 2.2534525394439697, "learning_rate": 7.565280720338983e-06, "loss": 1.4289, "mean_token_accuracy": 0.6874338760972023, "num_tokens": 14794432.0, "step": 18388 }, { "epoch": 4.870233050847458, "grad_norm": 2.0735063552856445, "learning_rate": 7.565015889830509e-06, "loss": 1.4558, "mean_token_accuracy": 0.704097107052803, "num_tokens": 14796043.0, "step": 18390 }, { "epoch": 4.870762711864407, "grad_norm": 1.6161577701568604, "learning_rate": 7.564751059322034e-06, "loss": 1.0174, "mean_token_accuracy": 0.7501068338751793, "num_tokens": 14797767.0, "step": 18392 }, { "epoch": 4.871292372881356, "grad_norm": 2.0942225456237793, "learning_rate": 7.5644862288135605e-06, "loss": 1.088, "mean_token_accuracy": 0.7460466846823692, "num_tokens": 14799510.0, "step": 18394 }, { "epoch": 4.871822033898305, "grad_norm": 2.0657105445861816, "learning_rate": 7.564221398305085e-06, "loss": 1.2721, "mean_token_accuracy": 0.7018788978457451, "num_tokens": 14801043.0, "step": 18396 }, { "epoch": 4.872351694915254, "grad_norm": 2.243774652481079, "learning_rate": 7.563956567796611e-06, "loss": 1.2409, "mean_token_accuracy": 0.7100900635123253, "num_tokens": 14802736.0, "step": 18398 }, { "epoch": 4.872881355932203, "grad_norm": 1.8944618701934814, "learning_rate": 7.563691737288136e-06, "loss": 1.5246, "mean_token_accuracy": 0.674830861389637, "num_tokens": 14804390.0, "step": 18400 }, { "epoch": 4.873411016949152, "grad_norm": 1.8478118181228638, "learning_rate": 7.563426906779662e-06, "loss": 1.2566, "mean_token_accuracy": 0.717979796230793, "num_tokens": 14805877.0, "step": 18402 }, { "epoch": 4.873940677966102, "grad_norm": 2.283679485321045, "learning_rate": 7.563162076271187e-06, "loss": 0.9338, "mean_token_accuracy": 0.7512482926249504, "num_tokens": 14807485.0, "step": 18404 }, { "epoch": 4.874470338983051, "grad_norm": 1.9243133068084717, "learning_rate": 7.562897245762713e-06, "loss": 1.1379, "mean_token_accuracy": 0.7388559654355049, "num_tokens": 14808927.0, "step": 18406 }, { "epoch": 4.875, "grad_norm": 2.0741283893585205, "learning_rate": 7.5626324152542376e-06, "loss": 1.2131, "mean_token_accuracy": 0.7072360515594482, "num_tokens": 14810448.0, "step": 18408 }, { "epoch": 4.875529661016949, "grad_norm": 2.120122194290161, "learning_rate": 7.562367584745763e-06, "loss": 1.5624, "mean_token_accuracy": 0.6517865434288979, "num_tokens": 14812203.0, "step": 18410 }, { "epoch": 4.876059322033898, "grad_norm": 2.0029757022857666, "learning_rate": 7.562102754237288e-06, "loss": 1.0662, "mean_token_accuracy": 0.7302635684609413, "num_tokens": 14813888.0, "step": 18412 }, { "epoch": 4.876588983050848, "grad_norm": 2.0708518028259277, "learning_rate": 7.561837923728814e-06, "loss": 1.2074, "mean_token_accuracy": 0.7417334243655205, "num_tokens": 14815527.0, "step": 18414 }, { "epoch": 4.877118644067797, "grad_norm": 1.8964985609054565, "learning_rate": 7.561573093220339e-06, "loss": 1.3353, "mean_token_accuracy": 0.6969070434570312, "num_tokens": 14817204.0, "step": 18416 }, { "epoch": 4.877648305084746, "grad_norm": 2.488320827484131, "learning_rate": 7.561308262711865e-06, "loss": 1.3009, "mean_token_accuracy": 0.7009717375040054, "num_tokens": 14818499.0, "step": 18418 }, { "epoch": 4.878177966101695, "grad_norm": 2.234590530395508, "learning_rate": 7.561043432203391e-06, "loss": 1.3759, "mean_token_accuracy": 0.7112400643527508, "num_tokens": 14820000.0, "step": 18420 }, { "epoch": 4.878707627118644, "grad_norm": 1.6849288940429688, "learning_rate": 7.5607786016949154e-06, "loss": 1.3448, "mean_token_accuracy": 0.694088488817215, "num_tokens": 14821812.0, "step": 18422 }, { "epoch": 4.879237288135593, "grad_norm": 1.7510946989059448, "learning_rate": 7.560513771186442e-06, "loss": 1.1269, "mean_token_accuracy": 0.7269249930977821, "num_tokens": 14823441.0, "step": 18424 }, { "epoch": 4.879766949152542, "grad_norm": 2.1720759868621826, "learning_rate": 7.560248940677967e-06, "loss": 1.1656, "mean_token_accuracy": 0.7362969890236855, "num_tokens": 14824812.0, "step": 18426 }, { "epoch": 4.880296610169491, "grad_norm": 1.9510139226913452, "learning_rate": 7.559984110169493e-06, "loss": 1.4112, "mean_token_accuracy": 0.7244899719953537, "num_tokens": 14826481.0, "step": 18428 }, { "epoch": 4.88082627118644, "grad_norm": 1.8830294609069824, "learning_rate": 7.559719279661018e-06, "loss": 1.1396, "mean_token_accuracy": 0.7198825180530548, "num_tokens": 14828188.0, "step": 18430 }, { "epoch": 4.88135593220339, "grad_norm": 2.125905990600586, "learning_rate": 7.5594544491525435e-06, "loss": 1.013, "mean_token_accuracy": 0.7560428380966187, "num_tokens": 14829763.0, "step": 18432 }, { "epoch": 4.881885593220339, "grad_norm": 2.070850133895874, "learning_rate": 7.559189618644068e-06, "loss": 1.0638, "mean_token_accuracy": 0.743607833981514, "num_tokens": 14831251.0, "step": 18434 }, { "epoch": 4.882415254237288, "grad_norm": 2.055767297744751, "learning_rate": 7.558924788135594e-06, "loss": 1.1107, "mean_token_accuracy": 0.7337550297379494, "num_tokens": 14832872.0, "step": 18436 }, { "epoch": 4.882944915254237, "grad_norm": 2.0660951137542725, "learning_rate": 7.558659957627119e-06, "loss": 1.1232, "mean_token_accuracy": 0.7430053353309631, "num_tokens": 14834485.0, "step": 18438 }, { "epoch": 4.883474576271187, "grad_norm": 2.1296136379241943, "learning_rate": 7.558395127118645e-06, "loss": 1.6817, "mean_token_accuracy": 0.6307180188596249, "num_tokens": 14836178.0, "step": 18440 }, { "epoch": 4.884004237288136, "grad_norm": 2.0278592109680176, "learning_rate": 7.55813029661017e-06, "loss": 0.9815, "mean_token_accuracy": 0.7421840056777, "num_tokens": 14837592.0, "step": 18442 }, { "epoch": 4.884533898305085, "grad_norm": 1.6055928468704224, "learning_rate": 7.557865466101696e-06, "loss": 0.7328, "mean_token_accuracy": 0.810344934463501, "num_tokens": 14838946.0, "step": 18444 }, { "epoch": 4.885063559322034, "grad_norm": 2.345346450805664, "learning_rate": 7.5576006355932205e-06, "loss": 1.1461, "mean_token_accuracy": 0.7210791110992432, "num_tokens": 14840364.0, "step": 18446 }, { "epoch": 4.885593220338983, "grad_norm": 1.6194627285003662, "learning_rate": 7.557335805084747e-06, "loss": 1.2004, "mean_token_accuracy": 0.7406049370765686, "num_tokens": 14842118.0, "step": 18448 }, { "epoch": 4.8861228813559325, "grad_norm": 1.745534896850586, "learning_rate": 7.557070974576271e-06, "loss": 1.189, "mean_token_accuracy": 0.7210961058735847, "num_tokens": 14843768.0, "step": 18450 }, { "epoch": 4.8866525423728815, "grad_norm": 2.2565298080444336, "learning_rate": 7.556806144067798e-06, "loss": 1.2648, "mean_token_accuracy": 0.7216571569442749, "num_tokens": 14845368.0, "step": 18452 }, { "epoch": 4.8871822033898304, "grad_norm": 1.9436577558517456, "learning_rate": 7.556541313559323e-06, "loss": 1.1913, "mean_token_accuracy": 0.7141117602586746, "num_tokens": 14847059.0, "step": 18454 }, { "epoch": 4.887711864406779, "grad_norm": 1.9192382097244263, "learning_rate": 7.5562764830508486e-06, "loss": 1.2585, "mean_token_accuracy": 0.7143456041812897, "num_tokens": 14848578.0, "step": 18456 }, { "epoch": 4.888241525423728, "grad_norm": 2.2323811054229736, "learning_rate": 7.5560116525423735e-06, "loss": 1.2433, "mean_token_accuracy": 0.7125601097941399, "num_tokens": 14850357.0, "step": 18458 }, { "epoch": 4.888771186440678, "grad_norm": 2.0396616458892822, "learning_rate": 7.555746822033899e-06, "loss": 1.3373, "mean_token_accuracy": 0.7102778777480125, "num_tokens": 14851897.0, "step": 18460 }, { "epoch": 4.889300847457627, "grad_norm": 1.547145962715149, "learning_rate": 7.555481991525424e-06, "loss": 1.0693, "mean_token_accuracy": 0.7535103186964989, "num_tokens": 14853535.0, "step": 18462 }, { "epoch": 4.889830508474576, "grad_norm": 2.115995168685913, "learning_rate": 7.55521716101695e-06, "loss": 1.5314, "mean_token_accuracy": 0.6691175550222397, "num_tokens": 14855010.0, "step": 18464 }, { "epoch": 4.890360169491525, "grad_norm": 2.329847812652588, "learning_rate": 7.554952330508475e-06, "loss": 1.2716, "mean_token_accuracy": 0.6981997862458229, "num_tokens": 14856513.0, "step": 18466 }, { "epoch": 4.890889830508475, "grad_norm": 2.3807520866394043, "learning_rate": 7.554687500000001e-06, "loss": 1.4253, "mean_token_accuracy": 0.6590962558984756, "num_tokens": 14858025.0, "step": 18468 }, { "epoch": 4.891419491525424, "grad_norm": 2.0245699882507324, "learning_rate": 7.554422669491526e-06, "loss": 1.5725, "mean_token_accuracy": 0.6428511813282967, "num_tokens": 14859789.0, "step": 18470 }, { "epoch": 4.891949152542373, "grad_norm": 2.3144478797912598, "learning_rate": 7.554157838983051e-06, "loss": 1.5902, "mean_token_accuracy": 0.6635803394019604, "num_tokens": 14861305.0, "step": 18472 }, { "epoch": 4.892478813559322, "grad_norm": 2.0056238174438477, "learning_rate": 7.553893008474576e-06, "loss": 1.5202, "mean_token_accuracy": 0.6620830148458481, "num_tokens": 14863097.0, "step": 18474 }, { "epoch": 4.893008474576272, "grad_norm": 1.4496852159500122, "learning_rate": 7.553628177966102e-06, "loss": 1.2435, "mean_token_accuracy": 0.721719853579998, "num_tokens": 14865020.0, "step": 18476 }, { "epoch": 4.893538135593221, "grad_norm": 2.086498737335205, "learning_rate": 7.553363347457627e-06, "loss": 0.9328, "mean_token_accuracy": 0.7628918960690498, "num_tokens": 14866456.0, "step": 18478 }, { "epoch": 4.8940677966101696, "grad_norm": 2.4671454429626465, "learning_rate": 7.553098516949154e-06, "loss": 1.4583, "mean_token_accuracy": 0.6685695797204971, "num_tokens": 14867909.0, "step": 18480 }, { "epoch": 4.8945974576271185, "grad_norm": 2.1759872436523438, "learning_rate": 7.5528336864406786e-06, "loss": 1.1178, "mean_token_accuracy": 0.7565841674804688, "num_tokens": 14869427.0, "step": 18482 }, { "epoch": 4.8951271186440675, "grad_norm": 1.9422295093536377, "learning_rate": 7.552568855932204e-06, "loss": 1.1655, "mean_token_accuracy": 0.7370652109384537, "num_tokens": 14870859.0, "step": 18484 }, { "epoch": 4.895656779661017, "grad_norm": 2.2054443359375, "learning_rate": 7.552304025423729e-06, "loss": 1.846, "mean_token_accuracy": 0.6043738797307014, "num_tokens": 14872405.0, "step": 18486 }, { "epoch": 4.896186440677966, "grad_norm": 2.0214755535125732, "learning_rate": 7.552039194915255e-06, "loss": 1.4369, "mean_token_accuracy": 0.6629058942198753, "num_tokens": 14874181.0, "step": 18488 }, { "epoch": 4.896716101694915, "grad_norm": 1.8873692750930786, "learning_rate": 7.55177436440678e-06, "loss": 1.4215, "mean_token_accuracy": 0.6873467415571213, "num_tokens": 14875848.0, "step": 18490 }, { "epoch": 4.897245762711864, "grad_norm": 2.036255359649658, "learning_rate": 7.551509533898306e-06, "loss": 1.3433, "mean_token_accuracy": 0.7010165974497795, "num_tokens": 14877475.0, "step": 18492 }, { "epoch": 4.897775423728813, "grad_norm": 1.7921345233917236, "learning_rate": 7.551244703389831e-06, "loss": 1.1515, "mean_token_accuracy": 0.7309489846229553, "num_tokens": 14879223.0, "step": 18494 }, { "epoch": 4.898305084745763, "grad_norm": 2.1887571811676025, "learning_rate": 7.5509798728813565e-06, "loss": 1.304, "mean_token_accuracy": 0.6794074401259422, "num_tokens": 14880628.0, "step": 18496 }, { "epoch": 4.898834745762712, "grad_norm": 1.9881380796432495, "learning_rate": 7.550715042372881e-06, "loss": 1.2827, "mean_token_accuracy": 0.6926332265138626, "num_tokens": 14882229.0, "step": 18498 }, { "epoch": 4.899364406779661, "grad_norm": 2.285963773727417, "learning_rate": 7.550450211864407e-06, "loss": 1.8238, "step": 18500 }, { "epoch": 4.899364406779661, "eval_loss": 1.3106707334518433, "eval_mean_token_accuracy": 0.7014531520279971, "eval_num_tokens": 14883766.0, "eval_runtime": 48.2729, "eval_samples_per_second": 6.38, "eval_steps_per_second": 6.38, "step": 18500 }, { "epoch": 4.89989406779661, "grad_norm": 1.94770085811615, "learning_rate": 7.550185381355934e-06, "loss": 1.2374, "mean_token_accuracy": 0.6542446278035641, "num_tokens": 14885354.0, "step": 18502 }, { "epoch": 4.90042372881356, "grad_norm": 1.6539411544799805, "learning_rate": 7.549920550847458e-06, "loss": 1.1417, "mean_token_accuracy": 0.7258632630109787, "num_tokens": 14887125.0, "step": 18504 }, { "epoch": 4.900953389830509, "grad_norm": 2.2573397159576416, "learning_rate": 7.5496557203389845e-06, "loss": 1.3339, "mean_token_accuracy": 0.6879462450742722, "num_tokens": 14888601.0, "step": 18506 }, { "epoch": 4.901483050847458, "grad_norm": 2.392874240875244, "learning_rate": 7.549390889830509e-06, "loss": 1.4679, "mean_token_accuracy": 0.6794611811637878, "num_tokens": 14889968.0, "step": 18508 }, { "epoch": 4.902012711864407, "grad_norm": 1.9524147510528564, "learning_rate": 7.549126059322035e-06, "loss": 0.9546, "mean_token_accuracy": 0.7724423408508301, "num_tokens": 14891319.0, "step": 18510 }, { "epoch": 4.902542372881356, "grad_norm": 1.907820224761963, "learning_rate": 7.54886122881356e-06, "loss": 1.5625, "mean_token_accuracy": 0.658125028014183, "num_tokens": 14892892.0, "step": 18512 }, { "epoch": 4.903072033898305, "grad_norm": 2.279775381088257, "learning_rate": 7.548596398305086e-06, "loss": 1.4323, "mean_token_accuracy": 0.6821070089936256, "num_tokens": 14894351.0, "step": 18514 }, { "epoch": 4.903601694915254, "grad_norm": 1.7818561792373657, "learning_rate": 7.548331567796611e-06, "loss": 1.1485, "mean_token_accuracy": 0.7420330792665482, "num_tokens": 14896224.0, "step": 18516 }, { "epoch": 4.904131355932203, "grad_norm": 2.43169903755188, "learning_rate": 7.548066737288137e-06, "loss": 1.2806, "mean_token_accuracy": 0.7044737860560417, "num_tokens": 14897770.0, "step": 18518 }, { "epoch": 4.904661016949152, "grad_norm": 1.9302395582199097, "learning_rate": 7.5478019067796615e-06, "loss": 0.9058, "mean_token_accuracy": 0.7579859346151352, "num_tokens": 14899294.0, "step": 18520 }, { "epoch": 4.905190677966102, "grad_norm": 1.828721523284912, "learning_rate": 7.547537076271187e-06, "loss": 1.1544, "mean_token_accuracy": 0.7036917433142662, "num_tokens": 14900994.0, "step": 18522 }, { "epoch": 4.905720338983051, "grad_norm": 2.17256760597229, "learning_rate": 7.547272245762712e-06, "loss": 1.2566, "mean_token_accuracy": 0.7327804043889046, "num_tokens": 14902869.0, "step": 18524 }, { "epoch": 4.90625, "grad_norm": 2.010178327560425, "learning_rate": 7.547007415254238e-06, "loss": 1.0832, "mean_token_accuracy": 0.7346340790390968, "num_tokens": 14904440.0, "step": 18526 }, { "epoch": 4.906779661016949, "grad_norm": 2.0885250568389893, "learning_rate": 7.546742584745763e-06, "loss": 1.3465, "mean_token_accuracy": 0.7041402086615562, "num_tokens": 14906012.0, "step": 18528 }, { "epoch": 4.907309322033898, "grad_norm": 2.2444522380828857, "learning_rate": 7.546477754237289e-06, "loss": 1.4075, "mean_token_accuracy": 0.6967056095600128, "num_tokens": 14907361.0, "step": 18530 }, { "epoch": 4.907838983050848, "grad_norm": 1.802854061126709, "learning_rate": 7.546212923728814e-06, "loss": 1.1061, "mean_token_accuracy": 0.7283613979816437, "num_tokens": 14909141.0, "step": 18532 }, { "epoch": 4.908368644067797, "grad_norm": 1.8842071294784546, "learning_rate": 7.54594809322034e-06, "loss": 1.2167, "mean_token_accuracy": 0.7259076684713364, "num_tokens": 14910512.0, "step": 18534 }, { "epoch": 4.908898305084746, "grad_norm": 1.4538443088531494, "learning_rate": 7.545683262711865e-06, "loss": 1.3433, "mean_token_accuracy": 0.6813088245689869, "num_tokens": 14912788.0, "step": 18536 }, { "epoch": 4.909427966101695, "grad_norm": 1.828490138053894, "learning_rate": 7.545418432203391e-06, "loss": 0.8642, "mean_token_accuracy": 0.7788451313972473, "num_tokens": 14914355.0, "step": 18538 }, { "epoch": 4.909957627118644, "grad_norm": 1.6186590194702148, "learning_rate": 7.545153601694916e-06, "loss": 1.2934, "mean_token_accuracy": 0.72312843054533, "num_tokens": 14915939.0, "step": 18540 }, { "epoch": 4.910487288135593, "grad_norm": 1.829352855682373, "learning_rate": 7.544888771186442e-06, "loss": 1.1594, "mean_token_accuracy": 0.7514906674623489, "num_tokens": 14917586.0, "step": 18542 }, { "epoch": 4.911016949152542, "grad_norm": 1.946663498878479, "learning_rate": 7.544623940677967e-06, "loss": 1.2203, "mean_token_accuracy": 0.7247818633913994, "num_tokens": 14919064.0, "step": 18544 }, { "epoch": 4.911546610169491, "grad_norm": 2.2697973251342773, "learning_rate": 7.544359110169492e-06, "loss": 1.38, "mean_token_accuracy": 0.688095897436142, "num_tokens": 14920613.0, "step": 18546 }, { "epoch": 4.91207627118644, "grad_norm": 2.2388198375701904, "learning_rate": 7.544094279661017e-06, "loss": 1.2492, "mean_token_accuracy": 0.7328140363097191, "num_tokens": 14922157.0, "step": 18548 }, { "epoch": 4.91260593220339, "grad_norm": 1.8855758905410767, "learning_rate": 7.543829449152543e-06, "loss": 1.1937, "mean_token_accuracy": 0.735530324280262, "num_tokens": 14923565.0, "step": 18550 }, { "epoch": 4.913135593220339, "grad_norm": 1.7764968872070312, "learning_rate": 7.543564618644068e-06, "loss": 1.3348, "mean_token_accuracy": 0.6888982690870762, "num_tokens": 14925264.0, "step": 18552 }, { "epoch": 4.913665254237288, "grad_norm": 1.5983986854553223, "learning_rate": 7.543299788135594e-06, "loss": 1.3859, "mean_token_accuracy": 0.6652015820145607, "num_tokens": 14927126.0, "step": 18554 }, { "epoch": 4.914194915254237, "grad_norm": 2.023913621902466, "learning_rate": 7.543034957627119e-06, "loss": 1.2182, "mean_token_accuracy": 0.7072426006197929, "num_tokens": 14928654.0, "step": 18556 }, { "epoch": 4.914724576271187, "grad_norm": 2.182450532913208, "learning_rate": 7.5427701271186445e-06, "loss": 1.3504, "mean_token_accuracy": 0.6929495483636856, "num_tokens": 14930201.0, "step": 18558 }, { "epoch": 4.915254237288136, "grad_norm": 1.9995558261871338, "learning_rate": 7.5425052966101694e-06, "loss": 1.083, "mean_token_accuracy": 0.7192459627985954, "num_tokens": 14931950.0, "step": 18560 }, { "epoch": 4.915783898305085, "grad_norm": 2.1696605682373047, "learning_rate": 7.542240466101696e-06, "loss": 1.0961, "mean_token_accuracy": 0.7446837276220322, "num_tokens": 14933741.0, "step": 18562 }, { "epoch": 4.916313559322034, "grad_norm": 1.7444514036178589, "learning_rate": 7.54197563559322e-06, "loss": 0.9602, "mean_token_accuracy": 0.7518216967582703, "num_tokens": 14935219.0, "step": 18564 }, { "epoch": 4.916843220338983, "grad_norm": 2.453442335128784, "learning_rate": 7.541710805084747e-06, "loss": 1.6558, "mean_token_accuracy": 0.644364133477211, "num_tokens": 14936935.0, "step": 18566 }, { "epoch": 4.9173728813559325, "grad_norm": 2.3183014392852783, "learning_rate": 7.541445974576272e-06, "loss": 1.6576, "mean_token_accuracy": 0.658236674964428, "num_tokens": 14938466.0, "step": 18568 }, { "epoch": 4.9179025423728815, "grad_norm": 2.0721373558044434, "learning_rate": 7.5411811440677975e-06, "loss": 1.1679, "mean_token_accuracy": 0.7317564934492111, "num_tokens": 14940089.0, "step": 18570 }, { "epoch": 4.9184322033898304, "grad_norm": 2.2935664653778076, "learning_rate": 7.540916313559322e-06, "loss": 1.6792, "mean_token_accuracy": 0.643804557621479, "num_tokens": 14941717.0, "step": 18572 }, { "epoch": 4.918961864406779, "grad_norm": 2.061671018600464, "learning_rate": 7.540651483050848e-06, "loss": 1.6289, "mean_token_accuracy": 0.6283417567610741, "num_tokens": 14943320.0, "step": 18574 }, { "epoch": 4.919491525423728, "grad_norm": 2.0402209758758545, "learning_rate": 7.540386652542373e-06, "loss": 1.5084, "mean_token_accuracy": 0.6483943536877632, "num_tokens": 14945104.0, "step": 18576 }, { "epoch": 4.920021186440678, "grad_norm": 2.208192825317383, "learning_rate": 7.540121822033899e-06, "loss": 1.3026, "mean_token_accuracy": 0.6979538537561893, "num_tokens": 14946982.0, "step": 18578 }, { "epoch": 4.920550847457627, "grad_norm": 1.9631218910217285, "learning_rate": 7.539856991525424e-06, "loss": 1.2535, "mean_token_accuracy": 0.693092867732048, "num_tokens": 14948759.0, "step": 18580 }, { "epoch": 4.921080508474576, "grad_norm": 2.0475940704345703, "learning_rate": 7.53959216101695e-06, "loss": 1.2532, "mean_token_accuracy": 0.7240493893623352, "num_tokens": 14950244.0, "step": 18582 }, { "epoch": 4.921610169491525, "grad_norm": 2.351242780685425, "learning_rate": 7.5393273305084745e-06, "loss": 0.8906, "mean_token_accuracy": 0.7747034355998039, "num_tokens": 14951810.0, "step": 18584 }, { "epoch": 4.922139830508475, "grad_norm": 1.9942853450775146, "learning_rate": 7.5390625e-06, "loss": 1.7059, "mean_token_accuracy": 0.6121921502053738, "num_tokens": 14953514.0, "step": 18586 }, { "epoch": 4.922669491525424, "grad_norm": 2.1800410747528076, "learning_rate": 7.538797669491527e-06, "loss": 1.2974, "mean_token_accuracy": 0.7021428123116493, "num_tokens": 14955092.0, "step": 18588 }, { "epoch": 4.923199152542373, "grad_norm": 2.0800342559814453, "learning_rate": 7.538532838983052e-06, "loss": 0.9997, "mean_token_accuracy": 0.7416313588619232, "num_tokens": 14957300.0, "step": 18590 }, { "epoch": 4.923728813559322, "grad_norm": 1.9087450504302979, "learning_rate": 7.538268008474578e-06, "loss": 1.0421, "mean_token_accuracy": 0.74704310297966, "num_tokens": 14959275.0, "step": 18592 }, { "epoch": 4.924258474576272, "grad_norm": 2.4878509044647217, "learning_rate": 7.5380031779661026e-06, "loss": 1.3414, "mean_token_accuracy": 0.7039451152086258, "num_tokens": 14960648.0, "step": 18594 }, { "epoch": 4.924788135593221, "grad_norm": 1.8584797382354736, "learning_rate": 7.537738347457628e-06, "loss": 1.1987, "mean_token_accuracy": 0.7234526127576828, "num_tokens": 14962186.0, "step": 18596 }, { "epoch": 4.9253177966101696, "grad_norm": 2.0067670345306396, "learning_rate": 7.537473516949153e-06, "loss": 0.9652, "mean_token_accuracy": 0.7622164860367775, "num_tokens": 14963898.0, "step": 18598 }, { "epoch": 4.9258474576271185, "grad_norm": 2.1993978023529053, "learning_rate": 7.537208686440679e-06, "loss": 1.3785, "mean_token_accuracy": 0.6760971918702126, "num_tokens": 14965653.0, "step": 18600 }, { "epoch": 4.9263771186440675, "grad_norm": 1.6544947624206543, "learning_rate": 7.536943855932204e-06, "loss": 1.1588, "mean_token_accuracy": 0.7307081371545792, "num_tokens": 14967590.0, "step": 18602 }, { "epoch": 4.926906779661017, "grad_norm": 2.1249537467956543, "learning_rate": 7.53667902542373e-06, "loss": 1.4749, "mean_token_accuracy": 0.6963693350553513, "num_tokens": 14969084.0, "step": 18604 }, { "epoch": 4.927436440677966, "grad_norm": 1.8385039567947388, "learning_rate": 7.536414194915255e-06, "loss": 1.3556, "mean_token_accuracy": 0.6901344954967499, "num_tokens": 14970638.0, "step": 18606 }, { "epoch": 4.927966101694915, "grad_norm": 2.326669692993164, "learning_rate": 7.5361493644067804e-06, "loss": 1.3797, "mean_token_accuracy": 0.6967622116208076, "num_tokens": 14972038.0, "step": 18608 }, { "epoch": 4.928495762711864, "grad_norm": 2.0691699981689453, "learning_rate": 7.535884533898305e-06, "loss": 1.4776, "mean_token_accuracy": 0.6838033720850945, "num_tokens": 14973417.0, "step": 18610 }, { "epoch": 4.929025423728813, "grad_norm": 2.0375287532806396, "learning_rate": 7.535619703389831e-06, "loss": 0.9779, "mean_token_accuracy": 0.7478010803461075, "num_tokens": 14974748.0, "step": 18612 }, { "epoch": 4.929555084745763, "grad_norm": 1.9701954126358032, "learning_rate": 7.535354872881356e-06, "loss": 1.4786, "mean_token_accuracy": 0.6543837934732437, "num_tokens": 14976334.0, "step": 18614 }, { "epoch": 4.930084745762712, "grad_norm": 2.1933324337005615, "learning_rate": 7.535090042372883e-06, "loss": 1.6636, "mean_token_accuracy": 0.6509135663509369, "num_tokens": 14977871.0, "step": 18616 }, { "epoch": 4.930614406779661, "grad_norm": 2.102963924407959, "learning_rate": 7.534825211864407e-06, "loss": 1.5539, "mean_token_accuracy": 0.6555011197924614, "num_tokens": 14979711.0, "step": 18618 }, { "epoch": 4.93114406779661, "grad_norm": 1.9622654914855957, "learning_rate": 7.534560381355933e-06, "loss": 1.4199, "mean_token_accuracy": 0.6839148849248886, "num_tokens": 14981512.0, "step": 18620 }, { "epoch": 4.93167372881356, "grad_norm": 2.452441453933716, "learning_rate": 7.534295550847458e-06, "loss": 1.286, "mean_token_accuracy": 0.6987994536757469, "num_tokens": 14983148.0, "step": 18622 }, { "epoch": 4.932203389830509, "grad_norm": 2.2180662155151367, "learning_rate": 7.534030720338984e-06, "loss": 1.418, "mean_token_accuracy": 0.6664945259690285, "num_tokens": 14984694.0, "step": 18624 }, { "epoch": 4.932733050847458, "grad_norm": 1.9053422212600708, "learning_rate": 7.533765889830509e-06, "loss": 1.5238, "mean_token_accuracy": 0.660591259598732, "num_tokens": 14986455.0, "step": 18626 }, { "epoch": 4.933262711864407, "grad_norm": 2.0905661582946777, "learning_rate": 7.533501059322035e-06, "loss": 1.1924, "mean_token_accuracy": 0.7061081975698471, "num_tokens": 14988240.0, "step": 18628 }, { "epoch": 4.933792372881356, "grad_norm": 2.025010108947754, "learning_rate": 7.53323622881356e-06, "loss": 1.4227, "mean_token_accuracy": 0.7027761116623878, "num_tokens": 14989772.0, "step": 18630 }, { "epoch": 4.934322033898305, "grad_norm": 1.6941285133361816, "learning_rate": 7.5329713983050855e-06, "loss": 0.9567, "mean_token_accuracy": 0.7792339473962784, "num_tokens": 14991144.0, "step": 18632 }, { "epoch": 4.934851694915254, "grad_norm": 2.1850481033325195, "learning_rate": 7.5327065677966105e-06, "loss": 1.2142, "mean_token_accuracy": 0.7119839563965797, "num_tokens": 14992717.0, "step": 18634 }, { "epoch": 4.935381355932203, "grad_norm": 1.9582619667053223, "learning_rate": 7.532441737288136e-06, "loss": 1.192, "mean_token_accuracy": 0.7325988411903381, "num_tokens": 14994416.0, "step": 18636 }, { "epoch": 4.935911016949152, "grad_norm": 2.3767712116241455, "learning_rate": 7.532176906779661e-06, "loss": 1.5089, "mean_token_accuracy": 0.641916312277317, "num_tokens": 14996183.0, "step": 18638 }, { "epoch": 4.936440677966102, "grad_norm": 2.1031763553619385, "learning_rate": 7.531912076271187e-06, "loss": 1.2711, "mean_token_accuracy": 0.6977511644363403, "num_tokens": 14997692.0, "step": 18640 }, { "epoch": 4.936970338983051, "grad_norm": 2.026803493499756, "learning_rate": 7.531647245762712e-06, "loss": 1.4376, "mean_token_accuracy": 0.6622745022177696, "num_tokens": 14999382.0, "step": 18642 }, { "epoch": 4.9375, "grad_norm": 1.940791368484497, "learning_rate": 7.5313824152542385e-06, "loss": 1.0182, "mean_token_accuracy": 0.7349633499979973, "num_tokens": 15000806.0, "step": 18644 }, { "epoch": 4.938029661016949, "grad_norm": 2.160351276397705, "learning_rate": 7.5311175847457626e-06, "loss": 1.3683, "mean_token_accuracy": 0.6868274509906769, "num_tokens": 15002399.0, "step": 18646 }, { "epoch": 4.938559322033898, "grad_norm": 1.587628722190857, "learning_rate": 7.530852754237289e-06, "loss": 0.9435, "mean_token_accuracy": 0.7536103874444962, "num_tokens": 15004358.0, "step": 18648 }, { "epoch": 4.939088983050848, "grad_norm": 2.0834522247314453, "learning_rate": 7.530587923728814e-06, "loss": 1.5117, "mean_token_accuracy": 0.677420124411583, "num_tokens": 15005914.0, "step": 18650 }, { "epoch": 4.939618644067797, "grad_norm": 2.0229337215423584, "learning_rate": 7.53032309322034e-06, "loss": 1.5276, "mean_token_accuracy": 0.6840458512306213, "num_tokens": 15007652.0, "step": 18652 }, { "epoch": 4.940148305084746, "grad_norm": 1.7976001501083374, "learning_rate": 7.530058262711865e-06, "loss": 1.0183, "mean_token_accuracy": 0.7543244063854218, "num_tokens": 15009335.0, "step": 18654 }, { "epoch": 4.940677966101695, "grad_norm": 2.1585376262664795, "learning_rate": 7.529793432203391e-06, "loss": 1.5687, "mean_token_accuracy": 0.6481962203979492, "num_tokens": 15011132.0, "step": 18656 }, { "epoch": 4.941207627118644, "grad_norm": 1.6254687309265137, "learning_rate": 7.5295286016949155e-06, "loss": 0.9644, "mean_token_accuracy": 0.7373496517539024, "num_tokens": 15013496.0, "step": 18658 }, { "epoch": 4.941737288135593, "grad_norm": 1.8947426080703735, "learning_rate": 7.529263771186441e-06, "loss": 1.1668, "mean_token_accuracy": 0.7200803011655807, "num_tokens": 15015158.0, "step": 18660 }, { "epoch": 4.942266949152542, "grad_norm": 1.8991765975952148, "learning_rate": 7.528998940677966e-06, "loss": 1.0877, "mean_token_accuracy": 0.7601221948862076, "num_tokens": 15016702.0, "step": 18662 }, { "epoch": 4.942796610169491, "grad_norm": 1.9270862340927124, "learning_rate": 7.528734110169492e-06, "loss": 1.3117, "mean_token_accuracy": 0.6903060078620911, "num_tokens": 15018325.0, "step": 18664 }, { "epoch": 4.94332627118644, "grad_norm": 2.2983291149139404, "learning_rate": 7.528469279661017e-06, "loss": 1.6299, "mean_token_accuracy": 0.6544005759060383, "num_tokens": 15019971.0, "step": 18666 }, { "epoch": 4.94385593220339, "grad_norm": 2.009942054748535, "learning_rate": 7.528204449152543e-06, "loss": 0.947, "mean_token_accuracy": 0.7860954627394676, "num_tokens": 15021556.0, "step": 18668 }, { "epoch": 4.944385593220339, "grad_norm": 1.8173315525054932, "learning_rate": 7.527939618644068e-06, "loss": 1.1445, "mean_token_accuracy": 0.7287765070796013, "num_tokens": 15023183.0, "step": 18670 }, { "epoch": 4.944915254237288, "grad_norm": 1.9749689102172852, "learning_rate": 7.527674788135593e-06, "loss": 1.3045, "mean_token_accuracy": 0.7055670768022537, "num_tokens": 15024819.0, "step": 18672 }, { "epoch": 4.945444915254237, "grad_norm": 1.7582776546478271, "learning_rate": 7.52740995762712e-06, "loss": 0.9714, "mean_token_accuracy": 0.75721475481987, "num_tokens": 15026133.0, "step": 18674 }, { "epoch": 4.945974576271187, "grad_norm": 2.5010414123535156, "learning_rate": 7.527145127118645e-06, "loss": 1.4123, "mean_token_accuracy": 0.6836820989847183, "num_tokens": 15027615.0, "step": 18676 }, { "epoch": 4.946504237288136, "grad_norm": 1.813704252243042, "learning_rate": 7.526880296610171e-06, "loss": 1.1173, "mean_token_accuracy": 0.7358285486698151, "num_tokens": 15029339.0, "step": 18678 }, { "epoch": 4.947033898305085, "grad_norm": 1.8621413707733154, "learning_rate": 7.526615466101696e-06, "loss": 1.0787, "mean_token_accuracy": 0.7431834936141968, "num_tokens": 15031281.0, "step": 18680 }, { "epoch": 4.947563559322034, "grad_norm": 1.545789361000061, "learning_rate": 7.5263506355932215e-06, "loss": 1.1015, "mean_token_accuracy": 0.7333827540278435, "num_tokens": 15033336.0, "step": 18682 }, { "epoch": 4.948093220338983, "grad_norm": 2.142561674118042, "learning_rate": 7.526085805084746e-06, "loss": 1.2177, "mean_token_accuracy": 0.7101798504590988, "num_tokens": 15034777.0, "step": 18684 }, { "epoch": 4.9486228813559325, "grad_norm": 2.1790084838867188, "learning_rate": 7.525820974576272e-06, "loss": 0.978, "mean_token_accuracy": 0.7408849745988846, "num_tokens": 15036411.0, "step": 18686 }, { "epoch": 4.9491525423728815, "grad_norm": 2.111593246459961, "learning_rate": 7.525556144067797e-06, "loss": 1.2457, "mean_token_accuracy": 0.7009261474013329, "num_tokens": 15038134.0, "step": 18688 }, { "epoch": 4.9496822033898304, "grad_norm": 2.241429328918457, "learning_rate": 7.525291313559323e-06, "loss": 1.699, "mean_token_accuracy": 0.6159177049994469, "num_tokens": 15039897.0, "step": 18690 }, { "epoch": 4.950211864406779, "grad_norm": 1.8740119934082031, "learning_rate": 7.525026483050848e-06, "loss": 1.0927, "mean_token_accuracy": 0.7144742384552956, "num_tokens": 15041436.0, "step": 18692 }, { "epoch": 4.950741525423728, "grad_norm": 2.163200855255127, "learning_rate": 7.5247616525423736e-06, "loss": 1.6639, "mean_token_accuracy": 0.639900915324688, "num_tokens": 15042925.0, "step": 18694 }, { "epoch": 4.951271186440678, "grad_norm": 2.2046306133270264, "learning_rate": 7.5244968220338985e-06, "loss": 1.2858, "mean_token_accuracy": 0.7216166853904724, "num_tokens": 15044477.0, "step": 18696 }, { "epoch": 4.951800847457627, "grad_norm": 2.314051389694214, "learning_rate": 7.524231991525425e-06, "loss": 1.0028, "mean_token_accuracy": 0.7559058591723442, "num_tokens": 15046052.0, "step": 18698 }, { "epoch": 4.952330508474576, "grad_norm": 2.1269195079803467, "learning_rate": 7.523967161016949e-06, "loss": 1.1692, "mean_token_accuracy": 0.7511886656284332, "num_tokens": 15047706.0, "step": 18700 }, { "epoch": 4.952860169491525, "grad_norm": 1.6336992979049683, "learning_rate": 7.523702330508476e-06, "loss": 1.0274, "mean_token_accuracy": 0.7480762377381325, "num_tokens": 15049427.0, "step": 18702 }, { "epoch": 4.953389830508475, "grad_norm": 2.014995574951172, "learning_rate": 7.523437500000001e-06, "loss": 1.2754, "mean_token_accuracy": 0.7275837287306786, "num_tokens": 15050943.0, "step": 18704 }, { "epoch": 4.953919491525424, "grad_norm": 2.198033094406128, "learning_rate": 7.5231726694915265e-06, "loss": 0.9188, "mean_token_accuracy": 0.790448933839798, "num_tokens": 15052398.0, "step": 18706 }, { "epoch": 4.954449152542373, "grad_norm": 2.0589184761047363, "learning_rate": 7.5229078389830515e-06, "loss": 1.1432, "mean_token_accuracy": 0.7102617025375366, "num_tokens": 15054017.0, "step": 18708 }, { "epoch": 4.954978813559322, "grad_norm": 1.8188645839691162, "learning_rate": 7.522643008474577e-06, "loss": 1.4078, "mean_token_accuracy": 0.6786592155694962, "num_tokens": 15055618.0, "step": 18710 }, { "epoch": 4.955508474576272, "grad_norm": 2.2333500385284424, "learning_rate": 7.522378177966102e-06, "loss": 0.9858, "mean_token_accuracy": 0.7662806585431099, "num_tokens": 15057084.0, "step": 18712 }, { "epoch": 4.956038135593221, "grad_norm": 2.2252626419067383, "learning_rate": 7.522113347457628e-06, "loss": 1.3346, "mean_token_accuracy": 0.7074171900749207, "num_tokens": 15058695.0, "step": 18714 }, { "epoch": 4.9565677966101696, "grad_norm": 1.8553497791290283, "learning_rate": 7.521848516949153e-06, "loss": 1.2204, "mean_token_accuracy": 0.7290224209427834, "num_tokens": 15060086.0, "step": 18716 }, { "epoch": 4.9570974576271185, "grad_norm": 1.7678451538085938, "learning_rate": 7.521583686440679e-06, "loss": 0.7686, "mean_token_accuracy": 0.8172709941864014, "num_tokens": 15061526.0, "step": 18718 }, { "epoch": 4.9576271186440675, "grad_norm": 1.9510494470596313, "learning_rate": 7.521318855932204e-06, "loss": 1.0272, "mean_token_accuracy": 0.7184635773301125, "num_tokens": 15063408.0, "step": 18720 }, { "epoch": 4.958156779661017, "grad_norm": 2.0900604724884033, "learning_rate": 7.521054025423729e-06, "loss": 1.2501, "mean_token_accuracy": 0.7111974209547043, "num_tokens": 15064871.0, "step": 18722 }, { "epoch": 4.958686440677966, "grad_norm": 2.4284815788269043, "learning_rate": 7.520789194915254e-06, "loss": 0.9996, "mean_token_accuracy": 0.7711238190531731, "num_tokens": 15066272.0, "step": 18724 }, { "epoch": 4.959216101694915, "grad_norm": 2.155144691467285, "learning_rate": 7.52052436440678e-06, "loss": 1.14, "mean_token_accuracy": 0.7266903519630432, "num_tokens": 15067753.0, "step": 18726 }, { "epoch": 4.959745762711864, "grad_norm": 1.9369137287139893, "learning_rate": 7.520259533898305e-06, "loss": 1.211, "mean_token_accuracy": 0.72635842487216, "num_tokens": 15069259.0, "step": 18728 }, { "epoch": 4.960275423728813, "grad_norm": 2.081071138381958, "learning_rate": 7.519994703389832e-06, "loss": 1.4207, "mean_token_accuracy": 0.6929146200418472, "num_tokens": 15070993.0, "step": 18730 }, { "epoch": 4.960805084745763, "grad_norm": 1.5020020008087158, "learning_rate": 7.5197298728813565e-06, "loss": 0.8882, "mean_token_accuracy": 0.7699570953845978, "num_tokens": 15072752.0, "step": 18732 }, { "epoch": 4.961334745762712, "grad_norm": 2.2055530548095703, "learning_rate": 7.519465042372882e-06, "loss": 1.4889, "mean_token_accuracy": 0.6464894041419029, "num_tokens": 15074287.0, "step": 18734 }, { "epoch": 4.961864406779661, "grad_norm": 2.1800928115844727, "learning_rate": 7.519200211864407e-06, "loss": 1.1338, "mean_token_accuracy": 0.7186996713280678, "num_tokens": 15075665.0, "step": 18736 }, { "epoch": 4.96239406779661, "grad_norm": 1.4370137453079224, "learning_rate": 7.518935381355933e-06, "loss": 1.2554, "mean_token_accuracy": 0.7158358097076416, "num_tokens": 15077545.0, "step": 18738 }, { "epoch": 4.96292372881356, "grad_norm": 2.457319974899292, "learning_rate": 7.518670550847458e-06, "loss": 1.2022, "mean_token_accuracy": 0.7213448882102966, "num_tokens": 15079008.0, "step": 18740 }, { "epoch": 4.963453389830509, "grad_norm": 1.7694343328475952, "learning_rate": 7.518405720338984e-06, "loss": 0.8503, "mean_token_accuracy": 0.7855793684720993, "num_tokens": 15080440.0, "step": 18742 }, { "epoch": 4.963983050847458, "grad_norm": 1.9929795265197754, "learning_rate": 7.518140889830509e-06, "loss": 0.7713, "mean_token_accuracy": 0.7973687574267387, "num_tokens": 15081779.0, "step": 18744 }, { "epoch": 4.964512711864407, "grad_norm": 2.2210257053375244, "learning_rate": 7.5178760593220344e-06, "loss": 0.9513, "mean_token_accuracy": 0.753442294895649, "num_tokens": 15083424.0, "step": 18746 }, { "epoch": 4.965042372881356, "grad_norm": 1.801093339920044, "learning_rate": 7.517611228813559e-06, "loss": 0.9077, "mean_token_accuracy": 0.8028048127889633, "num_tokens": 15085090.0, "step": 18748 }, { "epoch": 4.965572033898305, "grad_norm": 2.14863657951355, "learning_rate": 7.517346398305085e-06, "loss": 1.5666, "step": 18750 }, { "epoch": 4.965572033898305, "eval_loss": 1.311487078666687, "eval_mean_token_accuracy": 0.701897344128652, "eval_num_tokens": 15086542.0, "eval_runtime": 48.0664, "eval_samples_per_second": 6.408, "eval_steps_per_second": 6.408, "step": 18750 }, { "epoch": 4.966101694915254, "grad_norm": 1.9520108699798584, "learning_rate": 7.51708156779661e-06, "loss": 1.4537, "mean_token_accuracy": 0.6453224495053291, "num_tokens": 15088337.0, "step": 18752 }, { "epoch": 4.966631355932203, "grad_norm": 2.0467886924743652, "learning_rate": 7.516816737288136e-06, "loss": 1.4194, "mean_token_accuracy": 0.7063312940299511, "num_tokens": 15090001.0, "step": 18754 }, { "epoch": 4.967161016949152, "grad_norm": 1.9690243005752563, "learning_rate": 7.5165519067796625e-06, "loss": 1.0477, "mean_token_accuracy": 0.7594611421227455, "num_tokens": 15091741.0, "step": 18756 }, { "epoch": 4.967690677966102, "grad_norm": 1.569579005241394, "learning_rate": 7.516287076271187e-06, "loss": 1.2283, "mean_token_accuracy": 0.7077792808413506, "num_tokens": 15093712.0, "step": 18758 }, { "epoch": 4.968220338983051, "grad_norm": 2.376439332962036, "learning_rate": 7.516022245762713e-06, "loss": 1.0544, "mean_token_accuracy": 0.7720866650342941, "num_tokens": 15095241.0, "step": 18760 }, { "epoch": 4.96875, "grad_norm": 2.2334060668945312, "learning_rate": 7.515757415254238e-06, "loss": 1.5591, "mean_token_accuracy": 0.6827986538410187, "num_tokens": 15096939.0, "step": 18762 }, { "epoch": 4.969279661016949, "grad_norm": 1.7551268339157104, "learning_rate": 7.515492584745764e-06, "loss": 0.9868, "mean_token_accuracy": 0.7646901533007622, "num_tokens": 15098445.0, "step": 18764 }, { "epoch": 4.969809322033898, "grad_norm": 2.254098653793335, "learning_rate": 7.515227754237289e-06, "loss": 0.8734, "mean_token_accuracy": 0.7767032235860825, "num_tokens": 15099820.0, "step": 18766 }, { "epoch": 4.970338983050848, "grad_norm": 2.196345806121826, "learning_rate": 7.514962923728815e-06, "loss": 1.4446, "mean_token_accuracy": 0.6894195452332497, "num_tokens": 15101245.0, "step": 18768 }, { "epoch": 4.970868644067797, "grad_norm": 1.7076462507247925, "learning_rate": 7.5146980932203395e-06, "loss": 1.4518, "mean_token_accuracy": 0.6777379214763641, "num_tokens": 15102892.0, "step": 18770 }, { "epoch": 4.971398305084746, "grad_norm": 2.1509206295013428, "learning_rate": 7.514433262711865e-06, "loss": 1.4307, "mean_token_accuracy": 0.691916286945343, "num_tokens": 15104750.0, "step": 18772 }, { "epoch": 4.971927966101695, "grad_norm": 2.0315499305725098, "learning_rate": 7.51416843220339e-06, "loss": 1.247, "mean_token_accuracy": 0.6940208002924919, "num_tokens": 15106092.0, "step": 18774 }, { "epoch": 4.972457627118644, "grad_norm": 1.788769006729126, "learning_rate": 7.513903601694916e-06, "loss": 1.042, "mean_token_accuracy": 0.7369987443089485, "num_tokens": 15107713.0, "step": 18776 }, { "epoch": 4.972987288135593, "grad_norm": 1.8019989728927612, "learning_rate": 7.513638771186441e-06, "loss": 1.0009, "mean_token_accuracy": 0.7485526576638222, "num_tokens": 15109243.0, "step": 18778 }, { "epoch": 4.973516949152542, "grad_norm": 2.2735302448272705, "learning_rate": 7.513373940677967e-06, "loss": 1.5482, "mean_token_accuracy": 0.6591632962226868, "num_tokens": 15110830.0, "step": 18780 }, { "epoch": 4.974046610169491, "grad_norm": 1.961353063583374, "learning_rate": 7.513109110169492e-06, "loss": 1.1605, "mean_token_accuracy": 0.7497712969779968, "num_tokens": 15112369.0, "step": 18782 }, { "epoch": 4.97457627118644, "grad_norm": 2.076056480407715, "learning_rate": 7.512844279661018e-06, "loss": 1.3657, "mean_token_accuracy": 0.6614271998405457, "num_tokens": 15114196.0, "step": 18784 }, { "epoch": 4.97510593220339, "grad_norm": 2.1393046379089355, "learning_rate": 7.512579449152543e-06, "loss": 1.163, "mean_token_accuracy": 0.7218587249517441, "num_tokens": 15115875.0, "step": 18786 }, { "epoch": 4.975635593220339, "grad_norm": 2.141751766204834, "learning_rate": 7.512314618644069e-06, "loss": 1.2161, "mean_token_accuracy": 0.7114446014165878, "num_tokens": 15117269.0, "step": 18788 }, { "epoch": 4.976165254237288, "grad_norm": 2.431884288787842, "learning_rate": 7.512049788135594e-06, "loss": 1.6583, "mean_token_accuracy": 0.6403684914112091, "num_tokens": 15118921.0, "step": 18790 }, { "epoch": 4.976694915254237, "grad_norm": 2.038557529449463, "learning_rate": 7.51178495762712e-06, "loss": 1.169, "mean_token_accuracy": 0.7271784543991089, "num_tokens": 15120549.0, "step": 18792 }, { "epoch": 4.977224576271187, "grad_norm": 1.8442660570144653, "learning_rate": 7.511520127118645e-06, "loss": 1.3363, "mean_token_accuracy": 0.6977571249008179, "num_tokens": 15122170.0, "step": 18794 }, { "epoch": 4.977754237288136, "grad_norm": 1.8987300395965576, "learning_rate": 7.51125529661017e-06, "loss": 0.9416, "mean_token_accuracy": 0.762166291475296, "num_tokens": 15123977.0, "step": 18796 }, { "epoch": 4.978283898305085, "grad_norm": 2.11240291595459, "learning_rate": 7.510990466101695e-06, "loss": 0.9009, "mean_token_accuracy": 0.7652552053332329, "num_tokens": 15125324.0, "step": 18798 }, { "epoch": 4.978813559322034, "grad_norm": 2.125009298324585, "learning_rate": 7.510725635593221e-06, "loss": 1.7206, "mean_token_accuracy": 0.6457161754369736, "num_tokens": 15127091.0, "step": 18800 }, { "epoch": 4.979343220338983, "grad_norm": 1.8143184185028076, "learning_rate": 7.510460805084746e-06, "loss": 0.9771, "mean_token_accuracy": 0.7482302337884903, "num_tokens": 15128712.0, "step": 18802 }, { "epoch": 4.9798728813559325, "grad_norm": 1.7885571718215942, "learning_rate": 7.510195974576272e-06, "loss": 1.195, "mean_token_accuracy": 0.7468378990888596, "num_tokens": 15130466.0, "step": 18804 }, { "epoch": 4.9804025423728815, "grad_norm": 1.813782811164856, "learning_rate": 7.509931144067797e-06, "loss": 0.9639, "mean_token_accuracy": 0.7572920620441437, "num_tokens": 15131875.0, "step": 18806 }, { "epoch": 4.9809322033898304, "grad_norm": 1.9260478019714355, "learning_rate": 7.5096663135593225e-06, "loss": 1.4263, "mean_token_accuracy": 0.6806085109710693, "num_tokens": 15133482.0, "step": 18808 }, { "epoch": 4.981461864406779, "grad_norm": 1.5225821733474731, "learning_rate": 7.509401483050847e-06, "loss": 1.1344, "mean_token_accuracy": 0.7414067834615707, "num_tokens": 15134940.0, "step": 18810 }, { "epoch": 4.981991525423728, "grad_norm": 1.5918445587158203, "learning_rate": 7.509136652542374e-06, "loss": 0.9113, "mean_token_accuracy": 0.7748785763978958, "num_tokens": 15136686.0, "step": 18812 }, { "epoch": 4.982521186440678, "grad_norm": 1.9337258338928223, "learning_rate": 7.508871822033898e-06, "loss": 1.3101, "mean_token_accuracy": 0.7179949283599854, "num_tokens": 15138191.0, "step": 18814 }, { "epoch": 4.983050847457627, "grad_norm": 2.0691983699798584, "learning_rate": 7.508606991525425e-06, "loss": 1.4214, "mean_token_accuracy": 0.6489852368831635, "num_tokens": 15139824.0, "step": 18816 }, { "epoch": 4.983580508474576, "grad_norm": 2.1213982105255127, "learning_rate": 7.50834216101695e-06, "loss": 1.2516, "mean_token_accuracy": 0.7055561169981956, "num_tokens": 15141371.0, "step": 18818 }, { "epoch": 4.984110169491525, "grad_norm": 2.03922438621521, "learning_rate": 7.5080773305084755e-06, "loss": 1.3819, "mean_token_accuracy": 0.6795726381242275, "num_tokens": 15142848.0, "step": 18820 }, { "epoch": 4.984639830508475, "grad_norm": 1.8409481048583984, "learning_rate": 7.5078125e-06, "loss": 1.4013, "mean_token_accuracy": 0.677546925842762, "num_tokens": 15144611.0, "step": 18822 }, { "epoch": 4.985169491525424, "grad_norm": 1.749252438545227, "learning_rate": 7.507547669491526e-06, "loss": 1.081, "mean_token_accuracy": 0.7409424483776093, "num_tokens": 15145928.0, "step": 18824 }, { "epoch": 4.985699152542373, "grad_norm": 2.0999398231506348, "learning_rate": 7.507282838983051e-06, "loss": 1.2854, "mean_token_accuracy": 0.7205724939703941, "num_tokens": 15147398.0, "step": 18826 }, { "epoch": 4.986228813559322, "grad_norm": 1.5731861591339111, "learning_rate": 7.507018008474577e-06, "loss": 0.9769, "mean_token_accuracy": 0.7623325809836388, "num_tokens": 15148960.0, "step": 18828 }, { "epoch": 4.986758474576272, "grad_norm": 1.866382122039795, "learning_rate": 7.506753177966102e-06, "loss": 1.2439, "mean_token_accuracy": 0.6919244006276131, "num_tokens": 15150606.0, "step": 18830 }, { "epoch": 4.987288135593221, "grad_norm": 2.6299827098846436, "learning_rate": 7.5064883474576276e-06, "loss": 1.6858, "mean_token_accuracy": 0.6532190218567848, "num_tokens": 15152047.0, "step": 18832 }, { "epoch": 4.9878177966101696, "grad_norm": 2.076244354248047, "learning_rate": 7.5062235169491525e-06, "loss": 1.0257, "mean_token_accuracy": 0.7512087598443031, "num_tokens": 15153670.0, "step": 18834 }, { "epoch": 4.9883474576271185, "grad_norm": 2.0655767917633057, "learning_rate": 7.505958686440678e-06, "loss": 1.3154, "mean_token_accuracy": 0.7147593684494495, "num_tokens": 15155322.0, "step": 18836 }, { "epoch": 4.9888771186440675, "grad_norm": 2.2010557651519775, "learning_rate": 7.505693855932203e-06, "loss": 1.1978, "mean_token_accuracy": 0.7179779186844826, "num_tokens": 15156801.0, "step": 18838 }, { "epoch": 4.989406779661017, "grad_norm": 1.7049802541732788, "learning_rate": 7.50542902542373e-06, "loss": 1.0168, "mean_token_accuracy": 0.7944992110133171, "num_tokens": 15158359.0, "step": 18840 }, { "epoch": 4.989936440677966, "grad_norm": 1.9297748804092407, "learning_rate": 7.505164194915256e-06, "loss": 1.669, "mean_token_accuracy": 0.6140768527984619, "num_tokens": 15160196.0, "step": 18842 }, { "epoch": 4.990466101694915, "grad_norm": 1.9850058555603027, "learning_rate": 7.5048993644067805e-06, "loss": 1.2849, "mean_token_accuracy": 0.7014720663428307, "num_tokens": 15162010.0, "step": 18844 }, { "epoch": 4.990995762711864, "grad_norm": 2.466473340988159, "learning_rate": 7.504634533898306e-06, "loss": 1.4421, "mean_token_accuracy": 0.6846802979707718, "num_tokens": 15163423.0, "step": 18846 }, { "epoch": 4.991525423728813, "grad_norm": 2.385928153991699, "learning_rate": 7.504369703389831e-06, "loss": 1.0071, "mean_token_accuracy": 0.7463840991258621, "num_tokens": 15164912.0, "step": 18848 }, { "epoch": 4.992055084745763, "grad_norm": 2.22148060798645, "learning_rate": 7.504104872881357e-06, "loss": 1.743, "mean_token_accuracy": 0.5966576635837555, "num_tokens": 15166515.0, "step": 18850 }, { "epoch": 4.992584745762712, "grad_norm": 2.009369134902954, "learning_rate": 7.503840042372882e-06, "loss": 1.2213, "mean_token_accuracy": 0.7113881483674049, "num_tokens": 15168354.0, "step": 18852 }, { "epoch": 4.993114406779661, "grad_norm": 1.8056176900863647, "learning_rate": 7.503575211864408e-06, "loss": 1.198, "mean_token_accuracy": 0.7069846838712692, "num_tokens": 15169980.0, "step": 18854 }, { "epoch": 4.99364406779661, "grad_norm": 1.7300305366516113, "learning_rate": 7.503310381355933e-06, "loss": 1.3512, "mean_token_accuracy": 0.6856916099786758, "num_tokens": 15171827.0, "step": 18856 }, { "epoch": 4.99417372881356, "grad_norm": 1.9184826612472534, "learning_rate": 7.503045550847458e-06, "loss": 1.2756, "mean_token_accuracy": 0.6977286785840988, "num_tokens": 15173596.0, "step": 18858 }, { "epoch": 4.994703389830509, "grad_norm": 1.8908798694610596, "learning_rate": 7.502780720338983e-06, "loss": 1.3886, "mean_token_accuracy": 0.6751898303627968, "num_tokens": 15175162.0, "step": 18860 }, { "epoch": 4.995233050847458, "grad_norm": 2.7321701049804688, "learning_rate": 7.502515889830509e-06, "loss": 1.0599, "mean_token_accuracy": 0.7488363832235336, "num_tokens": 15176439.0, "step": 18862 }, { "epoch": 4.995762711864407, "grad_norm": 1.735994577407837, "learning_rate": 7.502251059322034e-06, "loss": 1.1539, "mean_token_accuracy": 0.7063094303011894, "num_tokens": 15178262.0, "step": 18864 }, { "epoch": 4.996292372881356, "grad_norm": 1.9860798120498657, "learning_rate": 7.501986228813561e-06, "loss": 1.1936, "mean_token_accuracy": 0.7432993352413177, "num_tokens": 15179973.0, "step": 18866 }, { "epoch": 4.996822033898305, "grad_norm": 2.3034048080444336, "learning_rate": 7.501721398305085e-06, "loss": 1.4536, "mean_token_accuracy": 0.6720183044672012, "num_tokens": 15181675.0, "step": 18868 }, { "epoch": 4.997351694915254, "grad_norm": 1.8272366523742676, "learning_rate": 7.501456567796611e-06, "loss": 1.3272, "mean_token_accuracy": 0.7093470469117165, "num_tokens": 15183281.0, "step": 18870 }, { "epoch": 4.997881355932203, "grad_norm": 1.9163074493408203, "learning_rate": 7.501191737288136e-06, "loss": 0.8826, "mean_token_accuracy": 0.7745647579431534, "num_tokens": 15184640.0, "step": 18872 }, { "epoch": 4.998411016949152, "grad_norm": 1.865679383277893, "learning_rate": 7.500926906779662e-06, "loss": 1.0605, "mean_token_accuracy": 0.7446741685271263, "num_tokens": 15186576.0, "step": 18874 }, { "epoch": 4.998940677966102, "grad_norm": 2.214076519012451, "learning_rate": 7.500662076271187e-06, "loss": 1.4822, "mean_token_accuracy": 0.6691631972789764, "num_tokens": 15188074.0, "step": 18876 }, { "epoch": 4.999470338983051, "grad_norm": 1.8551980257034302, "learning_rate": 7.500397245762713e-06, "loss": 1.0376, "mean_token_accuracy": 0.7509828582406044, "num_tokens": 15189687.0, "step": 18878 }, { "epoch": 5.0, "grad_norm": 2.252692461013794, "learning_rate": 7.500132415254238e-06, "loss": 1.4614, "mean_token_accuracy": 0.686891220510006, "num_tokens": 15191340.0, "step": 18880 }, { "epoch": 5.000529661016949, "grad_norm": 1.8679735660552979, "learning_rate": 7.4998675847457635e-06, "loss": 0.9798, "mean_token_accuracy": 0.7465677186846733, "num_tokens": 15192845.0, "step": 18882 }, { "epoch": 5.001059322033898, "grad_norm": 2.083850622177124, "learning_rate": 7.4996027542372884e-06, "loss": 1.2117, "mean_token_accuracy": 0.6812767833471298, "num_tokens": 15194671.0, "step": 18884 }, { "epoch": 5.001588983050848, "grad_norm": 1.6857781410217285, "learning_rate": 7.499337923728814e-06, "loss": 1.1073, "mean_token_accuracy": 0.7148718908429146, "num_tokens": 15196219.0, "step": 18886 }, { "epoch": 5.002118644067797, "grad_norm": 2.3334290981292725, "learning_rate": 7.499073093220339e-06, "loss": 1.5859, "mean_token_accuracy": 0.6769496724009514, "num_tokens": 15197839.0, "step": 18888 }, { "epoch": 5.002648305084746, "grad_norm": 1.5872852802276611, "learning_rate": 7.498808262711865e-06, "loss": 1.1338, "mean_token_accuracy": 0.7351405769586563, "num_tokens": 15199500.0, "step": 18890 }, { "epoch": 5.003177966101695, "grad_norm": 1.5700244903564453, "learning_rate": 7.49854343220339e-06, "loss": 1.1023, "mean_token_accuracy": 0.7445184737443924, "num_tokens": 15201205.0, "step": 18892 }, { "epoch": 5.0037076271186445, "grad_norm": 2.2257792949676514, "learning_rate": 7.4982786016949165e-06, "loss": 1.2703, "mean_token_accuracy": 0.7103397399187088, "num_tokens": 15202816.0, "step": 18894 }, { "epoch": 5.004237288135593, "grad_norm": 1.9999200105667114, "learning_rate": 7.4980137711864405e-06, "loss": 1.2284, "mean_token_accuracy": 0.7211213111877441, "num_tokens": 15204538.0, "step": 18896 }, { "epoch": 5.004766949152542, "grad_norm": 1.9967856407165527, "learning_rate": 7.497748940677967e-06, "loss": 1.4102, "mean_token_accuracy": 0.6811489909887314, "num_tokens": 15206176.0, "step": 18898 }, { "epoch": 5.005296610169491, "grad_norm": 2.35428524017334, "learning_rate": 7.497484110169492e-06, "loss": 1.1085, "mean_token_accuracy": 0.7178188040852547, "num_tokens": 15207756.0, "step": 18900 }, { "epoch": 5.00582627118644, "grad_norm": 2.067330837249756, "learning_rate": 7.497219279661018e-06, "loss": 1.0399, "mean_token_accuracy": 0.7498259395360947, "num_tokens": 15209252.0, "step": 18902 }, { "epoch": 5.00635593220339, "grad_norm": 2.063710927963257, "learning_rate": 7.496954449152543e-06, "loss": 1.367, "mean_token_accuracy": 0.7163507342338562, "num_tokens": 15211023.0, "step": 18904 }, { "epoch": 5.006885593220339, "grad_norm": 1.7168278694152832, "learning_rate": 7.496689618644069e-06, "loss": 0.8865, "mean_token_accuracy": 0.7783715128898621, "num_tokens": 15212670.0, "step": 18906 }, { "epoch": 5.007415254237288, "grad_norm": 2.2849013805389404, "learning_rate": 7.4964247881355935e-06, "loss": 1.3561, "mean_token_accuracy": 0.6923702955245972, "num_tokens": 15214395.0, "step": 18908 }, { "epoch": 5.007944915254237, "grad_norm": 2.2502286434173584, "learning_rate": 7.496159957627119e-06, "loss": 1.0273, "mean_token_accuracy": 0.7466889843344688, "num_tokens": 15215874.0, "step": 18910 }, { "epoch": 5.008474576271187, "grad_norm": 1.4067524671554565, "learning_rate": 7.495895127118644e-06, "loss": 0.9681, "mean_token_accuracy": 0.7621616944670677, "num_tokens": 15217679.0, "step": 18912 }, { "epoch": 5.009004237288136, "grad_norm": 1.9777300357818604, "learning_rate": 7.49563029661017e-06, "loss": 1.02, "mean_token_accuracy": 0.7409248650074005, "num_tokens": 15220133.0, "step": 18914 }, { "epoch": 5.009533898305085, "grad_norm": 1.7658113241195679, "learning_rate": 7.495365466101695e-06, "loss": 0.997, "mean_token_accuracy": 0.7752309367060661, "num_tokens": 15221687.0, "step": 18916 }, { "epoch": 5.010063559322034, "grad_norm": 1.7986276149749756, "learning_rate": 7.495100635593221e-06, "loss": 1.2246, "mean_token_accuracy": 0.7150793820619583, "num_tokens": 15223387.0, "step": 18918 }, { "epoch": 5.010593220338983, "grad_norm": 2.3819282054901123, "learning_rate": 7.494835805084746e-06, "loss": 1.2563, "mean_token_accuracy": 0.7236559838056564, "num_tokens": 15224930.0, "step": 18920 }, { "epoch": 5.0111228813559325, "grad_norm": 1.8866690397262573, "learning_rate": 7.494570974576271e-06, "loss": 1.0584, "mean_token_accuracy": 0.7618273124098778, "num_tokens": 15226515.0, "step": 18922 }, { "epoch": 5.0116525423728815, "grad_norm": 1.6753687858581543, "learning_rate": 7.494306144067798e-06, "loss": 0.9384, "mean_token_accuracy": 0.7553658038377762, "num_tokens": 15228007.0, "step": 18924 }, { "epoch": 5.0121822033898304, "grad_norm": 1.7656729221343994, "learning_rate": 7.494041313559323e-06, "loss": 0.9706, "mean_token_accuracy": 0.7611035406589508, "num_tokens": 15229673.0, "step": 18926 }, { "epoch": 5.012711864406779, "grad_norm": 1.8896100521087646, "learning_rate": 7.493776483050849e-06, "loss": 1.0495, "mean_token_accuracy": 0.7585197985172272, "num_tokens": 15231229.0, "step": 18928 }, { "epoch": 5.013241525423729, "grad_norm": 2.0829176902770996, "learning_rate": 7.493511652542374e-06, "loss": 1.4081, "mean_token_accuracy": 0.6855699121952057, "num_tokens": 15232621.0, "step": 18930 }, { "epoch": 5.013771186440678, "grad_norm": 2.20174503326416, "learning_rate": 7.4932468220338994e-06, "loss": 1.5489, "mean_token_accuracy": 0.6469855234026909, "num_tokens": 15234040.0, "step": 18932 }, { "epoch": 5.014300847457627, "grad_norm": 2.169257640838623, "learning_rate": 7.492981991525424e-06, "loss": 1.6435, "mean_token_accuracy": 0.6695581041276455, "num_tokens": 15236371.0, "step": 18934 }, { "epoch": 5.014830508474576, "grad_norm": 2.2137043476104736, "learning_rate": 7.49271716101695e-06, "loss": 0.9096, "mean_token_accuracy": 0.7453394830226898, "num_tokens": 15237717.0, "step": 18936 }, { "epoch": 5.015360169491525, "grad_norm": 1.948246955871582, "learning_rate": 7.492452330508475e-06, "loss": 0.9019, "mean_token_accuracy": 0.7812594845890999, "num_tokens": 15239487.0, "step": 18938 }, { "epoch": 5.015889830508475, "grad_norm": 1.9857028722763062, "learning_rate": 7.492187500000001e-06, "loss": 1.3864, "mean_token_accuracy": 0.6853371560573578, "num_tokens": 15240915.0, "step": 18940 }, { "epoch": 5.016419491525424, "grad_norm": 1.883506178855896, "learning_rate": 7.491922669491526e-06, "loss": 0.7558, "mean_token_accuracy": 0.8036978170275688, "num_tokens": 15242695.0, "step": 18942 }, { "epoch": 5.016949152542373, "grad_norm": 2.0998642444610596, "learning_rate": 7.4916578389830516e-06, "loss": 1.0248, "mean_token_accuracy": 0.7733720988035202, "num_tokens": 15244180.0, "step": 18944 }, { "epoch": 5.017478813559322, "grad_norm": 2.1842589378356934, "learning_rate": 7.4913930084745765e-06, "loss": 0.8848, "mean_token_accuracy": 0.8021498546004295, "num_tokens": 15245576.0, "step": 18946 }, { "epoch": 5.018008474576271, "grad_norm": 2.0387916564941406, "learning_rate": 7.491128177966103e-06, "loss": 0.8595, "mean_token_accuracy": 0.8013206645846367, "num_tokens": 15247087.0, "step": 18948 }, { "epoch": 5.018538135593221, "grad_norm": 1.7032904624938965, "learning_rate": 7.490863347457627e-06, "loss": 1.1392, "mean_token_accuracy": 0.7472624108195305, "num_tokens": 15248508.0, "step": 18950 }, { "epoch": 5.0190677966101696, "grad_norm": 2.1739485263824463, "learning_rate": 7.490598516949154e-06, "loss": 1.4289, "mean_token_accuracy": 0.6715710088610649, "num_tokens": 15250104.0, "step": 18952 }, { "epoch": 5.0195974576271185, "grad_norm": 2.0572867393493652, "learning_rate": 7.490333686440679e-06, "loss": 1.4569, "mean_token_accuracy": 0.6633047759532928, "num_tokens": 15251776.0, "step": 18954 }, { "epoch": 5.0201271186440675, "grad_norm": 1.7296855449676514, "learning_rate": 7.4900688559322045e-06, "loss": 1.3018, "mean_token_accuracy": 0.7137466222047806, "num_tokens": 15253592.0, "step": 18956 }, { "epoch": 5.020656779661017, "grad_norm": 2.3511784076690674, "learning_rate": 7.4898040254237294e-06, "loss": 1.1062, "mean_token_accuracy": 0.7377027794718742, "num_tokens": 15254925.0, "step": 18958 }, { "epoch": 5.021186440677966, "grad_norm": 1.9245734214782715, "learning_rate": 7.489539194915255e-06, "loss": 1.1345, "mean_token_accuracy": 0.7141507640480995, "num_tokens": 15256836.0, "step": 18960 }, { "epoch": 5.021716101694915, "grad_norm": 2.342406988143921, "learning_rate": 7.48927436440678e-06, "loss": 1.1255, "mean_token_accuracy": 0.7397210597991943, "num_tokens": 15258459.0, "step": 18962 }, { "epoch": 5.022245762711864, "grad_norm": 2.2547104358673096, "learning_rate": 7.489009533898306e-06, "loss": 1.2709, "mean_token_accuracy": 0.6968988105654716, "num_tokens": 15260108.0, "step": 18964 }, { "epoch": 5.022775423728813, "grad_norm": 1.944953203201294, "learning_rate": 7.488744703389831e-06, "loss": 1.2375, "mean_token_accuracy": 0.6813494563102722, "num_tokens": 15262171.0, "step": 18966 }, { "epoch": 5.023305084745763, "grad_norm": 1.8077735900878906, "learning_rate": 7.488479872881357e-06, "loss": 0.8541, "mean_token_accuracy": 0.7863758429884911, "num_tokens": 15263824.0, "step": 18968 }, { "epoch": 5.023834745762712, "grad_norm": 2.315061330795288, "learning_rate": 7.4882150423728816e-06, "loss": 1.4243, "mean_token_accuracy": 0.7041074633598328, "num_tokens": 15265383.0, "step": 18970 }, { "epoch": 5.024364406779661, "grad_norm": 2.000389337539673, "learning_rate": 7.487950211864407e-06, "loss": 1.2004, "mean_token_accuracy": 0.725064292550087, "num_tokens": 15267097.0, "step": 18972 }, { "epoch": 5.02489406779661, "grad_norm": 2.0425169467926025, "learning_rate": 7.487685381355932e-06, "loss": 1.2139, "mean_token_accuracy": 0.7188255116343498, "num_tokens": 15268776.0, "step": 18974 }, { "epoch": 5.02542372881356, "grad_norm": 2.118957042694092, "learning_rate": 7.487420550847458e-06, "loss": 1.3462, "mean_token_accuracy": 0.7086291387677193, "num_tokens": 15270423.0, "step": 18976 }, { "epoch": 5.025953389830509, "grad_norm": 1.6310957670211792, "learning_rate": 7.487155720338983e-06, "loss": 1.2522, "mean_token_accuracy": 0.7423930019140244, "num_tokens": 15272181.0, "step": 18978 }, { "epoch": 5.026483050847458, "grad_norm": 2.0539305210113525, "learning_rate": 7.48689088983051e-06, "loss": 1.281, "mean_token_accuracy": 0.7034332156181335, "num_tokens": 15273900.0, "step": 18980 }, { "epoch": 5.027012711864407, "grad_norm": 2.2266571521759033, "learning_rate": 7.4866260593220345e-06, "loss": 1.2789, "mean_token_accuracy": 0.7048804983496666, "num_tokens": 15275386.0, "step": 18982 }, { "epoch": 5.0275423728813555, "grad_norm": 2.6603822708129883, "learning_rate": 7.48636122881356e-06, "loss": 1.2724, "mean_token_accuracy": 0.7039337977766991, "num_tokens": 15276799.0, "step": 18984 }, { "epoch": 5.028072033898305, "grad_norm": 1.7649763822555542, "learning_rate": 7.486096398305085e-06, "loss": 1.1052, "mean_token_accuracy": 0.7329946085810661, "num_tokens": 15278518.0, "step": 18986 }, { "epoch": 5.028601694915254, "grad_norm": 1.834441900253296, "learning_rate": 7.485831567796611e-06, "loss": 1.4622, "mean_token_accuracy": 0.6836521252989769, "num_tokens": 15280273.0, "step": 18988 }, { "epoch": 5.029131355932203, "grad_norm": 1.8841006755828857, "learning_rate": 7.485566737288136e-06, "loss": 1.4839, "mean_token_accuracy": 0.6679976060986519, "num_tokens": 15282071.0, "step": 18990 }, { "epoch": 5.029661016949152, "grad_norm": 2.2211554050445557, "learning_rate": 7.485301906779662e-06, "loss": 1.4995, "mean_token_accuracy": 0.6584079712629318, "num_tokens": 15283948.0, "step": 18992 }, { "epoch": 5.030190677966102, "grad_norm": 2.1705985069274902, "learning_rate": 7.485037076271187e-06, "loss": 1.7835, "mean_token_accuracy": 0.62353590503335, "num_tokens": 15285618.0, "step": 18994 }, { "epoch": 5.030720338983051, "grad_norm": 2.37410569190979, "learning_rate": 7.484772245762712e-06, "loss": 1.2222, "mean_token_accuracy": 0.7154550850391388, "num_tokens": 15287177.0, "step": 18996 }, { "epoch": 5.03125, "grad_norm": 1.8342547416687012, "learning_rate": 7.484507415254237e-06, "loss": 0.9977, "mean_token_accuracy": 0.765982910990715, "num_tokens": 15288593.0, "step": 18998 }, { "epoch": 5.031779661016949, "grad_norm": 2.1332643032073975, "learning_rate": 7.484242584745763e-06, "loss": 1.5056, "step": 19000 }, { "epoch": 5.031779661016949, "eval_loss": 1.3158868551254272, "eval_mean_token_accuracy": 0.7012854894840872, "eval_num_tokens": 15290157.0, "eval_runtime": 48.1121, "eval_samples_per_second": 6.402, "eval_steps_per_second": 6.402, "step": 19000 }, { "epoch": 5.032309322033898, "grad_norm": 1.7705745697021484, "learning_rate": 7.483977754237288e-06, "loss": 1.3199, "mean_token_accuracy": 0.6947316899895668, "num_tokens": 15291796.0, "step": 19002 }, { "epoch": 5.032838983050848, "grad_norm": 2.2618942260742188, "learning_rate": 7.483712923728814e-06, "loss": 1.226, "mean_token_accuracy": 0.7371056824922562, "num_tokens": 15293140.0, "step": 19004 }, { "epoch": 5.033368644067797, "grad_norm": 1.8957635164260864, "learning_rate": 7.483448093220339e-06, "loss": 1.7465, "mean_token_accuracy": 0.6252212822437286, "num_tokens": 15294832.0, "step": 19006 }, { "epoch": 5.033898305084746, "grad_norm": 1.8882403373718262, "learning_rate": 7.483183262711865e-06, "loss": 1.2291, "mean_token_accuracy": 0.7205872312188148, "num_tokens": 15296625.0, "step": 19008 }, { "epoch": 5.034427966101695, "grad_norm": 2.3973042964935303, "learning_rate": 7.482918432203391e-06, "loss": 1.2236, "mean_token_accuracy": 0.7121957466006279, "num_tokens": 15298157.0, "step": 19010 }, { "epoch": 5.0349576271186445, "grad_norm": 2.2942466735839844, "learning_rate": 7.482653601694916e-06, "loss": 1.3281, "mean_token_accuracy": 0.7095955088734627, "num_tokens": 15299665.0, "step": 19012 }, { "epoch": 5.035487288135593, "grad_norm": 2.508671522140503, "learning_rate": 7.482388771186442e-06, "loss": 0.9653, "mean_token_accuracy": 0.775764100253582, "num_tokens": 15301880.0, "step": 19014 }, { "epoch": 5.036016949152542, "grad_norm": 2.4965507984161377, "learning_rate": 7.482123940677967e-06, "loss": 1.4836, "mean_token_accuracy": 0.6861923933029175, "num_tokens": 15303426.0, "step": 19016 }, { "epoch": 5.036546610169491, "grad_norm": 1.9538614749908447, "learning_rate": 7.4818591101694926e-06, "loss": 1.2337, "mean_token_accuracy": 0.7124964818358421, "num_tokens": 15304841.0, "step": 19018 }, { "epoch": 5.03707627118644, "grad_norm": 1.8822853565216064, "learning_rate": 7.4815942796610175e-06, "loss": 1.2742, "mean_token_accuracy": 0.709081269800663, "num_tokens": 15306450.0, "step": 19020 }, { "epoch": 5.03760593220339, "grad_norm": 2.4856879711151123, "learning_rate": 7.481329449152543e-06, "loss": 1.2023, "mean_token_accuracy": 0.7304684668779373, "num_tokens": 15307771.0, "step": 19022 }, { "epoch": 5.038135593220339, "grad_norm": 2.0546493530273438, "learning_rate": 7.481064618644068e-06, "loss": 1.1431, "mean_token_accuracy": 0.7344129979610443, "num_tokens": 15309603.0, "step": 19024 }, { "epoch": 5.038665254237288, "grad_norm": 2.073934316635132, "learning_rate": 7.480799788135594e-06, "loss": 0.9653, "mean_token_accuracy": 0.7672927901148796, "num_tokens": 15311258.0, "step": 19026 }, { "epoch": 5.039194915254237, "grad_norm": 2.221806764602661, "learning_rate": 7.480534957627119e-06, "loss": 1.3625, "mean_token_accuracy": 0.7125323638319969, "num_tokens": 15312932.0, "step": 19028 }, { "epoch": 5.039724576271187, "grad_norm": 2.045001268386841, "learning_rate": 7.480270127118645e-06, "loss": 1.2746, "mean_token_accuracy": 0.6970388889312744, "num_tokens": 15314595.0, "step": 19030 }, { "epoch": 5.040254237288136, "grad_norm": 2.388543128967285, "learning_rate": 7.48000529661017e-06, "loss": 1.384, "mean_token_accuracy": 0.6812694445252419, "num_tokens": 15316076.0, "step": 19032 }, { "epoch": 5.040783898305085, "grad_norm": 2.246983528137207, "learning_rate": 7.479740466101696e-06, "loss": 1.749, "mean_token_accuracy": 0.6642170585691929, "num_tokens": 15317631.0, "step": 19034 }, { "epoch": 5.041313559322034, "grad_norm": 1.8480571508407593, "learning_rate": 7.479475635593221e-06, "loss": 1.0626, "mean_token_accuracy": 0.7388193607330322, "num_tokens": 15319229.0, "step": 19036 }, { "epoch": 5.041843220338983, "grad_norm": 2.8008527755737305, "learning_rate": 7.479210805084747e-06, "loss": 1.4805, "mean_token_accuracy": 0.6880454681813717, "num_tokens": 15320458.0, "step": 19038 }, { "epoch": 5.0423728813559325, "grad_norm": 2.1263630390167236, "learning_rate": 7.478945974576272e-06, "loss": 1.3496, "mean_token_accuracy": 0.6977141201496124, "num_tokens": 15321939.0, "step": 19040 }, { "epoch": 5.0429025423728815, "grad_norm": 2.2978482246398926, "learning_rate": 7.478681144067798e-06, "loss": 1.0774, "mean_token_accuracy": 0.748648889362812, "num_tokens": 15323388.0, "step": 19042 }, { "epoch": 5.0434322033898304, "grad_norm": 2.128023862838745, "learning_rate": 7.4784163135593226e-06, "loss": 1.2769, "mean_token_accuracy": 0.7179771959781647, "num_tokens": 15324809.0, "step": 19044 }, { "epoch": 5.043961864406779, "grad_norm": 2.237797498703003, "learning_rate": 7.478151483050848e-06, "loss": 1.5098, "mean_token_accuracy": 0.6783309280872345, "num_tokens": 15326447.0, "step": 19046 }, { "epoch": 5.044491525423729, "grad_norm": 2.4825923442840576, "learning_rate": 7.477886652542373e-06, "loss": 1.574, "mean_token_accuracy": 0.6533186361193657, "num_tokens": 15327821.0, "step": 19048 }, { "epoch": 5.045021186440678, "grad_norm": 2.3156557083129883, "learning_rate": 7.477621822033899e-06, "loss": 1.5313, "mean_token_accuracy": 0.6604345813393593, "num_tokens": 15329543.0, "step": 19050 }, { "epoch": 5.045550847457627, "grad_norm": 2.5209641456604004, "learning_rate": 7.477356991525424e-06, "loss": 1.3793, "mean_token_accuracy": 0.6759216338396072, "num_tokens": 15331098.0, "step": 19052 }, { "epoch": 5.046080508474576, "grad_norm": 2.2381491661071777, "learning_rate": 7.47709216101695e-06, "loss": 1.5282, "mean_token_accuracy": 0.6563068106770515, "num_tokens": 15332688.0, "step": 19054 }, { "epoch": 5.046610169491525, "grad_norm": 2.0556247234344482, "learning_rate": 7.476827330508475e-06, "loss": 0.8935, "mean_token_accuracy": 0.7802921757102013, "num_tokens": 15334202.0, "step": 19056 }, { "epoch": 5.047139830508475, "grad_norm": 2.27469539642334, "learning_rate": 7.4765625000000005e-06, "loss": 1.3197, "mean_token_accuracy": 0.7038799300789833, "num_tokens": 15335711.0, "step": 19058 }, { "epoch": 5.047669491525424, "grad_norm": 2.152106523513794, "learning_rate": 7.476297669491525e-06, "loss": 1.2107, "mean_token_accuracy": 0.7117888703942299, "num_tokens": 15337299.0, "step": 19060 }, { "epoch": 5.048199152542373, "grad_norm": 2.443779706954956, "learning_rate": 7.476032838983052e-06, "loss": 1.1316, "mean_token_accuracy": 0.7157393097877502, "num_tokens": 15338904.0, "step": 19062 }, { "epoch": 5.048728813559322, "grad_norm": 2.4931130409240723, "learning_rate": 7.475768008474576e-06, "loss": 1.2614, "mean_token_accuracy": 0.7172752991318703, "num_tokens": 15340226.0, "step": 19064 }, { "epoch": 5.049258474576271, "grad_norm": 2.5252766609191895, "learning_rate": 7.475503177966103e-06, "loss": 1.4432, "mean_token_accuracy": 0.6760422512888908, "num_tokens": 15341823.0, "step": 19066 }, { "epoch": 5.049788135593221, "grad_norm": 2.4880316257476807, "learning_rate": 7.475238347457628e-06, "loss": 1.6918, "mean_token_accuracy": 0.6439758464694023, "num_tokens": 15343311.0, "step": 19068 }, { "epoch": 5.0503177966101696, "grad_norm": 1.8624132871627808, "learning_rate": 7.4749735169491534e-06, "loss": 1.0397, "mean_token_accuracy": 0.7553931772708893, "num_tokens": 15345168.0, "step": 19070 }, { "epoch": 5.0508474576271185, "grad_norm": 1.9169594049453735, "learning_rate": 7.474708686440678e-06, "loss": 1.1128, "mean_token_accuracy": 0.7369439974427223, "num_tokens": 15346746.0, "step": 19072 }, { "epoch": 5.0513771186440675, "grad_norm": 1.9399075508117676, "learning_rate": 7.474443855932204e-06, "loss": 0.9338, "mean_token_accuracy": 0.7576913088560104, "num_tokens": 15348372.0, "step": 19074 }, { "epoch": 5.051906779661017, "grad_norm": 1.8242428302764893, "learning_rate": 7.474179025423729e-06, "loss": 1.0848, "mean_token_accuracy": 0.7527153789997101, "num_tokens": 15349999.0, "step": 19076 }, { "epoch": 5.052436440677966, "grad_norm": 2.154545545578003, "learning_rate": 7.473914194915255e-06, "loss": 1.1892, "mean_token_accuracy": 0.7298869863152504, "num_tokens": 15351728.0, "step": 19078 }, { "epoch": 5.052966101694915, "grad_norm": 2.4663748741149902, "learning_rate": 7.47364936440678e-06, "loss": 1.4492, "mean_token_accuracy": 0.6660315096378326, "num_tokens": 15353472.0, "step": 19080 }, { "epoch": 5.053495762711864, "grad_norm": 2.1854653358459473, "learning_rate": 7.4733845338983055e-06, "loss": 0.869, "mean_token_accuracy": 0.7853631302714348, "num_tokens": 15355245.0, "step": 19082 }, { "epoch": 5.054025423728813, "grad_norm": 2.26704478263855, "learning_rate": 7.4731197033898305e-06, "loss": 0.7892, "mean_token_accuracy": 0.7980312332510948, "num_tokens": 15356670.0, "step": 19084 }, { "epoch": 5.054555084745763, "grad_norm": 1.8455876111984253, "learning_rate": 7.472854872881356e-06, "loss": 1.2608, "mean_token_accuracy": 0.7355344407260418, "num_tokens": 15358354.0, "step": 19086 }, { "epoch": 5.055084745762712, "grad_norm": 2.3013064861297607, "learning_rate": 7.472590042372881e-06, "loss": 1.4914, "mean_token_accuracy": 0.6818147376179695, "num_tokens": 15359958.0, "step": 19088 }, { "epoch": 5.055614406779661, "grad_norm": 2.3278284072875977, "learning_rate": 7.472325211864408e-06, "loss": 1.2976, "mean_token_accuracy": 0.7115022018551826, "num_tokens": 15361425.0, "step": 19090 }, { "epoch": 5.05614406779661, "grad_norm": 2.14030122756958, "learning_rate": 7.472060381355934e-06, "loss": 0.9302, "mean_token_accuracy": 0.7968765869736671, "num_tokens": 15362770.0, "step": 19092 }, { "epoch": 5.05667372881356, "grad_norm": 4.154926300048828, "learning_rate": 7.4717955508474585e-06, "loss": 0.9939, "mean_token_accuracy": 0.7426692098379135, "num_tokens": 15364267.0, "step": 19094 }, { "epoch": 5.057203389830509, "grad_norm": 3.2484536170959473, "learning_rate": 7.471530720338984e-06, "loss": 1.1743, "mean_token_accuracy": 0.7019148766994476, "num_tokens": 15365553.0, "step": 19096 }, { "epoch": 5.057733050847458, "grad_norm": 2.070878267288208, "learning_rate": 7.471265889830509e-06, "loss": 1.2772, "mean_token_accuracy": 0.7088722661137581, "num_tokens": 15367197.0, "step": 19098 }, { "epoch": 5.058262711864407, "grad_norm": 2.759768486022949, "learning_rate": 7.471001059322035e-06, "loss": 0.9614, "mean_token_accuracy": 0.7674932032823563, "num_tokens": 15368680.0, "step": 19100 }, { "epoch": 5.0587923728813555, "grad_norm": 1.8946200609207153, "learning_rate": 7.47073622881356e-06, "loss": 1.176, "mean_token_accuracy": 0.7321812435984612, "num_tokens": 15370508.0, "step": 19102 }, { "epoch": 5.059322033898305, "grad_norm": 1.7563647031784058, "learning_rate": 7.470471398305086e-06, "loss": 0.7273, "mean_token_accuracy": 0.8123210072517395, "num_tokens": 15371973.0, "step": 19104 }, { "epoch": 5.059851694915254, "grad_norm": 2.153573989868164, "learning_rate": 7.470206567796611e-06, "loss": 1.3438, "mean_token_accuracy": 0.6860145926475525, "num_tokens": 15373549.0, "step": 19106 }, { "epoch": 5.060381355932203, "grad_norm": 2.002004623413086, "learning_rate": 7.469941737288136e-06, "loss": 1.0452, "mean_token_accuracy": 0.7444118410348892, "num_tokens": 15375252.0, "step": 19108 }, { "epoch": 5.060911016949152, "grad_norm": 1.797493577003479, "learning_rate": 7.469676906779661e-06, "loss": 1.3325, "mean_token_accuracy": 0.6979402005672455, "num_tokens": 15376933.0, "step": 19110 }, { "epoch": 5.061440677966102, "grad_norm": 2.272167921066284, "learning_rate": 7.469412076271187e-06, "loss": 1.0456, "mean_token_accuracy": 0.7458030432462692, "num_tokens": 15378331.0, "step": 19112 }, { "epoch": 5.061970338983051, "grad_norm": 2.2985646724700928, "learning_rate": 7.469147245762712e-06, "loss": 1.5171, "mean_token_accuracy": 0.6537579745054245, "num_tokens": 15379996.0, "step": 19114 }, { "epoch": 5.0625, "grad_norm": 2.5899999141693115, "learning_rate": 7.468882415254239e-06, "loss": 1.2756, "mean_token_accuracy": 0.7215125858783722, "num_tokens": 15381321.0, "step": 19116 }, { "epoch": 5.063029661016949, "grad_norm": 2.3216583728790283, "learning_rate": 7.468617584745763e-06, "loss": 1.2623, "mean_token_accuracy": 0.7034601122140884, "num_tokens": 15382660.0, "step": 19118 }, { "epoch": 5.063559322033898, "grad_norm": 2.4797329902648926, "learning_rate": 7.468352754237289e-06, "loss": 0.9927, "mean_token_accuracy": 0.7401976585388184, "num_tokens": 15384128.0, "step": 19120 }, { "epoch": 5.064088983050848, "grad_norm": 1.9315296411514282, "learning_rate": 7.468087923728814e-06, "loss": 1.0826, "mean_token_accuracy": 0.7337975278496742, "num_tokens": 15385932.0, "step": 19122 }, { "epoch": 5.064618644067797, "grad_norm": 1.6846799850463867, "learning_rate": 7.46782309322034e-06, "loss": 0.6204, "mean_token_accuracy": 0.8413264229893684, "num_tokens": 15387568.0, "step": 19124 }, { "epoch": 5.065148305084746, "grad_norm": 2.003664493560791, "learning_rate": 7.467558262711865e-06, "loss": 1.2228, "mean_token_accuracy": 0.7297121584415436, "num_tokens": 15389267.0, "step": 19126 }, { "epoch": 5.065677966101695, "grad_norm": 2.5112080574035645, "learning_rate": 7.467293432203391e-06, "loss": 1.4381, "mean_token_accuracy": 0.6958851739764214, "num_tokens": 15390903.0, "step": 19128 }, { "epoch": 5.0662076271186445, "grad_norm": 2.1081182956695557, "learning_rate": 7.467028601694916e-06, "loss": 1.0949, "mean_token_accuracy": 0.7241570129990578, "num_tokens": 15392537.0, "step": 19130 }, { "epoch": 5.066737288135593, "grad_norm": 2.2617859840393066, "learning_rate": 7.4667637711864415e-06, "loss": 1.4776, "mean_token_accuracy": 0.6764518991112709, "num_tokens": 15394093.0, "step": 19132 }, { "epoch": 5.067266949152542, "grad_norm": 2.7054646015167236, "learning_rate": 7.466498940677966e-06, "loss": 1.396, "mean_token_accuracy": 0.6711509898304939, "num_tokens": 15395823.0, "step": 19134 }, { "epoch": 5.067796610169491, "grad_norm": 2.2594432830810547, "learning_rate": 7.466234110169492e-06, "loss": 1.5712, "mean_token_accuracy": 0.6399176195263863, "num_tokens": 15397571.0, "step": 19136 }, { "epoch": 5.06832627118644, "grad_norm": 2.079389810562134, "learning_rate": 7.465969279661017e-06, "loss": 1.1526, "mean_token_accuracy": 0.7441123351454735, "num_tokens": 15399282.0, "step": 19138 }, { "epoch": 5.06885593220339, "grad_norm": 2.4701600074768066, "learning_rate": 7.465704449152543e-06, "loss": 1.1887, "mean_token_accuracy": 0.7203608155250549, "num_tokens": 15400967.0, "step": 19140 }, { "epoch": 5.069385593220339, "grad_norm": 1.9766178131103516, "learning_rate": 7.465439618644068e-06, "loss": 0.9885, "mean_token_accuracy": 0.7832854092121124, "num_tokens": 15402478.0, "step": 19142 }, { "epoch": 5.069915254237288, "grad_norm": 2.8986659049987793, "learning_rate": 7.4651747881355944e-06, "loss": 1.2639, "mean_token_accuracy": 0.7193495556712151, "num_tokens": 15403815.0, "step": 19144 }, { "epoch": 5.070444915254237, "grad_norm": 2.2505671977996826, "learning_rate": 7.4649099576271185e-06, "loss": 1.3334, "mean_token_accuracy": 0.6758092641830444, "num_tokens": 15406094.0, "step": 19146 }, { "epoch": 5.070974576271187, "grad_norm": 2.1931533813476562, "learning_rate": 7.464645127118645e-06, "loss": 1.3162, "mean_token_accuracy": 0.6927709057927132, "num_tokens": 15407572.0, "step": 19148 }, { "epoch": 5.071504237288136, "grad_norm": 3.010565996170044, "learning_rate": 7.46438029661017e-06, "loss": 1.1034, "mean_token_accuracy": 0.7390336319804192, "num_tokens": 15408856.0, "step": 19150 }, { "epoch": 5.072033898305085, "grad_norm": 2.341217517852783, "learning_rate": 7.464115466101696e-06, "loss": 1.5261, "mean_token_accuracy": 0.6788923591375351, "num_tokens": 15410220.0, "step": 19152 }, { "epoch": 5.072563559322034, "grad_norm": 1.8736158609390259, "learning_rate": 7.463850635593221e-06, "loss": 1.065, "mean_token_accuracy": 0.7388318032026291, "num_tokens": 15411804.0, "step": 19154 }, { "epoch": 5.073093220338983, "grad_norm": 2.76145339012146, "learning_rate": 7.4635858050847466e-06, "loss": 1.2417, "mean_token_accuracy": 0.6980887278914452, "num_tokens": 15413159.0, "step": 19156 }, { "epoch": 5.0736228813559325, "grad_norm": 2.1542863845825195, "learning_rate": 7.4633209745762715e-06, "loss": 1.1134, "mean_token_accuracy": 0.7303661853075027, "num_tokens": 15415018.0, "step": 19158 }, { "epoch": 5.0741525423728815, "grad_norm": 2.286078929901123, "learning_rate": 7.463056144067797e-06, "loss": 1.1773, "mean_token_accuracy": 0.724915437400341, "num_tokens": 15416622.0, "step": 19160 }, { "epoch": 5.0746822033898304, "grad_norm": 2.308793783187866, "learning_rate": 7.462791313559322e-06, "loss": 1.1079, "mean_token_accuracy": 0.734176442027092, "num_tokens": 15418309.0, "step": 19162 }, { "epoch": 5.075211864406779, "grad_norm": 2.030073404312134, "learning_rate": 7.462526483050848e-06, "loss": 1.3406, "mean_token_accuracy": 0.6919030733406544, "num_tokens": 15420035.0, "step": 19164 }, { "epoch": 5.075741525423729, "grad_norm": 2.3860690593719482, "learning_rate": 7.462261652542373e-06, "loss": 1.6153, "mean_token_accuracy": 0.6573633179068565, "num_tokens": 15421521.0, "step": 19166 }, { "epoch": 5.076271186440678, "grad_norm": 2.010154962539673, "learning_rate": 7.461996822033899e-06, "loss": 1.525, "mean_token_accuracy": 0.6847239658236504, "num_tokens": 15423396.0, "step": 19168 }, { "epoch": 5.076800847457627, "grad_norm": 2.1899256706237793, "learning_rate": 7.461731991525424e-06, "loss": 1.2122, "mean_token_accuracy": 0.7208480462431908, "num_tokens": 15425015.0, "step": 19170 }, { "epoch": 5.077330508474576, "grad_norm": 1.893692970275879, "learning_rate": 7.461467161016949e-06, "loss": 1.3494, "mean_token_accuracy": 0.6415493860840797, "num_tokens": 15427225.0, "step": 19172 }, { "epoch": 5.077860169491525, "grad_norm": 2.0943686962127686, "learning_rate": 7.461202330508474e-06, "loss": 1.3217, "mean_token_accuracy": 0.6984466537833214, "num_tokens": 15429023.0, "step": 19174 }, { "epoch": 5.078389830508475, "grad_norm": 2.0285959243774414, "learning_rate": 7.460937500000001e-06, "loss": 1.7349, "mean_token_accuracy": 0.6110147051513195, "num_tokens": 15430960.0, "step": 19176 }, { "epoch": 5.078919491525424, "grad_norm": 2.5451743602752686, "learning_rate": 7.460672669491527e-06, "loss": 1.718, "mean_token_accuracy": 0.6274320334196091, "num_tokens": 15432510.0, "step": 19178 }, { "epoch": 5.079449152542373, "grad_norm": 2.112276792526245, "learning_rate": 7.460407838983052e-06, "loss": 1.5374, "mean_token_accuracy": 0.6673358976840973, "num_tokens": 15434130.0, "step": 19180 }, { "epoch": 5.079978813559322, "grad_norm": 2.545314311981201, "learning_rate": 7.460143008474577e-06, "loss": 1.3086, "mean_token_accuracy": 0.6855155006051064, "num_tokens": 15435685.0, "step": 19182 }, { "epoch": 5.080508474576271, "grad_norm": 2.113931179046631, "learning_rate": 7.459878177966102e-06, "loss": 1.2685, "mean_token_accuracy": 0.7047203332185745, "num_tokens": 15437446.0, "step": 19184 }, { "epoch": 5.081038135593221, "grad_norm": 1.989043951034546, "learning_rate": 7.459613347457628e-06, "loss": 1.1639, "mean_token_accuracy": 0.7387732043862343, "num_tokens": 15439231.0, "step": 19186 }, { "epoch": 5.0815677966101696, "grad_norm": 1.6048812866210938, "learning_rate": 7.459348516949153e-06, "loss": 1.1809, "mean_token_accuracy": 0.7255360782146454, "num_tokens": 15440723.0, "step": 19188 }, { "epoch": 5.0820974576271185, "grad_norm": 2.492929697036743, "learning_rate": 7.459083686440679e-06, "loss": 1.404, "mean_token_accuracy": 0.6963615268468857, "num_tokens": 15442396.0, "step": 19190 }, { "epoch": 5.0826271186440675, "grad_norm": 2.29872727394104, "learning_rate": 7.458818855932204e-06, "loss": 1.2987, "mean_token_accuracy": 0.6944401860237122, "num_tokens": 15444226.0, "step": 19192 }, { "epoch": 5.083156779661017, "grad_norm": 1.8135849237442017, "learning_rate": 7.4585540254237295e-06, "loss": 1.4509, "mean_token_accuracy": 0.6724174171686172, "num_tokens": 15446092.0, "step": 19194 }, { "epoch": 5.083686440677966, "grad_norm": 2.0102779865264893, "learning_rate": 7.4582891949152545e-06, "loss": 1.1903, "mean_token_accuracy": 0.7186012044548988, "num_tokens": 15447682.0, "step": 19196 }, { "epoch": 5.084216101694915, "grad_norm": 2.187854528427124, "learning_rate": 7.458024364406781e-06, "loss": 1.4398, "mean_token_accuracy": 0.7076523154973984, "num_tokens": 15449562.0, "step": 19198 }, { "epoch": 5.084745762711864, "grad_norm": 2.380128860473633, "learning_rate": 7.457759533898305e-06, "loss": 1.1186, "mean_token_accuracy": 0.7222721949219704, "num_tokens": 15451120.0, "step": 19200 }, { "epoch": 5.085275423728813, "grad_norm": 2.1375772953033447, "learning_rate": 7.457494703389832e-06, "loss": 1.417, "mean_token_accuracy": 0.6726131811738014, "num_tokens": 15452784.0, "step": 19202 }, { "epoch": 5.085805084745763, "grad_norm": 2.185058355331421, "learning_rate": 7.457229872881357e-06, "loss": 1.1473, "mean_token_accuracy": 0.728621356189251, "num_tokens": 15454602.0, "step": 19204 }, { "epoch": 5.086334745762712, "grad_norm": 1.7744731903076172, "learning_rate": 7.4569650423728825e-06, "loss": 1.0367, "mean_token_accuracy": 0.7480346485972404, "num_tokens": 15456458.0, "step": 19206 }, { "epoch": 5.086864406779661, "grad_norm": 2.423949956893921, "learning_rate": 7.4567002118644074e-06, "loss": 1.3147, "mean_token_accuracy": 0.70981764793396, "num_tokens": 15457942.0, "step": 19208 }, { "epoch": 5.08739406779661, "grad_norm": 2.660212278366089, "learning_rate": 7.456435381355933e-06, "loss": 1.0392, "mean_token_accuracy": 0.7500201389193535, "num_tokens": 15459392.0, "step": 19210 }, { "epoch": 5.08792372881356, "grad_norm": 2.466017246246338, "learning_rate": 7.456170550847458e-06, "loss": 1.5371, "mean_token_accuracy": 0.6789954900741577, "num_tokens": 15460791.0, "step": 19212 }, { "epoch": 5.088453389830509, "grad_norm": 2.063239812850952, "learning_rate": 7.455905720338984e-06, "loss": 1.7042, "mean_token_accuracy": 0.6229822486639023, "num_tokens": 15462488.0, "step": 19214 }, { "epoch": 5.088983050847458, "grad_norm": 2.3981857299804688, "learning_rate": 7.455640889830509e-06, "loss": 1.336, "mean_token_accuracy": 0.7013601884245872, "num_tokens": 15463916.0, "step": 19216 }, { "epoch": 5.089512711864407, "grad_norm": 2.33651065826416, "learning_rate": 7.455376059322035e-06, "loss": 1.4536, "mean_token_accuracy": 0.6740562319755554, "num_tokens": 15465572.0, "step": 19218 }, { "epoch": 5.0900423728813555, "grad_norm": 2.663358211517334, "learning_rate": 7.4551112288135595e-06, "loss": 1.4434, "mean_token_accuracy": 0.6779379099607468, "num_tokens": 15466871.0, "step": 19220 }, { "epoch": 5.090572033898305, "grad_norm": 2.47334623336792, "learning_rate": 7.454846398305085e-06, "loss": 1.3438, "mean_token_accuracy": 0.7052008211612701, "num_tokens": 15468353.0, "step": 19222 }, { "epoch": 5.091101694915254, "grad_norm": 2.2955639362335205, "learning_rate": 7.45458156779661e-06, "loss": 1.2284, "mean_token_accuracy": 0.7180659770965576, "num_tokens": 15469774.0, "step": 19224 }, { "epoch": 5.091631355932203, "grad_norm": 2.4430103302001953, "learning_rate": 7.454316737288136e-06, "loss": 1.4495, "mean_token_accuracy": 0.6955877244472504, "num_tokens": 15471332.0, "step": 19226 }, { "epoch": 5.092161016949152, "grad_norm": 1.8142422437667847, "learning_rate": 7.454051906779661e-06, "loss": 1.3523, "mean_token_accuracy": 0.6945073008537292, "num_tokens": 15472938.0, "step": 19228 }, { "epoch": 5.092690677966102, "grad_norm": 2.150761365890503, "learning_rate": 7.4537870762711876e-06, "loss": 1.0217, "mean_token_accuracy": 0.7420453503727913, "num_tokens": 15474456.0, "step": 19230 }, { "epoch": 5.093220338983051, "grad_norm": 2.7828357219696045, "learning_rate": 7.4535222457627125e-06, "loss": 1.3208, "mean_token_accuracy": 0.6756733655929565, "num_tokens": 15475711.0, "step": 19232 }, { "epoch": 5.09375, "grad_norm": 2.077496290206909, "learning_rate": 7.453257415254238e-06, "loss": 1.3094, "mean_token_accuracy": 0.6940436214208603, "num_tokens": 15477169.0, "step": 19234 }, { "epoch": 5.094279661016949, "grad_norm": 1.9526177644729614, "learning_rate": 7.452992584745763e-06, "loss": 0.9739, "mean_token_accuracy": 0.7695216685533524, "num_tokens": 15479080.0, "step": 19236 }, { "epoch": 5.094809322033898, "grad_norm": 1.9829859733581543, "learning_rate": 7.452727754237289e-06, "loss": 1.436, "mean_token_accuracy": 0.6876299008727074, "num_tokens": 15480677.0, "step": 19238 }, { "epoch": 5.095338983050848, "grad_norm": 2.3626034259796143, "learning_rate": 7.452462923728814e-06, "loss": 1.2831, "mean_token_accuracy": 0.7255401238799095, "num_tokens": 15482041.0, "step": 19240 }, { "epoch": 5.095868644067797, "grad_norm": 2.203768491744995, "learning_rate": 7.45219809322034e-06, "loss": 0.8773, "mean_token_accuracy": 0.7594438567757607, "num_tokens": 15483460.0, "step": 19242 }, { "epoch": 5.096398305084746, "grad_norm": 1.7594659328460693, "learning_rate": 7.451933262711865e-06, "loss": 0.82, "mean_token_accuracy": 0.8089640736579895, "num_tokens": 15485271.0, "step": 19244 }, { "epoch": 5.096927966101695, "grad_norm": 2.207620620727539, "learning_rate": 7.45166843220339e-06, "loss": 1.2925, "mean_token_accuracy": 0.6875336170196533, "num_tokens": 15486874.0, "step": 19246 }, { "epoch": 5.0974576271186445, "grad_norm": 1.717675805091858, "learning_rate": 7.451403601694915e-06, "loss": 1.1304, "mean_token_accuracy": 0.7289014607667923, "num_tokens": 15488744.0, "step": 19248 }, { "epoch": 5.097987288135593, "grad_norm": 1.8167860507965088, "learning_rate": 7.451138771186441e-06, "loss": 1.0182, "step": 19250 }, { "epoch": 5.097987288135593, "eval_loss": 1.3171937465667725, "eval_mean_token_accuracy": 0.7013932305884052, "eval_num_tokens": 15490371.0, "eval_runtime": 48.2687, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 19250 }, { "epoch": 5.098516949152542, "grad_norm": 2.5165374279022217, "learning_rate": 7.450873940677966e-06, "loss": 1.0185, "mean_token_accuracy": 0.7630044929683208, "num_tokens": 15491443.0, "step": 19252 }, { "epoch": 5.099046610169491, "grad_norm": 1.679577350616455, "learning_rate": 7.450609110169492e-06, "loss": 0.8818, "mean_token_accuracy": 0.7816399931907654, "num_tokens": 15492936.0, "step": 19254 }, { "epoch": 5.09957627118644, "grad_norm": 1.844535231590271, "learning_rate": 7.450344279661017e-06, "loss": 1.0377, "mean_token_accuracy": 0.7632946968078613, "num_tokens": 15494596.0, "step": 19256 }, { "epoch": 5.10010593220339, "grad_norm": 2.4237916469573975, "learning_rate": 7.450079449152543e-06, "loss": 1.4114, "mean_token_accuracy": 0.6700067520141602, "num_tokens": 15496099.0, "step": 19258 }, { "epoch": 5.100635593220339, "grad_norm": 2.3039371967315674, "learning_rate": 7.4498146186440674e-06, "loss": 1.4621, "mean_token_accuracy": 0.6542386040091515, "num_tokens": 15497956.0, "step": 19260 }, { "epoch": 5.101165254237288, "grad_norm": 2.290362596511841, "learning_rate": 7.449549788135594e-06, "loss": 1.5437, "mean_token_accuracy": 0.655879557132721, "num_tokens": 15499574.0, "step": 19262 }, { "epoch": 5.101694915254237, "grad_norm": 2.241704225540161, "learning_rate": 7.44928495762712e-06, "loss": 1.5696, "mean_token_accuracy": 0.6636482775211334, "num_tokens": 15501429.0, "step": 19264 }, { "epoch": 5.102224576271187, "grad_norm": 2.3837931156158447, "learning_rate": 7.449020127118645e-06, "loss": 1.2408, "mean_token_accuracy": 0.710249625146389, "num_tokens": 15502750.0, "step": 19266 }, { "epoch": 5.102754237288136, "grad_norm": 2.042027235031128, "learning_rate": 7.4487552966101705e-06, "loss": 1.1823, "mean_token_accuracy": 0.7055706530809402, "num_tokens": 15504313.0, "step": 19268 }, { "epoch": 5.103283898305085, "grad_norm": 1.9577651023864746, "learning_rate": 7.4484904661016955e-06, "loss": 0.9184, "mean_token_accuracy": 0.7548040598630905, "num_tokens": 15505828.0, "step": 19270 }, { "epoch": 5.103813559322034, "grad_norm": 2.048593759536743, "learning_rate": 7.448225635593221e-06, "loss": 1.1307, "mean_token_accuracy": 0.745431087911129, "num_tokens": 15507292.0, "step": 19272 }, { "epoch": 5.104343220338983, "grad_norm": 2.0668256282806396, "learning_rate": 7.447960805084746e-06, "loss": 0.8111, "mean_token_accuracy": 0.8061989694833755, "num_tokens": 15508867.0, "step": 19274 }, { "epoch": 5.1048728813559325, "grad_norm": 2.195573329925537, "learning_rate": 7.447695974576272e-06, "loss": 1.4317, "mean_token_accuracy": 0.693635419011116, "num_tokens": 15510601.0, "step": 19276 }, { "epoch": 5.1054025423728815, "grad_norm": 1.7452868223190308, "learning_rate": 7.447431144067797e-06, "loss": 1.0651, "mean_token_accuracy": 0.7370206192135811, "num_tokens": 15512515.0, "step": 19278 }, { "epoch": 5.1059322033898304, "grad_norm": 1.7777200937271118, "learning_rate": 7.447166313559323e-06, "loss": 1.0809, "mean_token_accuracy": 0.7221915423870087, "num_tokens": 15514330.0, "step": 19280 }, { "epoch": 5.106461864406779, "grad_norm": 1.8658350706100464, "learning_rate": 7.446901483050848e-06, "loss": 0.9405, "mean_token_accuracy": 0.7818639352917671, "num_tokens": 15516055.0, "step": 19282 }, { "epoch": 5.106991525423729, "grad_norm": 1.5933568477630615, "learning_rate": 7.446636652542374e-06, "loss": 0.8457, "mean_token_accuracy": 0.7918675914406776, "num_tokens": 15517591.0, "step": 19284 }, { "epoch": 5.107521186440678, "grad_norm": 2.341463088989258, "learning_rate": 7.446371822033899e-06, "loss": 1.1837, "mean_token_accuracy": 0.7195990011096001, "num_tokens": 15519125.0, "step": 19286 }, { "epoch": 5.108050847457627, "grad_norm": 1.870379090309143, "learning_rate": 7.446106991525425e-06, "loss": 1.1793, "mean_token_accuracy": 0.7447261586785316, "num_tokens": 15520659.0, "step": 19288 }, { "epoch": 5.108580508474576, "grad_norm": 1.5738123655319214, "learning_rate": 7.44584216101695e-06, "loss": 1.03, "mean_token_accuracy": 0.7451595216989517, "num_tokens": 15522343.0, "step": 19290 }, { "epoch": 5.109110169491525, "grad_norm": 1.9964441061019897, "learning_rate": 7.445577330508476e-06, "loss": 1.4543, "mean_token_accuracy": 0.6993679851293564, "num_tokens": 15523890.0, "step": 19292 }, { "epoch": 5.109639830508475, "grad_norm": 1.9462591409683228, "learning_rate": 7.4453125000000006e-06, "loss": 0.8589, "mean_token_accuracy": 0.7929378002882004, "num_tokens": 15525464.0, "step": 19294 }, { "epoch": 5.110169491525424, "grad_norm": 2.017341136932373, "learning_rate": 7.445047669491526e-06, "loss": 1.2438, "mean_token_accuracy": 0.7208888605237007, "num_tokens": 15527043.0, "step": 19296 }, { "epoch": 5.110699152542373, "grad_norm": 2.9524736404418945, "learning_rate": 7.444782838983051e-06, "loss": 1.1939, "mean_token_accuracy": 0.7308109924197197, "num_tokens": 15528200.0, "step": 19298 }, { "epoch": 5.111228813559322, "grad_norm": 1.9569672346115112, "learning_rate": 7.444518008474577e-06, "loss": 1.0592, "mean_token_accuracy": 0.7335067540407181, "num_tokens": 15529696.0, "step": 19300 }, { "epoch": 5.111758474576271, "grad_norm": 2.2118024826049805, "learning_rate": 7.444253177966102e-06, "loss": 1.4517, "mean_token_accuracy": 0.669451005756855, "num_tokens": 15531510.0, "step": 19302 }, { "epoch": 5.112288135593221, "grad_norm": 2.4109745025634766, "learning_rate": 7.443988347457628e-06, "loss": 1.4756, "mean_token_accuracy": 0.6575168967247009, "num_tokens": 15532822.0, "step": 19304 }, { "epoch": 5.1128177966101696, "grad_norm": 2.3852174282073975, "learning_rate": 7.443723516949153e-06, "loss": 1.3847, "mean_token_accuracy": 0.6999393478035927, "num_tokens": 15534453.0, "step": 19306 }, { "epoch": 5.1133474576271185, "grad_norm": 2.3309290409088135, "learning_rate": 7.4434586864406784e-06, "loss": 1.4515, "mean_token_accuracy": 0.6992138400673866, "num_tokens": 15535862.0, "step": 19308 }, { "epoch": 5.1138771186440675, "grad_norm": 1.7127677202224731, "learning_rate": 7.443193855932203e-06, "loss": 0.8744, "mean_token_accuracy": 0.7646948993206024, "num_tokens": 15537310.0, "step": 19310 }, { "epoch": 5.114406779661017, "grad_norm": 2.536914348602295, "learning_rate": 7.44292902542373e-06, "loss": 1.0446, "mean_token_accuracy": 0.7521776556968689, "num_tokens": 15539004.0, "step": 19312 }, { "epoch": 5.114936440677966, "grad_norm": 2.4342782497406006, "learning_rate": 7.442664194915254e-06, "loss": 1.1415, "mean_token_accuracy": 0.726951539516449, "num_tokens": 15540236.0, "step": 19314 }, { "epoch": 5.115466101694915, "grad_norm": 1.9088047742843628, "learning_rate": 7.442399364406781e-06, "loss": 1.3529, "mean_token_accuracy": 0.7152466587722301, "num_tokens": 15541791.0, "step": 19316 }, { "epoch": 5.115995762711864, "grad_norm": 2.3688580989837646, "learning_rate": 7.442134533898306e-06, "loss": 1.6078, "mean_token_accuracy": 0.6405391991138458, "num_tokens": 15543272.0, "step": 19318 }, { "epoch": 5.116525423728813, "grad_norm": 1.9579468965530396, "learning_rate": 7.441869703389831e-06, "loss": 0.9395, "mean_token_accuracy": 0.7751232534646988, "num_tokens": 15544843.0, "step": 19320 }, { "epoch": 5.117055084745763, "grad_norm": 2.2534186840057373, "learning_rate": 7.441604872881356e-06, "loss": 1.2918, "mean_token_accuracy": 0.708678774535656, "num_tokens": 15546271.0, "step": 19322 }, { "epoch": 5.117584745762712, "grad_norm": 1.9079105854034424, "learning_rate": 7.441340042372882e-06, "loss": 0.8035, "mean_token_accuracy": 0.7813508734107018, "num_tokens": 15547857.0, "step": 19324 }, { "epoch": 5.118114406779661, "grad_norm": 2.2517149448394775, "learning_rate": 7.441075211864407e-06, "loss": 1.2511, "mean_token_accuracy": 0.710857093334198, "num_tokens": 15549480.0, "step": 19326 }, { "epoch": 5.11864406779661, "grad_norm": 1.6790021657943726, "learning_rate": 7.440810381355933e-06, "loss": 1.016, "mean_token_accuracy": 0.7589975744485855, "num_tokens": 15551299.0, "step": 19328 }, { "epoch": 5.11917372881356, "grad_norm": 1.9236851930618286, "learning_rate": 7.440545550847458e-06, "loss": 0.7251, "mean_token_accuracy": 0.8020525053143501, "num_tokens": 15553001.0, "step": 19330 }, { "epoch": 5.119703389830509, "grad_norm": 2.490985155105591, "learning_rate": 7.4402807203389835e-06, "loss": 1.318, "mean_token_accuracy": 0.6770806536078453, "num_tokens": 15555634.0, "step": 19332 }, { "epoch": 5.120233050847458, "grad_norm": 2.1223108768463135, "learning_rate": 7.4400158898305085e-06, "loss": 1.3626, "mean_token_accuracy": 0.6810849756002426, "num_tokens": 15557286.0, "step": 19334 }, { "epoch": 5.120762711864407, "grad_norm": 2.4820287227630615, "learning_rate": 7.439751059322034e-06, "loss": 1.2083, "mean_token_accuracy": 0.7291719689965248, "num_tokens": 15558875.0, "step": 19336 }, { "epoch": 5.1212923728813555, "grad_norm": 2.0547468662261963, "learning_rate": 7.439486228813559e-06, "loss": 1.0431, "mean_token_accuracy": 0.7411919087171555, "num_tokens": 15560805.0, "step": 19338 }, { "epoch": 5.121822033898305, "grad_norm": 2.363112688064575, "learning_rate": 7.439221398305086e-06, "loss": 1.0066, "mean_token_accuracy": 0.7775467932224274, "num_tokens": 15562266.0, "step": 19340 }, { "epoch": 5.122351694915254, "grad_norm": 2.310925245285034, "learning_rate": 7.43895656779661e-06, "loss": 1.4155, "mean_token_accuracy": 0.683952584862709, "num_tokens": 15563877.0, "step": 19342 }, { "epoch": 5.122881355932203, "grad_norm": 1.9619877338409424, "learning_rate": 7.4386917372881365e-06, "loss": 1.4813, "mean_token_accuracy": 0.666075561195612, "num_tokens": 15565499.0, "step": 19344 }, { "epoch": 5.123411016949152, "grad_norm": 2.3162343502044678, "learning_rate": 7.438426906779662e-06, "loss": 1.1704, "mean_token_accuracy": 0.7341156601905823, "num_tokens": 15566913.0, "step": 19346 }, { "epoch": 5.123940677966102, "grad_norm": 1.7622112035751343, "learning_rate": 7.438162076271187e-06, "loss": 1.5303, "mean_token_accuracy": 0.6333367228507996, "num_tokens": 15568870.0, "step": 19348 }, { "epoch": 5.124470338983051, "grad_norm": 1.5984468460083008, "learning_rate": 7.437897245762713e-06, "loss": 0.839, "mean_token_accuracy": 0.7897045165300369, "num_tokens": 15570453.0, "step": 19350 }, { "epoch": 5.125, "grad_norm": 2.5741543769836426, "learning_rate": 7.437632415254238e-06, "loss": 1.4354, "mean_token_accuracy": 0.6811530143022537, "num_tokens": 15572147.0, "step": 19352 }, { "epoch": 5.125529661016949, "grad_norm": 2.0778980255126953, "learning_rate": 7.437367584745764e-06, "loss": 1.4229, "mean_token_accuracy": 0.6657641604542732, "num_tokens": 15573977.0, "step": 19354 }, { "epoch": 5.126059322033898, "grad_norm": 2.373755693435669, "learning_rate": 7.437102754237289e-06, "loss": 1.0079, "mean_token_accuracy": 0.7661375850439072, "num_tokens": 15575383.0, "step": 19356 }, { "epoch": 5.126588983050848, "grad_norm": 2.05071759223938, "learning_rate": 7.436837923728814e-06, "loss": 1.0398, "mean_token_accuracy": 0.741429977118969, "num_tokens": 15576873.0, "step": 19358 }, { "epoch": 5.127118644067797, "grad_norm": 2.002434253692627, "learning_rate": 7.436573093220339e-06, "loss": 0.9663, "mean_token_accuracy": 0.7714908719062805, "num_tokens": 15578409.0, "step": 19360 }, { "epoch": 5.127648305084746, "grad_norm": 2.1881561279296875, "learning_rate": 7.436308262711865e-06, "loss": 1.4485, "mean_token_accuracy": 0.6718283668160439, "num_tokens": 15580032.0, "step": 19362 }, { "epoch": 5.128177966101695, "grad_norm": 1.9112794399261475, "learning_rate": 7.43604343220339e-06, "loss": 1.2272, "mean_token_accuracy": 0.7280398309230804, "num_tokens": 15581720.0, "step": 19364 }, { "epoch": 5.1287076271186445, "grad_norm": 2.3182499408721924, "learning_rate": 7.435778601694917e-06, "loss": 1.6858, "mean_token_accuracy": 0.658955916762352, "num_tokens": 15583246.0, "step": 19366 }, { "epoch": 5.129237288135593, "grad_norm": 2.8827128410339355, "learning_rate": 7.435513771186441e-06, "loss": 1.4838, "mean_token_accuracy": 0.6699602156877518, "num_tokens": 15584805.0, "step": 19368 }, { "epoch": 5.129766949152542, "grad_norm": 2.4927821159362793, "learning_rate": 7.435248940677967e-06, "loss": 1.3859, "mean_token_accuracy": 0.6760029196739197, "num_tokens": 15586435.0, "step": 19370 }, { "epoch": 5.130296610169491, "grad_norm": 2.639650344848633, "learning_rate": 7.434984110169492e-06, "loss": 1.0535, "mean_token_accuracy": 0.7356977537274361, "num_tokens": 15587915.0, "step": 19372 }, { "epoch": 5.13082627118644, "grad_norm": 1.8182824850082397, "learning_rate": 7.434719279661018e-06, "loss": 1.0287, "mean_token_accuracy": 0.7462248876690865, "num_tokens": 15589495.0, "step": 19374 }, { "epoch": 5.13135593220339, "grad_norm": 1.8787152767181396, "learning_rate": 7.434454449152543e-06, "loss": 0.992, "mean_token_accuracy": 0.762736476957798, "num_tokens": 15591845.0, "step": 19376 }, { "epoch": 5.131885593220339, "grad_norm": 2.3527934551239014, "learning_rate": 7.434189618644069e-06, "loss": 1.4214, "mean_token_accuracy": 0.6877157092094421, "num_tokens": 15593187.0, "step": 19378 }, { "epoch": 5.132415254237288, "grad_norm": 2.3085403442382812, "learning_rate": 7.433924788135594e-06, "loss": 1.4819, "mean_token_accuracy": 0.6787672787904739, "num_tokens": 15594978.0, "step": 19380 }, { "epoch": 5.132944915254237, "grad_norm": 2.3390374183654785, "learning_rate": 7.4336599576271195e-06, "loss": 1.2568, "mean_token_accuracy": 0.7108694538474083, "num_tokens": 15596438.0, "step": 19382 }, { "epoch": 5.133474576271187, "grad_norm": 1.8084328174591064, "learning_rate": 7.433395127118644e-06, "loss": 1.1907, "mean_token_accuracy": 0.7345837280154228, "num_tokens": 15598318.0, "step": 19384 }, { "epoch": 5.134004237288136, "grad_norm": 1.6731806993484497, "learning_rate": 7.43313029661017e-06, "loss": 0.9822, "mean_token_accuracy": 0.7555506154894829, "num_tokens": 15599948.0, "step": 19386 }, { "epoch": 5.134533898305085, "grad_norm": 2.2966225147247314, "learning_rate": 7.432865466101695e-06, "loss": 1.2398, "mean_token_accuracy": 0.69822908192873, "num_tokens": 15601464.0, "step": 19388 }, { "epoch": 5.135063559322034, "grad_norm": 2.336657762527466, "learning_rate": 7.432600635593221e-06, "loss": 1.5431, "mean_token_accuracy": 0.6497717574238777, "num_tokens": 15602896.0, "step": 19390 }, { "epoch": 5.135593220338983, "grad_norm": 2.2474498748779297, "learning_rate": 7.432335805084746e-06, "loss": 1.384, "mean_token_accuracy": 0.6718246564269066, "num_tokens": 15604442.0, "step": 19392 }, { "epoch": 5.1361228813559325, "grad_norm": 2.125396966934204, "learning_rate": 7.4320709745762724e-06, "loss": 1.1186, "mean_token_accuracy": 0.73487189412117, "num_tokens": 15606070.0, "step": 19394 }, { "epoch": 5.1366525423728815, "grad_norm": 1.7160029411315918, "learning_rate": 7.4318061440677965e-06, "loss": 1.2258, "mean_token_accuracy": 0.7093859687447548, "num_tokens": 15607678.0, "step": 19396 }, { "epoch": 5.1371822033898304, "grad_norm": 2.454622507095337, "learning_rate": 7.431541313559323e-06, "loss": 1.2013, "mean_token_accuracy": 0.7191502302885056, "num_tokens": 15609149.0, "step": 19398 }, { "epoch": 5.137711864406779, "grad_norm": 2.1227471828460693, "learning_rate": 7.431276483050848e-06, "loss": 1.3948, "mean_token_accuracy": 0.6728325039148331, "num_tokens": 15610754.0, "step": 19400 }, { "epoch": 5.138241525423728, "grad_norm": 2.1691598892211914, "learning_rate": 7.431011652542374e-06, "loss": 1.1228, "mean_token_accuracy": 0.7466575279831886, "num_tokens": 15612081.0, "step": 19402 }, { "epoch": 5.138771186440678, "grad_norm": 2.1656274795532227, "learning_rate": 7.430746822033899e-06, "loss": 1.2321, "mean_token_accuracy": 0.7220124676823616, "num_tokens": 15613577.0, "step": 19404 }, { "epoch": 5.139300847457627, "grad_norm": 1.5503143072128296, "learning_rate": 7.4304819915254245e-06, "loss": 0.9188, "mean_token_accuracy": 0.7577926814556122, "num_tokens": 15615224.0, "step": 19406 }, { "epoch": 5.139830508474576, "grad_norm": 2.4429805278778076, "learning_rate": 7.4302171610169495e-06, "loss": 1.4733, "mean_token_accuracy": 0.6602184772491455, "num_tokens": 15616854.0, "step": 19408 }, { "epoch": 5.140360169491525, "grad_norm": 1.873661994934082, "learning_rate": 7.429952330508475e-06, "loss": 1.1244, "mean_token_accuracy": 0.7415828704833984, "num_tokens": 15618361.0, "step": 19410 }, { "epoch": 5.140889830508475, "grad_norm": 2.419179677963257, "learning_rate": 7.4296875e-06, "loss": 1.8042, "mean_token_accuracy": 0.6108113899827003, "num_tokens": 15620017.0, "step": 19412 }, { "epoch": 5.141419491525424, "grad_norm": 1.9275108575820923, "learning_rate": 7.429422669491526e-06, "loss": 1.1725, "mean_token_accuracy": 0.7261403053998947, "num_tokens": 15621746.0, "step": 19414 }, { "epoch": 5.141949152542373, "grad_norm": 2.223595380783081, "learning_rate": 7.429157838983051e-06, "loss": 1.5492, "mean_token_accuracy": 0.6517355293035507, "num_tokens": 15623321.0, "step": 19416 }, { "epoch": 5.142478813559322, "grad_norm": 1.8821288347244263, "learning_rate": 7.428893008474577e-06, "loss": 1.3365, "mean_token_accuracy": 0.6863673403859138, "num_tokens": 15624789.0, "step": 19418 }, { "epoch": 5.143008474576272, "grad_norm": 2.1836283206939697, "learning_rate": 7.428628177966102e-06, "loss": 1.1746, "mean_token_accuracy": 0.7295651510357857, "num_tokens": 15626196.0, "step": 19420 }, { "epoch": 5.143538135593221, "grad_norm": 2.108398199081421, "learning_rate": 7.428363347457627e-06, "loss": 1.1134, "mean_token_accuracy": 0.7299793437123299, "num_tokens": 15627677.0, "step": 19422 }, { "epoch": 5.1440677966101696, "grad_norm": 2.007380723953247, "learning_rate": 7.428098516949152e-06, "loss": 0.9322, "mean_token_accuracy": 0.7730025053024292, "num_tokens": 15629398.0, "step": 19424 }, { "epoch": 5.1445974576271185, "grad_norm": 2.055588960647583, "learning_rate": 7.427833686440679e-06, "loss": 0.9805, "mean_token_accuracy": 0.7774174213409424, "num_tokens": 15630632.0, "step": 19426 }, { "epoch": 5.1451271186440675, "grad_norm": 2.429935932159424, "learning_rate": 7.427568855932204e-06, "loss": 1.4672, "mean_token_accuracy": 0.6743785440921783, "num_tokens": 15632483.0, "step": 19428 }, { "epoch": 5.145656779661017, "grad_norm": 1.9580436944961548, "learning_rate": 7.42730402542373e-06, "loss": 0.6631, "mean_token_accuracy": 0.8237038180232048, "num_tokens": 15633870.0, "step": 19430 }, { "epoch": 5.146186440677966, "grad_norm": 2.004157781600952, "learning_rate": 7.427039194915255e-06, "loss": 1.2901, "mean_token_accuracy": 0.7012304961681366, "num_tokens": 15635401.0, "step": 19432 }, { "epoch": 5.146716101694915, "grad_norm": 2.0956897735595703, "learning_rate": 7.42677436440678e-06, "loss": 1.288, "mean_token_accuracy": 0.7055264636874199, "num_tokens": 15636906.0, "step": 19434 }, { "epoch": 5.147245762711864, "grad_norm": 1.8948771953582764, "learning_rate": 7.426509533898306e-06, "loss": 1.3645, "mean_token_accuracy": 0.6907661184668541, "num_tokens": 15638813.0, "step": 19436 }, { "epoch": 5.147775423728813, "grad_norm": 2.3595659732818604, "learning_rate": 7.426244703389831e-06, "loss": 1.0448, "mean_token_accuracy": 0.7699805274605751, "num_tokens": 15640298.0, "step": 19438 }, { "epoch": 5.148305084745763, "grad_norm": 2.3915672302246094, "learning_rate": 7.425979872881357e-06, "loss": 1.3723, "mean_token_accuracy": 0.6835152357816696, "num_tokens": 15641746.0, "step": 19440 }, { "epoch": 5.148834745762712, "grad_norm": 2.1541881561279297, "learning_rate": 7.425715042372882e-06, "loss": 1.2123, "mean_token_accuracy": 0.7180824279785156, "num_tokens": 15643300.0, "step": 19442 }, { "epoch": 5.149364406779661, "grad_norm": 1.8809947967529297, "learning_rate": 7.4254502118644075e-06, "loss": 1.0454, "mean_token_accuracy": 0.756393700838089, "num_tokens": 15644808.0, "step": 19444 }, { "epoch": 5.14989406779661, "grad_norm": 2.6371092796325684, "learning_rate": 7.4251853813559324e-06, "loss": 1.1013, "mean_token_accuracy": 0.7487109154462814, "num_tokens": 15646320.0, "step": 19446 }, { "epoch": 5.15042372881356, "grad_norm": 2.3677804470062256, "learning_rate": 7.424920550847459e-06, "loss": 1.1932, "mean_token_accuracy": 0.7400830313563347, "num_tokens": 15647875.0, "step": 19448 }, { "epoch": 5.150953389830509, "grad_norm": 1.8880635499954224, "learning_rate": 7.424655720338983e-06, "loss": 1.3783, "mean_token_accuracy": 0.6908783167600632, "num_tokens": 15649667.0, "step": 19450 }, { "epoch": 5.151483050847458, "grad_norm": 2.1634790897369385, "learning_rate": 7.42439088983051e-06, "loss": 1.3252, "mean_token_accuracy": 0.7093203477561474, "num_tokens": 15651768.0, "step": 19452 }, { "epoch": 5.152012711864407, "grad_norm": 2.533461570739746, "learning_rate": 7.424126059322035e-06, "loss": 1.0305, "mean_token_accuracy": 0.7425490543246269, "num_tokens": 15653513.0, "step": 19454 }, { "epoch": 5.1525423728813555, "grad_norm": 2.0446929931640625, "learning_rate": 7.4238612288135605e-06, "loss": 1.2361, "mean_token_accuracy": 0.7167302370071411, "num_tokens": 15655181.0, "step": 19456 }, { "epoch": 5.153072033898305, "grad_norm": 2.192490816116333, "learning_rate": 7.423596398305085e-06, "loss": 1.0971, "mean_token_accuracy": 0.754953533411026, "num_tokens": 15656743.0, "step": 19458 }, { "epoch": 5.153601694915254, "grad_norm": 2.606858730316162, "learning_rate": 7.423331567796611e-06, "loss": 1.2949, "mean_token_accuracy": 0.6950728595256805, "num_tokens": 15658228.0, "step": 19460 }, { "epoch": 5.154131355932203, "grad_norm": 3.022442579269409, "learning_rate": 7.423066737288136e-06, "loss": 1.0933, "mean_token_accuracy": 0.7325502783060074, "num_tokens": 15659566.0, "step": 19462 }, { "epoch": 5.154661016949152, "grad_norm": 2.6181390285491943, "learning_rate": 7.422801906779662e-06, "loss": 1.1845, "mean_token_accuracy": 0.7392118126153946, "num_tokens": 15660878.0, "step": 19464 }, { "epoch": 5.155190677966102, "grad_norm": 2.0934722423553467, "learning_rate": 7.422537076271187e-06, "loss": 1.1441, "mean_token_accuracy": 0.7245953977108002, "num_tokens": 15662571.0, "step": 19466 }, { "epoch": 5.155720338983051, "grad_norm": 2.350130796432495, "learning_rate": 7.422272245762713e-06, "loss": 1.1459, "mean_token_accuracy": 0.7437007576227188, "num_tokens": 15663863.0, "step": 19468 }, { "epoch": 5.15625, "grad_norm": 2.4563546180725098, "learning_rate": 7.4220074152542375e-06, "loss": 1.1724, "mean_token_accuracy": 0.7298645675182343, "num_tokens": 15665370.0, "step": 19470 }, { "epoch": 5.156779661016949, "grad_norm": 2.1656086444854736, "learning_rate": 7.421742584745763e-06, "loss": 1.21, "mean_token_accuracy": 0.727256566286087, "num_tokens": 15667014.0, "step": 19472 }, { "epoch": 5.157309322033898, "grad_norm": 2.1310431957244873, "learning_rate": 7.421477754237288e-06, "loss": 1.3415, "mean_token_accuracy": 0.7197132706642151, "num_tokens": 15668947.0, "step": 19474 }, { "epoch": 5.157838983050848, "grad_norm": 2.22377610206604, "learning_rate": 7.421212923728814e-06, "loss": 0.9499, "mean_token_accuracy": 0.7655048668384552, "num_tokens": 15670239.0, "step": 19476 }, { "epoch": 5.158368644067797, "grad_norm": 1.9668409824371338, "learning_rate": 7.420948093220339e-06, "loss": 0.7523, "mean_token_accuracy": 0.7887989729642868, "num_tokens": 15671917.0, "step": 19478 }, { "epoch": 5.158898305084746, "grad_norm": 2.38627028465271, "learning_rate": 7.4206832627118656e-06, "loss": 1.1689, "mean_token_accuracy": 0.747902899980545, "num_tokens": 15673361.0, "step": 19480 }, { "epoch": 5.159427966101695, "grad_norm": 1.9220750331878662, "learning_rate": 7.4204184322033905e-06, "loss": 1.1131, "mean_token_accuracy": 0.7293530032038689, "num_tokens": 15674926.0, "step": 19482 }, { "epoch": 5.1599576271186445, "grad_norm": 2.041165828704834, "learning_rate": 7.420153601694916e-06, "loss": 1.0627, "mean_token_accuracy": 0.7382261231541634, "num_tokens": 15676660.0, "step": 19484 }, { "epoch": 5.160487288135593, "grad_norm": 1.7679612636566162, "learning_rate": 7.419888771186441e-06, "loss": 1.2027, "mean_token_accuracy": 0.7360201999545097, "num_tokens": 15679266.0, "step": 19486 }, { "epoch": 5.161016949152542, "grad_norm": 2.5366647243499756, "learning_rate": 7.419623940677967e-06, "loss": 1.0678, "mean_token_accuracy": 0.7429307326674461, "num_tokens": 15680691.0, "step": 19488 }, { "epoch": 5.161546610169491, "grad_norm": 1.7522008419036865, "learning_rate": 7.419359110169492e-06, "loss": 1.0879, "mean_token_accuracy": 0.7517253831028938, "num_tokens": 15682262.0, "step": 19490 }, { "epoch": 5.16207627118644, "grad_norm": 2.2860207557678223, "learning_rate": 7.419094279661018e-06, "loss": 0.93, "mean_token_accuracy": 0.7667215168476105, "num_tokens": 15683800.0, "step": 19492 }, { "epoch": 5.16260593220339, "grad_norm": 2.360515832901001, "learning_rate": 7.418829449152543e-06, "loss": 1.4074, "mean_token_accuracy": 0.6794226802885532, "num_tokens": 15685492.0, "step": 19494 }, { "epoch": 5.163135593220339, "grad_norm": 1.8315705060958862, "learning_rate": 7.418564618644068e-06, "loss": 1.1031, "mean_token_accuracy": 0.7293485924601555, "num_tokens": 15686961.0, "step": 19496 }, { "epoch": 5.163665254237288, "grad_norm": 2.160360813140869, "learning_rate": 7.418299788135593e-06, "loss": 1.2703, "mean_token_accuracy": 0.7283950299024582, "num_tokens": 15688607.0, "step": 19498 }, { "epoch": 5.164194915254237, "grad_norm": 2.002285957336426, "learning_rate": 7.418034957627119e-06, "loss": 0.8931, "step": 19500 }, { "epoch": 5.164194915254237, "eval_loss": 1.3170043230056763, "eval_mean_token_accuracy": 0.7012610751893613, "eval_num_tokens": 15690073.0, "eval_runtime": 48.2652, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 19500 }, { "epoch": 5.164724576271187, "grad_norm": 2.2034497261047363, "learning_rate": 7.417770127118644e-06, "loss": 1.0692, "mean_token_accuracy": 0.754346527159214, "num_tokens": 15691589.0, "step": 19502 }, { "epoch": 5.165254237288136, "grad_norm": 2.1536617279052734, "learning_rate": 7.41750529661017e-06, "loss": 1.1664, "mean_token_accuracy": 0.710959866642952, "num_tokens": 15693272.0, "step": 19504 }, { "epoch": 5.165783898305085, "grad_norm": 2.107719659805298, "learning_rate": 7.417240466101695e-06, "loss": 1.4828, "mean_token_accuracy": 0.6731916517019272, "num_tokens": 15695131.0, "step": 19506 }, { "epoch": 5.166313559322034, "grad_norm": 2.1333439350128174, "learning_rate": 7.416975635593221e-06, "loss": 1.1068, "mean_token_accuracy": 0.7132673040032387, "num_tokens": 15696870.0, "step": 19508 }, { "epoch": 5.166843220338983, "grad_norm": 2.349168539047241, "learning_rate": 7.416710805084745e-06, "loss": 1.2916, "mean_token_accuracy": 0.7090569883584976, "num_tokens": 15698259.0, "step": 19510 }, { "epoch": 5.1673728813559325, "grad_norm": 2.1136600971221924, "learning_rate": 7.416445974576272e-06, "loss": 1.2535, "mean_token_accuracy": 0.6847910732030869, "num_tokens": 15700006.0, "step": 19512 }, { "epoch": 5.1679025423728815, "grad_norm": 2.0943570137023926, "learning_rate": 7.416181144067798e-06, "loss": 1.149, "mean_token_accuracy": 0.7344260886311531, "num_tokens": 15701493.0, "step": 19514 }, { "epoch": 5.1684322033898304, "grad_norm": 2.290692090988159, "learning_rate": 7.415916313559323e-06, "loss": 1.4898, "mean_token_accuracy": 0.6373923942446709, "num_tokens": 15703080.0, "step": 19516 }, { "epoch": 5.168961864406779, "grad_norm": 2.1926329135894775, "learning_rate": 7.4156514830508485e-06, "loss": 1.1342, "mean_token_accuracy": 0.717809334397316, "num_tokens": 15704489.0, "step": 19518 }, { "epoch": 5.169491525423728, "grad_norm": 2.4619569778442383, "learning_rate": 7.4153866525423735e-06, "loss": 1.7867, "mean_token_accuracy": 0.6259969547390938, "num_tokens": 15706196.0, "step": 19520 }, { "epoch": 5.170021186440678, "grad_norm": 1.8950412273406982, "learning_rate": 7.415121822033899e-06, "loss": 1.0744, "mean_token_accuracy": 0.7496439069509506, "num_tokens": 15707818.0, "step": 19522 }, { "epoch": 5.170550847457627, "grad_norm": 1.974188208580017, "learning_rate": 7.414856991525424e-06, "loss": 1.051, "mean_token_accuracy": 0.7420445755124092, "num_tokens": 15709371.0, "step": 19524 }, { "epoch": 5.171080508474576, "grad_norm": 2.092637777328491, "learning_rate": 7.41459216101695e-06, "loss": 0.8361, "mean_token_accuracy": 0.8059931471943855, "num_tokens": 15710510.0, "step": 19526 }, { "epoch": 5.171610169491525, "grad_norm": 2.4145143032073975, "learning_rate": 7.414327330508475e-06, "loss": 1.7132, "mean_token_accuracy": 0.6216200068593025, "num_tokens": 15712328.0, "step": 19528 }, { "epoch": 5.172139830508475, "grad_norm": 2.3148961067199707, "learning_rate": 7.414062500000001e-06, "loss": 1.3161, "mean_token_accuracy": 0.7063480690121651, "num_tokens": 15713854.0, "step": 19530 }, { "epoch": 5.172669491525424, "grad_norm": 2.0141613483428955, "learning_rate": 7.4137976694915256e-06, "loss": 1.4257, "mean_token_accuracy": 0.6687450855970383, "num_tokens": 15715569.0, "step": 19532 }, { "epoch": 5.173199152542373, "grad_norm": 2.201324224472046, "learning_rate": 7.413532838983052e-06, "loss": 1.4436, "mean_token_accuracy": 0.697288740426302, "num_tokens": 15717407.0, "step": 19534 }, { "epoch": 5.173728813559322, "grad_norm": 2.3295655250549316, "learning_rate": 7.413268008474577e-06, "loss": 0.8466, "mean_token_accuracy": 0.7959883809089661, "num_tokens": 15718954.0, "step": 19536 }, { "epoch": 5.174258474576272, "grad_norm": 2.2386066913604736, "learning_rate": 7.413003177966103e-06, "loss": 1.2939, "mean_token_accuracy": 0.6960821971297264, "num_tokens": 15720412.0, "step": 19538 }, { "epoch": 5.174788135593221, "grad_norm": 2.2891249656677246, "learning_rate": 7.412738347457628e-06, "loss": 1.3569, "mean_token_accuracy": 0.6853652372956276, "num_tokens": 15721966.0, "step": 19540 }, { "epoch": 5.1753177966101696, "grad_norm": 2.337083339691162, "learning_rate": 7.412473516949154e-06, "loss": 1.4661, "mean_token_accuracy": 0.6436187326908112, "num_tokens": 15723714.0, "step": 19542 }, { "epoch": 5.1758474576271185, "grad_norm": 2.622302532196045, "learning_rate": 7.4122086864406785e-06, "loss": 1.2278, "mean_token_accuracy": 0.7194901183247566, "num_tokens": 15725265.0, "step": 19544 }, { "epoch": 5.1763771186440675, "grad_norm": 2.3557536602020264, "learning_rate": 7.411943855932204e-06, "loss": 1.4331, "mean_token_accuracy": 0.6684470176696777, "num_tokens": 15727036.0, "step": 19546 }, { "epoch": 5.176906779661017, "grad_norm": 2.1395981311798096, "learning_rate": 7.411679025423729e-06, "loss": 1.4934, "mean_token_accuracy": 0.6757333651185036, "num_tokens": 15728619.0, "step": 19548 }, { "epoch": 5.177436440677966, "grad_norm": 1.8204264640808105, "learning_rate": 7.411414194915255e-06, "loss": 0.9077, "mean_token_accuracy": 0.7685102373361588, "num_tokens": 15730171.0, "step": 19550 }, { "epoch": 5.177966101694915, "grad_norm": 2.5389606952667236, "learning_rate": 7.41114936440678e-06, "loss": 0.77, "mean_token_accuracy": 0.8142807707190514, "num_tokens": 15731610.0, "step": 19552 }, { "epoch": 5.178495762711864, "grad_norm": 1.8645583391189575, "learning_rate": 7.410884533898306e-06, "loss": 1.0484, "mean_token_accuracy": 0.7457317113876343, "num_tokens": 15733235.0, "step": 19554 }, { "epoch": 5.179025423728813, "grad_norm": 2.151381254196167, "learning_rate": 7.410619703389831e-06, "loss": 1.0151, "mean_token_accuracy": 0.734274223446846, "num_tokens": 15734860.0, "step": 19556 }, { "epoch": 5.179555084745763, "grad_norm": 1.766550064086914, "learning_rate": 7.4103548728813564e-06, "loss": 1.1363, "mean_token_accuracy": 0.7365705221891403, "num_tokens": 15736402.0, "step": 19558 }, { "epoch": 5.180084745762712, "grad_norm": 2.5487303733825684, "learning_rate": 7.410090042372881e-06, "loss": 1.1498, "mean_token_accuracy": 0.7481616511940956, "num_tokens": 15737972.0, "step": 19560 }, { "epoch": 5.180614406779661, "grad_norm": 2.187269687652588, "learning_rate": 7.409825211864408e-06, "loss": 1.045, "mean_token_accuracy": 0.763300359249115, "num_tokens": 15739466.0, "step": 19562 }, { "epoch": 5.18114406779661, "grad_norm": 2.2229743003845215, "learning_rate": 7.409560381355932e-06, "loss": 1.6701, "mean_token_accuracy": 0.6475709900259972, "num_tokens": 15741285.0, "step": 19564 }, { "epoch": 5.18167372881356, "grad_norm": 2.513085126876831, "learning_rate": 7.409295550847459e-06, "loss": 1.2916, "mean_token_accuracy": 0.6977984085679054, "num_tokens": 15742715.0, "step": 19566 }, { "epoch": 5.182203389830509, "grad_norm": 2.0394999980926514, "learning_rate": 7.409030720338984e-06, "loss": 1.1017, "mean_token_accuracy": 0.7436160147190094, "num_tokens": 15744300.0, "step": 19568 }, { "epoch": 5.182733050847458, "grad_norm": 2.411561965942383, "learning_rate": 7.408765889830509e-06, "loss": 1.4602, "mean_token_accuracy": 0.7031724452972412, "num_tokens": 15745616.0, "step": 19570 }, { "epoch": 5.183262711864407, "grad_norm": 1.9201327562332153, "learning_rate": 7.408501059322034e-06, "loss": 1.4857, "mean_token_accuracy": 0.687132328748703, "num_tokens": 15747233.0, "step": 19572 }, { "epoch": 5.1837923728813555, "grad_norm": 2.1960036754608154, "learning_rate": 7.40823622881356e-06, "loss": 1.2953, "mean_token_accuracy": 0.7032303288578987, "num_tokens": 15748823.0, "step": 19574 }, { "epoch": 5.184322033898305, "grad_norm": 1.9551410675048828, "learning_rate": 7.407971398305085e-06, "loss": 1.0838, "mean_token_accuracy": 0.7200295031070709, "num_tokens": 15750454.0, "step": 19576 }, { "epoch": 5.184851694915254, "grad_norm": 2.559112071990967, "learning_rate": 7.407706567796611e-06, "loss": 1.6495, "mean_token_accuracy": 0.6763118654489517, "num_tokens": 15751943.0, "step": 19578 }, { "epoch": 5.185381355932203, "grad_norm": 2.1754136085510254, "learning_rate": 7.407441737288136e-06, "loss": 1.086, "mean_token_accuracy": 0.7314184457063675, "num_tokens": 15753600.0, "step": 19580 }, { "epoch": 5.185911016949152, "grad_norm": 2.4531266689300537, "learning_rate": 7.4071769067796615e-06, "loss": 1.1241, "mean_token_accuracy": 0.7271706536412239, "num_tokens": 15755327.0, "step": 19582 }, { "epoch": 5.186440677966102, "grad_norm": 2.338980197906494, "learning_rate": 7.4069120762711864e-06, "loss": 1.1569, "mean_token_accuracy": 0.7226964086294174, "num_tokens": 15757291.0, "step": 19584 }, { "epoch": 5.186970338983051, "grad_norm": 2.6426467895507812, "learning_rate": 7.406647245762712e-06, "loss": 1.3916, "mean_token_accuracy": 0.6995359361171722, "num_tokens": 15758602.0, "step": 19586 }, { "epoch": 5.1875, "grad_norm": 2.3249897956848145, "learning_rate": 7.406382415254237e-06, "loss": 1.4285, "mean_token_accuracy": 0.6808751448988914, "num_tokens": 15760220.0, "step": 19588 }, { "epoch": 5.188029661016949, "grad_norm": 2.089188814163208, "learning_rate": 7.406117584745764e-06, "loss": 1.0726, "mean_token_accuracy": 0.7327579036355019, "num_tokens": 15761954.0, "step": 19590 }, { "epoch": 5.188559322033898, "grad_norm": 1.80752694606781, "learning_rate": 7.405852754237288e-06, "loss": 1.141, "mean_token_accuracy": 0.748028963804245, "num_tokens": 15763743.0, "step": 19592 }, { "epoch": 5.189088983050848, "grad_norm": 2.1988484859466553, "learning_rate": 7.4055879237288145e-06, "loss": 0.9316, "mean_token_accuracy": 0.7476894780993462, "num_tokens": 15765072.0, "step": 19594 }, { "epoch": 5.189618644067797, "grad_norm": 2.4659717082977295, "learning_rate": 7.405323093220339e-06, "loss": 1.5065, "mean_token_accuracy": 0.675336018204689, "num_tokens": 15766463.0, "step": 19596 }, { "epoch": 5.190148305084746, "grad_norm": 1.949960708618164, "learning_rate": 7.405058262711865e-06, "loss": 0.9406, "mean_token_accuracy": 0.7794084027409554, "num_tokens": 15768112.0, "step": 19598 }, { "epoch": 5.190677966101695, "grad_norm": 2.370417833328247, "learning_rate": 7.404793432203391e-06, "loss": 1.3957, "mean_token_accuracy": 0.6668592803180218, "num_tokens": 15769681.0, "step": 19600 }, { "epoch": 5.1912076271186445, "grad_norm": 2.177288770675659, "learning_rate": 7.404528601694916e-06, "loss": 1.0381, "mean_token_accuracy": 0.7547059506177902, "num_tokens": 15771462.0, "step": 19602 }, { "epoch": 5.191737288135593, "grad_norm": 2.446991205215454, "learning_rate": 7.404263771186442e-06, "loss": 1.5453, "mean_token_accuracy": 0.6593869253993034, "num_tokens": 15773076.0, "step": 19604 }, { "epoch": 5.192266949152542, "grad_norm": 1.8709688186645508, "learning_rate": 7.403998940677967e-06, "loss": 1.2368, "mean_token_accuracy": 0.721218504011631, "num_tokens": 15774693.0, "step": 19606 }, { "epoch": 5.192796610169491, "grad_norm": 2.240335702896118, "learning_rate": 7.403734110169492e-06, "loss": 1.2345, "mean_token_accuracy": 0.7158390432596207, "num_tokens": 15776382.0, "step": 19608 }, { "epoch": 5.19332627118644, "grad_norm": 2.1370465755462646, "learning_rate": 7.403469279661017e-06, "loss": 1.5216, "mean_token_accuracy": 0.6452389284968376, "num_tokens": 15778226.0, "step": 19610 }, { "epoch": 5.19385593220339, "grad_norm": 2.8219921588897705, "learning_rate": 7.403204449152543e-06, "loss": 1.4418, "mean_token_accuracy": 0.6581469401717186, "num_tokens": 15779689.0, "step": 19612 }, { "epoch": 5.194385593220339, "grad_norm": 2.303551435470581, "learning_rate": 7.402939618644068e-06, "loss": 1.3876, "mean_token_accuracy": 0.7054912149906158, "num_tokens": 15781169.0, "step": 19614 }, { "epoch": 5.194915254237288, "grad_norm": 1.615953803062439, "learning_rate": 7.402674788135595e-06, "loss": 0.8078, "mean_token_accuracy": 0.7945099100470543, "num_tokens": 15782807.0, "step": 19616 }, { "epoch": 5.195444915254237, "grad_norm": 1.9489115476608276, "learning_rate": 7.402409957627119e-06, "loss": 1.2644, "mean_token_accuracy": 0.6916230022907257, "num_tokens": 15784618.0, "step": 19618 }, { "epoch": 5.195974576271187, "grad_norm": 2.30936336517334, "learning_rate": 7.402145127118645e-06, "loss": 1.4376, "mean_token_accuracy": 0.6997737064957619, "num_tokens": 15786237.0, "step": 19620 }, { "epoch": 5.196504237288136, "grad_norm": 2.387360095977783, "learning_rate": 7.40188029661017e-06, "loss": 1.1581, "mean_token_accuracy": 0.7301943749189377, "num_tokens": 15787567.0, "step": 19622 }, { "epoch": 5.197033898305085, "grad_norm": 2.1931490898132324, "learning_rate": 7.401615466101696e-06, "loss": 1.2934, "mean_token_accuracy": 0.7222149893641472, "num_tokens": 15789116.0, "step": 19624 }, { "epoch": 5.197563559322034, "grad_norm": 2.46639347076416, "learning_rate": 7.401350635593221e-06, "loss": 1.4315, "mean_token_accuracy": 0.6871562525629997, "num_tokens": 15790721.0, "step": 19626 }, { "epoch": 5.198093220338983, "grad_norm": 2.250974178314209, "learning_rate": 7.401085805084747e-06, "loss": 0.9767, "mean_token_accuracy": 0.7852949276566505, "num_tokens": 15792759.0, "step": 19628 }, { "epoch": 5.1986228813559325, "grad_norm": 2.1317334175109863, "learning_rate": 7.400820974576272e-06, "loss": 1.5204, "mean_token_accuracy": 0.6494451612234116, "num_tokens": 15794472.0, "step": 19630 }, { "epoch": 5.1991525423728815, "grad_norm": 2.5042190551757812, "learning_rate": 7.4005561440677974e-06, "loss": 1.2368, "mean_token_accuracy": 0.7135120704770088, "num_tokens": 15795945.0, "step": 19632 }, { "epoch": 5.1996822033898304, "grad_norm": 2.191934823989868, "learning_rate": 7.400291313559322e-06, "loss": 1.6424, "mean_token_accuracy": 0.6514272838830948, "num_tokens": 15797757.0, "step": 19634 }, { "epoch": 5.200211864406779, "grad_norm": 2.7093114852905273, "learning_rate": 7.400026483050848e-06, "loss": 1.1759, "mean_token_accuracy": 0.7166353464126587, "num_tokens": 15798978.0, "step": 19636 }, { "epoch": 5.200741525423728, "grad_norm": 2.049614667892456, "learning_rate": 7.399761652542373e-06, "loss": 1.1861, "mean_token_accuracy": 0.7318770587444305, "num_tokens": 15800667.0, "step": 19638 }, { "epoch": 5.201271186440678, "grad_norm": 2.7461047172546387, "learning_rate": 7.399496822033899e-06, "loss": 1.3093, "mean_token_accuracy": 0.6887490451335907, "num_tokens": 15802333.0, "step": 19640 }, { "epoch": 5.201800847457627, "grad_norm": 2.2275984287261963, "learning_rate": 7.399231991525424e-06, "loss": 1.2508, "mean_token_accuracy": 0.7086461707949638, "num_tokens": 15803915.0, "step": 19642 }, { "epoch": 5.202330508474576, "grad_norm": 2.2788615226745605, "learning_rate": 7.39896716101695e-06, "loss": 1.1361, "mean_token_accuracy": 0.7143316864967346, "num_tokens": 15805528.0, "step": 19644 }, { "epoch": 5.202860169491525, "grad_norm": 2.373711585998535, "learning_rate": 7.3987023305084745e-06, "loss": 1.585, "mean_token_accuracy": 0.6627497002482414, "num_tokens": 15807194.0, "step": 19646 }, { "epoch": 5.203389830508475, "grad_norm": 2.1795382499694824, "learning_rate": 7.398437500000001e-06, "loss": 1.167, "mean_token_accuracy": 0.7326920479536057, "num_tokens": 15809052.0, "step": 19648 }, { "epoch": 5.203919491525424, "grad_norm": 1.8421783447265625, "learning_rate": 7.398172669491526e-06, "loss": 0.9448, "mean_token_accuracy": 0.765765830874443, "num_tokens": 15810658.0, "step": 19650 }, { "epoch": 5.204449152542373, "grad_norm": 1.993768572807312, "learning_rate": 7.397907838983052e-06, "loss": 1.1916, "mean_token_accuracy": 0.7217135950922966, "num_tokens": 15812313.0, "step": 19652 }, { "epoch": 5.204978813559322, "grad_norm": 1.9627199172973633, "learning_rate": 7.397643008474577e-06, "loss": 1.5205, "mean_token_accuracy": 0.6509833782911301, "num_tokens": 15814267.0, "step": 19654 }, { "epoch": 5.205508474576272, "grad_norm": 2.186997175216675, "learning_rate": 7.3973781779661025e-06, "loss": 1.463, "mean_token_accuracy": 0.689518392086029, "num_tokens": 15815867.0, "step": 19656 }, { "epoch": 5.206038135593221, "grad_norm": 2.259648084640503, "learning_rate": 7.3971133474576274e-06, "loss": 1.4301, "mean_token_accuracy": 0.6747474297881126, "num_tokens": 15817514.0, "step": 19658 }, { "epoch": 5.2065677966101696, "grad_norm": 2.181194543838501, "learning_rate": 7.396848516949153e-06, "loss": 1.2302, "mean_token_accuracy": 0.7104089707136154, "num_tokens": 15818923.0, "step": 19660 }, { "epoch": 5.2070974576271185, "grad_norm": 2.1678531169891357, "learning_rate": 7.396583686440678e-06, "loss": 1.3895, "mean_token_accuracy": 0.6947056874632835, "num_tokens": 15820669.0, "step": 19662 }, { "epoch": 5.2076271186440675, "grad_norm": 1.8998969793319702, "learning_rate": 7.396318855932204e-06, "loss": 1.0288, "mean_token_accuracy": 0.7450947389006615, "num_tokens": 15822212.0, "step": 19664 }, { "epoch": 5.208156779661017, "grad_norm": 2.3634660243988037, "learning_rate": 7.396054025423729e-06, "loss": 1.2609, "mean_token_accuracy": 0.6990948170423508, "num_tokens": 15823666.0, "step": 19666 }, { "epoch": 5.208686440677966, "grad_norm": 2.092926263809204, "learning_rate": 7.395789194915255e-06, "loss": 1.4445, "mean_token_accuracy": 0.6672559157013893, "num_tokens": 15825241.0, "step": 19668 }, { "epoch": 5.209216101694915, "grad_norm": 2.2704453468322754, "learning_rate": 7.3955243644067796e-06, "loss": 1.2292, "mean_token_accuracy": 0.7149349451065063, "num_tokens": 15826830.0, "step": 19670 }, { "epoch": 5.209745762711864, "grad_norm": 2.230682849884033, "learning_rate": 7.395259533898305e-06, "loss": 1.3049, "mean_token_accuracy": 0.7212698236107826, "num_tokens": 15828397.0, "step": 19672 }, { "epoch": 5.210275423728813, "grad_norm": 1.8878906965255737, "learning_rate": 7.39499470338983e-06, "loss": 0.9844, "mean_token_accuracy": 0.7596374973654747, "num_tokens": 15829948.0, "step": 19674 }, { "epoch": 5.210805084745763, "grad_norm": 2.3571128845214844, "learning_rate": 7.394729872881357e-06, "loss": 1.1712, "mean_token_accuracy": 0.7170205116271973, "num_tokens": 15831871.0, "step": 19676 }, { "epoch": 5.211334745762712, "grad_norm": 2.370675563812256, "learning_rate": 7.394465042372882e-06, "loss": 1.5283, "mean_token_accuracy": 0.6646493226289749, "num_tokens": 15833521.0, "step": 19678 }, { "epoch": 5.211864406779661, "grad_norm": 2.463007926940918, "learning_rate": 7.394200211864408e-06, "loss": 1.0673, "mean_token_accuracy": 0.759408488869667, "num_tokens": 15835020.0, "step": 19680 }, { "epoch": 5.21239406779661, "grad_norm": 2.362605333328247, "learning_rate": 7.393935381355933e-06, "loss": 1.3914, "mean_token_accuracy": 0.696278028190136, "num_tokens": 15836486.0, "step": 19682 }, { "epoch": 5.21292372881356, "grad_norm": 1.97358238697052, "learning_rate": 7.393670550847458e-06, "loss": 1.293, "mean_token_accuracy": 0.6889037415385246, "num_tokens": 15838234.0, "step": 19684 }, { "epoch": 5.213453389830509, "grad_norm": 2.1097140312194824, "learning_rate": 7.393405720338984e-06, "loss": 1.6031, "mean_token_accuracy": 0.6369205042719841, "num_tokens": 15840585.0, "step": 19686 }, { "epoch": 5.213983050847458, "grad_norm": 2.6144120693206787, "learning_rate": 7.393140889830509e-06, "loss": 1.4577, "mean_token_accuracy": 0.6818841248750687, "num_tokens": 15842157.0, "step": 19688 }, { "epoch": 5.214512711864407, "grad_norm": 2.2105140686035156, "learning_rate": 7.392876059322035e-06, "loss": 1.1002, "mean_token_accuracy": 0.7473533228039742, "num_tokens": 15843736.0, "step": 19690 }, { "epoch": 5.2150423728813555, "grad_norm": 2.268866777420044, "learning_rate": 7.39261122881356e-06, "loss": 1.2407, "mean_token_accuracy": 0.7331851199269295, "num_tokens": 15845267.0, "step": 19692 }, { "epoch": 5.215572033898305, "grad_norm": 2.0103721618652344, "learning_rate": 7.3923463983050855e-06, "loss": 0.8846, "mean_token_accuracy": 0.7698512226343155, "num_tokens": 15846894.0, "step": 19694 }, { "epoch": 5.216101694915254, "grad_norm": 2.529217481613159, "learning_rate": 7.39208156779661e-06, "loss": 1.1836, "mean_token_accuracy": 0.7175217121839523, "num_tokens": 15848328.0, "step": 19696 }, { "epoch": 5.216631355932203, "grad_norm": 2.4474070072174072, "learning_rate": 7.391816737288137e-06, "loss": 1.5587, "mean_token_accuracy": 0.6677383035421371, "num_tokens": 15849890.0, "step": 19698 }, { "epoch": 5.217161016949152, "grad_norm": 2.2477774620056152, "learning_rate": 7.391551906779661e-06, "loss": 1.2503, "mean_token_accuracy": 0.6905240267515182, "num_tokens": 15851354.0, "step": 19700 }, { "epoch": 5.217690677966102, "grad_norm": 2.1218535900115967, "learning_rate": 7.391287076271188e-06, "loss": 0.9465, "mean_token_accuracy": 0.7700616270303726, "num_tokens": 15853273.0, "step": 19702 }, { "epoch": 5.218220338983051, "grad_norm": 1.7208937406539917, "learning_rate": 7.391022245762713e-06, "loss": 1.0576, "mean_token_accuracy": 0.7670306339859962, "num_tokens": 15854658.0, "step": 19704 }, { "epoch": 5.21875, "grad_norm": 2.204637050628662, "learning_rate": 7.3907574152542385e-06, "loss": 1.0565, "mean_token_accuracy": 0.7420855909585953, "num_tokens": 15856204.0, "step": 19706 }, { "epoch": 5.219279661016949, "grad_norm": 2.083723306655884, "learning_rate": 7.390492584745763e-06, "loss": 0.846, "mean_token_accuracy": 0.7959700524806976, "num_tokens": 15857744.0, "step": 19708 }, { "epoch": 5.219809322033898, "grad_norm": 1.9388245344161987, "learning_rate": 7.390227754237289e-06, "loss": 1.277, "mean_token_accuracy": 0.6995124816894531, "num_tokens": 15859230.0, "step": 19710 }, { "epoch": 5.220338983050848, "grad_norm": 2.9486095905303955, "learning_rate": 7.389962923728814e-06, "loss": 1.4899, "mean_token_accuracy": 0.6790899559855461, "num_tokens": 15860483.0, "step": 19712 }, { "epoch": 5.220868644067797, "grad_norm": 2.241630792617798, "learning_rate": 7.38969809322034e-06, "loss": 1.4923, "mean_token_accuracy": 0.6636330559849739, "num_tokens": 15862343.0, "step": 19714 }, { "epoch": 5.221398305084746, "grad_norm": 2.153885841369629, "learning_rate": 7.389433262711865e-06, "loss": 0.9268, "mean_token_accuracy": 0.7674720138311386, "num_tokens": 15864179.0, "step": 19716 }, { "epoch": 5.221927966101695, "grad_norm": 2.156790256500244, "learning_rate": 7.3891684322033906e-06, "loss": 1.3524, "mean_token_accuracy": 0.6807851865887642, "num_tokens": 15865812.0, "step": 19718 }, { "epoch": 5.2224576271186445, "grad_norm": 2.5224063396453857, "learning_rate": 7.3889036016949155e-06, "loss": 1.4461, "mean_token_accuracy": 0.6613766960799694, "num_tokens": 15867384.0, "step": 19720 }, { "epoch": 5.222987288135593, "grad_norm": 2.6208555698394775, "learning_rate": 7.388638771186441e-06, "loss": 0.9585, "mean_token_accuracy": 0.7771675437688828, "num_tokens": 15868710.0, "step": 19722 }, { "epoch": 5.223516949152542, "grad_norm": 2.302013635635376, "learning_rate": 7.388373940677966e-06, "loss": 1.0424, "mean_token_accuracy": 0.7357065230607986, "num_tokens": 15870428.0, "step": 19724 }, { "epoch": 5.224046610169491, "grad_norm": 2.1866114139556885, "learning_rate": 7.388109110169492e-06, "loss": 1.6541, "mean_token_accuracy": 0.6486168093979359, "num_tokens": 15872087.0, "step": 19726 }, { "epoch": 5.22457627118644, "grad_norm": 2.1366889476776123, "learning_rate": 7.387844279661017e-06, "loss": 1.2333, "mean_token_accuracy": 0.7320810481905937, "num_tokens": 15873849.0, "step": 19728 }, { "epoch": 5.22510593220339, "grad_norm": 2.2235400676727295, "learning_rate": 7.3875794491525435e-06, "loss": 1.1717, "mean_token_accuracy": 0.724396362900734, "num_tokens": 15875484.0, "step": 19730 }, { "epoch": 5.225635593220339, "grad_norm": 2.2099015712738037, "learning_rate": 7.3873146186440685e-06, "loss": 1.0118, "mean_token_accuracy": 0.7441714629530907, "num_tokens": 15876868.0, "step": 19732 }, { "epoch": 5.226165254237288, "grad_norm": 1.828255534172058, "learning_rate": 7.387049788135594e-06, "loss": 1.0853, "mean_token_accuracy": 0.7501916363835335, "num_tokens": 15878424.0, "step": 19734 }, { "epoch": 5.226694915254237, "grad_norm": 2.353794813156128, "learning_rate": 7.386784957627119e-06, "loss": 1.1487, "mean_token_accuracy": 0.7328694835305214, "num_tokens": 15879953.0, "step": 19736 }, { "epoch": 5.227224576271187, "grad_norm": 2.061154842376709, "learning_rate": 7.386520127118645e-06, "loss": 1.3501, "mean_token_accuracy": 0.6906467601656914, "num_tokens": 15881700.0, "step": 19738 }, { "epoch": 5.227754237288136, "grad_norm": 2.089121103286743, "learning_rate": 7.38625529661017e-06, "loss": 1.1873, "mean_token_accuracy": 0.714055173099041, "num_tokens": 15883453.0, "step": 19740 }, { "epoch": 5.228283898305085, "grad_norm": 1.958498239517212, "learning_rate": 7.385990466101696e-06, "loss": 1.4316, "mean_token_accuracy": 0.6869893595576286, "num_tokens": 15885446.0, "step": 19742 }, { "epoch": 5.228813559322034, "grad_norm": 2.2674312591552734, "learning_rate": 7.385725635593221e-06, "loss": 1.2234, "mean_token_accuracy": 0.706541433930397, "num_tokens": 15886956.0, "step": 19744 }, { "epoch": 5.229343220338983, "grad_norm": 2.430899143218994, "learning_rate": 7.385460805084746e-06, "loss": 1.317, "mean_token_accuracy": 0.7464360892772675, "num_tokens": 15888350.0, "step": 19746 }, { "epoch": 5.2298728813559325, "grad_norm": 1.9415639638900757, "learning_rate": 7.385195974576271e-06, "loss": 1.1076, "mean_token_accuracy": 0.7265623509883881, "num_tokens": 15889845.0, "step": 19748 }, { "epoch": 5.2304025423728815, "grad_norm": 2.059831142425537, "learning_rate": 7.384931144067797e-06, "loss": 0.7105, "step": 19750 }, { "epoch": 5.2304025423728815, "eval_loss": 1.3178379535675049, "eval_mean_token_accuracy": 0.7004106363112276, "eval_num_tokens": 15891396.0, "eval_runtime": 48.3096, "eval_samples_per_second": 6.376, "eval_steps_per_second": 6.376, "step": 19750 }, { "epoch": 5.2309322033898304, "grad_norm": 1.9963611364364624, "learning_rate": 7.384666313559322e-06, "loss": 1.3428, "mean_token_accuracy": 0.7499226219952106, "num_tokens": 15893345.0, "step": 19752 }, { "epoch": 5.231461864406779, "grad_norm": 2.34499192237854, "learning_rate": 7.384401483050848e-06, "loss": 1.6336, "mean_token_accuracy": 0.6413281112909317, "num_tokens": 15894937.0, "step": 19754 }, { "epoch": 5.231991525423728, "grad_norm": 2.168853282928467, "learning_rate": 7.384136652542373e-06, "loss": 1.038, "mean_token_accuracy": 0.7376021817326546, "num_tokens": 15896554.0, "step": 19756 }, { "epoch": 5.232521186440678, "grad_norm": 2.1622369289398193, "learning_rate": 7.383871822033899e-06, "loss": 1.2474, "mean_token_accuracy": 0.7114110589027405, "num_tokens": 15898292.0, "step": 19758 }, { "epoch": 5.233050847457627, "grad_norm": 1.7436609268188477, "learning_rate": 7.383606991525423e-06, "loss": 0.8157, "mean_token_accuracy": 0.7751230746507645, "num_tokens": 15900010.0, "step": 19760 }, { "epoch": 5.233580508474576, "grad_norm": 2.115985870361328, "learning_rate": 7.38334216101695e-06, "loss": 1.2956, "mean_token_accuracy": 0.7033757418394089, "num_tokens": 15901913.0, "step": 19762 }, { "epoch": 5.234110169491525, "grad_norm": 2.5961906909942627, "learning_rate": 7.383077330508475e-06, "loss": 1.3863, "mean_token_accuracy": 0.671937383711338, "num_tokens": 15903516.0, "step": 19764 }, { "epoch": 5.234639830508475, "grad_norm": 2.027658700942993, "learning_rate": 7.382812500000001e-06, "loss": 0.946, "mean_token_accuracy": 0.7599589377641678, "num_tokens": 15904955.0, "step": 19766 }, { "epoch": 5.235169491525424, "grad_norm": 2.1333861351013184, "learning_rate": 7.3825476694915265e-06, "loss": 1.3122, "mean_token_accuracy": 0.6867861077189445, "num_tokens": 15906708.0, "step": 19768 }, { "epoch": 5.235699152542373, "grad_norm": 2.348825454711914, "learning_rate": 7.3822828389830514e-06, "loss": 1.2171, "mean_token_accuracy": 0.7119208201766014, "num_tokens": 15908107.0, "step": 19770 }, { "epoch": 5.236228813559322, "grad_norm": 2.350431203842163, "learning_rate": 7.382018008474577e-06, "loss": 1.0469, "mean_token_accuracy": 0.7499758824706078, "num_tokens": 15909523.0, "step": 19772 }, { "epoch": 5.236758474576272, "grad_norm": 1.9700244665145874, "learning_rate": 7.381753177966102e-06, "loss": 1.1195, "mean_token_accuracy": 0.7367445603013039, "num_tokens": 15911077.0, "step": 19774 }, { "epoch": 5.237288135593221, "grad_norm": 2.4241416454315186, "learning_rate": 7.381488347457628e-06, "loss": 1.189, "mean_token_accuracy": 0.7226420044898987, "num_tokens": 15912533.0, "step": 19776 }, { "epoch": 5.2378177966101696, "grad_norm": 2.0563805103302, "learning_rate": 7.381223516949153e-06, "loss": 1.2656, "mean_token_accuracy": 0.7045674920082092, "num_tokens": 15914270.0, "step": 19778 }, { "epoch": 5.2383474576271185, "grad_norm": 1.980042815208435, "learning_rate": 7.380958686440679e-06, "loss": 1.0299, "mean_token_accuracy": 0.7447780147194862, "num_tokens": 15916013.0, "step": 19780 }, { "epoch": 5.2388771186440675, "grad_norm": 2.1087374687194824, "learning_rate": 7.3806938559322036e-06, "loss": 1.403, "mean_token_accuracy": 0.6851078644394875, "num_tokens": 15917497.0, "step": 19782 }, { "epoch": 5.239406779661017, "grad_norm": 1.781757116317749, "learning_rate": 7.38042902542373e-06, "loss": 0.7679, "mean_token_accuracy": 0.7983139082789421, "num_tokens": 15919094.0, "step": 19784 }, { "epoch": 5.239936440677966, "grad_norm": 2.0907142162323, "learning_rate": 7.380164194915255e-06, "loss": 1.3165, "mean_token_accuracy": 0.7170031890273094, "num_tokens": 15920652.0, "step": 19786 }, { "epoch": 5.240466101694915, "grad_norm": 2.182417154312134, "learning_rate": 7.379899364406781e-06, "loss": 1.566, "mean_token_accuracy": 0.6699328012764454, "num_tokens": 15922234.0, "step": 19788 }, { "epoch": 5.240995762711864, "grad_norm": 2.424560070037842, "learning_rate": 7.379634533898306e-06, "loss": 1.3301, "mean_token_accuracy": 0.686852753162384, "num_tokens": 15923828.0, "step": 19790 }, { "epoch": 5.241525423728813, "grad_norm": 2.7249443531036377, "learning_rate": 7.379369703389832e-06, "loss": 1.5477, "mean_token_accuracy": 0.6867872700095177, "num_tokens": 15925270.0, "step": 19792 }, { "epoch": 5.242055084745763, "grad_norm": 2.0193471908569336, "learning_rate": 7.3791048728813565e-06, "loss": 1.1967, "mean_token_accuracy": 0.7233323976397514, "num_tokens": 15926821.0, "step": 19794 }, { "epoch": 5.242584745762712, "grad_norm": 2.221754789352417, "learning_rate": 7.378840042372882e-06, "loss": 1.1245, "mean_token_accuracy": 0.7177069261670113, "num_tokens": 15928371.0, "step": 19796 }, { "epoch": 5.243114406779661, "grad_norm": 1.8370320796966553, "learning_rate": 7.378575211864407e-06, "loss": 0.9059, "mean_token_accuracy": 0.7830476835370064, "num_tokens": 15929959.0, "step": 19798 }, { "epoch": 5.24364406779661, "grad_norm": 2.463848352432251, "learning_rate": 7.378310381355933e-06, "loss": 1.327, "mean_token_accuracy": 0.7096908017992973, "num_tokens": 15931380.0, "step": 19800 }, { "epoch": 5.24417372881356, "grad_norm": 1.9695427417755127, "learning_rate": 7.378045550847458e-06, "loss": 1.1098, "mean_token_accuracy": 0.7429805025458336, "num_tokens": 15932695.0, "step": 19802 }, { "epoch": 5.244703389830509, "grad_norm": 1.4081945419311523, "learning_rate": 7.377780720338984e-06, "loss": 0.6935, "mean_token_accuracy": 0.8245587423443794, "num_tokens": 15934256.0, "step": 19804 }, { "epoch": 5.245233050847458, "grad_norm": 2.060046911239624, "learning_rate": 7.377515889830509e-06, "loss": 1.3848, "mean_token_accuracy": 0.6551498770713806, "num_tokens": 15935864.0, "step": 19806 }, { "epoch": 5.245762711864407, "grad_norm": 1.8634675741195679, "learning_rate": 7.377251059322034e-06, "loss": 1.3116, "mean_token_accuracy": 0.7484279423952103, "num_tokens": 15937319.0, "step": 19808 }, { "epoch": 5.2462923728813555, "grad_norm": 2.281383991241455, "learning_rate": 7.376986228813559e-06, "loss": 1.3295, "mean_token_accuracy": 0.6903669387102127, "num_tokens": 15939296.0, "step": 19810 }, { "epoch": 5.246822033898305, "grad_norm": 2.393616199493408, "learning_rate": 7.376721398305086e-06, "loss": 1.4532, "mean_token_accuracy": 0.6856546178460121, "num_tokens": 15940658.0, "step": 19812 }, { "epoch": 5.247351694915254, "grad_norm": 2.2819321155548096, "learning_rate": 7.37645656779661e-06, "loss": 1.7045, "mean_token_accuracy": 0.6556204259395599, "num_tokens": 15942393.0, "step": 19814 }, { "epoch": 5.247881355932203, "grad_norm": 2.469510316848755, "learning_rate": 7.376191737288137e-06, "loss": 1.0918, "mean_token_accuracy": 0.7404877096414566, "num_tokens": 15943863.0, "step": 19816 }, { "epoch": 5.248411016949152, "grad_norm": 1.8364057540893555, "learning_rate": 7.375926906779662e-06, "loss": 1.2614, "mean_token_accuracy": 0.7072346583008766, "num_tokens": 15945325.0, "step": 19818 }, { "epoch": 5.248940677966102, "grad_norm": 2.280811309814453, "learning_rate": 7.375662076271187e-06, "loss": 1.2133, "mean_token_accuracy": 0.7088525667786598, "num_tokens": 15946986.0, "step": 19820 }, { "epoch": 5.249470338983051, "grad_norm": 2.6972668170928955, "learning_rate": 7.375397245762712e-06, "loss": 1.6611, "mean_token_accuracy": 0.6112833917140961, "num_tokens": 15948591.0, "step": 19822 }, { "epoch": 5.25, "grad_norm": 1.8793376684188843, "learning_rate": 7.375132415254238e-06, "loss": 0.7741, "mean_token_accuracy": 0.80364128947258, "num_tokens": 15950099.0, "step": 19824 }, { "epoch": 5.250529661016949, "grad_norm": 2.4497616291046143, "learning_rate": 7.374867584745763e-06, "loss": 1.1766, "mean_token_accuracy": 0.7279830724000931, "num_tokens": 15951469.0, "step": 19826 }, { "epoch": 5.251059322033898, "grad_norm": 1.9309908151626587, "learning_rate": 7.374602754237289e-06, "loss": 0.8806, "mean_token_accuracy": 0.7700725719332695, "num_tokens": 15953027.0, "step": 19828 }, { "epoch": 5.251588983050848, "grad_norm": 2.072324275970459, "learning_rate": 7.374337923728814e-06, "loss": 1.2224, "mean_token_accuracy": 0.7087139189243317, "num_tokens": 15954654.0, "step": 19830 }, { "epoch": 5.252118644067797, "grad_norm": 2.1073946952819824, "learning_rate": 7.3740730932203395e-06, "loss": 0.7337, "mean_token_accuracy": 0.8193048313260078, "num_tokens": 15955951.0, "step": 19832 }, { "epoch": 5.252648305084746, "grad_norm": 2.2167131900787354, "learning_rate": 7.373808262711864e-06, "loss": 1.4203, "mean_token_accuracy": 0.6861294135451317, "num_tokens": 15957615.0, "step": 19834 }, { "epoch": 5.253177966101695, "grad_norm": 2.2173092365264893, "learning_rate": 7.37354343220339e-06, "loss": 1.1723, "mean_token_accuracy": 0.716874286532402, "num_tokens": 15958999.0, "step": 19836 }, { "epoch": 5.2537076271186445, "grad_norm": 2.3884918689727783, "learning_rate": 7.373278601694915e-06, "loss": 1.195, "mean_token_accuracy": 0.7320253551006317, "num_tokens": 15960558.0, "step": 19838 }, { "epoch": 5.254237288135593, "grad_norm": 2.4489026069641113, "learning_rate": 7.373013771186442e-06, "loss": 0.9931, "mean_token_accuracy": 0.7342450097203255, "num_tokens": 15962092.0, "step": 19840 }, { "epoch": 5.254766949152542, "grad_norm": 2.0485024452209473, "learning_rate": 7.372748940677966e-06, "loss": 1.3244, "mean_token_accuracy": 0.6883322671055794, "num_tokens": 15963994.0, "step": 19842 }, { "epoch": 5.255296610169491, "grad_norm": 1.997597575187683, "learning_rate": 7.3724841101694924e-06, "loss": 1.5081, "mean_token_accuracy": 0.6828080266714096, "num_tokens": 15965706.0, "step": 19844 }, { "epoch": 5.25582627118644, "grad_norm": 1.8641117811203003, "learning_rate": 7.372219279661017e-06, "loss": 1.444, "mean_token_accuracy": 0.6871329918503761, "num_tokens": 15967530.0, "step": 19846 }, { "epoch": 5.25635593220339, "grad_norm": 2.0968446731567383, "learning_rate": 7.371954449152543e-06, "loss": 1.38, "mean_token_accuracy": 0.7017281949520111, "num_tokens": 15969110.0, "step": 19848 }, { "epoch": 5.256885593220339, "grad_norm": 1.9291257858276367, "learning_rate": 7.371689618644068e-06, "loss": 1.0852, "mean_token_accuracy": 0.7325529381632805, "num_tokens": 15970913.0, "step": 19850 }, { "epoch": 5.257415254237288, "grad_norm": 2.005150318145752, "learning_rate": 7.371424788135594e-06, "loss": 1.0963, "mean_token_accuracy": 0.7423973456025124, "num_tokens": 15972664.0, "step": 19852 }, { "epoch": 5.257944915254237, "grad_norm": 2.335195779800415, "learning_rate": 7.37115995762712e-06, "loss": 1.338, "mean_token_accuracy": 0.7058524191379547, "num_tokens": 15974386.0, "step": 19854 }, { "epoch": 5.258474576271187, "grad_norm": 2.120809316635132, "learning_rate": 7.3708951271186446e-06, "loss": 1.2193, "mean_token_accuracy": 0.7139295190572739, "num_tokens": 15976098.0, "step": 19856 }, { "epoch": 5.259004237288136, "grad_norm": 2.218414068222046, "learning_rate": 7.37063029661017e-06, "loss": 1.1545, "mean_token_accuracy": 0.7257242649793625, "num_tokens": 15977481.0, "step": 19858 }, { "epoch": 5.259533898305085, "grad_norm": 2.5735411643981934, "learning_rate": 7.370365466101695e-06, "loss": 1.5875, "mean_token_accuracy": 0.6535889357328415, "num_tokens": 15978865.0, "step": 19860 }, { "epoch": 5.260063559322034, "grad_norm": 1.9511138200759888, "learning_rate": 7.370100635593221e-06, "loss": 1.0807, "mean_token_accuracy": 0.7613792642951012, "num_tokens": 15980502.0, "step": 19862 }, { "epoch": 5.260593220338983, "grad_norm": 2.089810848236084, "learning_rate": 7.369835805084746e-06, "loss": 1.0704, "mean_token_accuracy": 0.7271545007824898, "num_tokens": 15982179.0, "step": 19864 }, { "epoch": 5.2611228813559325, "grad_norm": 2.256063222885132, "learning_rate": 7.369570974576273e-06, "loss": 1.4349, "mean_token_accuracy": 0.7252709344029427, "num_tokens": 15983691.0, "step": 19866 }, { "epoch": 5.2616525423728815, "grad_norm": 1.8808529376983643, "learning_rate": 7.369306144067797e-06, "loss": 1.2461, "mean_token_accuracy": 0.6996837928891182, "num_tokens": 15985402.0, "step": 19868 }, { "epoch": 5.2621822033898304, "grad_norm": 2.1147301197052, "learning_rate": 7.369041313559323e-06, "loss": 1.113, "mean_token_accuracy": 0.7505576387047768, "num_tokens": 15986980.0, "step": 19870 }, { "epoch": 5.262711864406779, "grad_norm": 2.73415470123291, "learning_rate": 7.368776483050848e-06, "loss": 1.26, "mean_token_accuracy": 0.70919518917799, "num_tokens": 15988524.0, "step": 19872 }, { "epoch": 5.263241525423728, "grad_norm": 1.9569547176361084, "learning_rate": 7.368511652542374e-06, "loss": 1.2022, "mean_token_accuracy": 0.6991542056202888, "num_tokens": 15990208.0, "step": 19874 }, { "epoch": 5.263771186440678, "grad_norm": 2.043830633163452, "learning_rate": 7.368246822033899e-06, "loss": 1.0844, "mean_token_accuracy": 0.7446816563606262, "num_tokens": 15991810.0, "step": 19876 }, { "epoch": 5.264300847457627, "grad_norm": 2.351148843765259, "learning_rate": 7.367981991525425e-06, "loss": 1.0656, "mean_token_accuracy": 0.755001038312912, "num_tokens": 15993278.0, "step": 19878 }, { "epoch": 5.264830508474576, "grad_norm": 1.9054163694381714, "learning_rate": 7.36771716101695e-06, "loss": 0.8564, "mean_token_accuracy": 0.7853798791766167, "num_tokens": 15994827.0, "step": 19880 }, { "epoch": 5.265360169491525, "grad_norm": 2.488163471221924, "learning_rate": 7.367452330508475e-06, "loss": 1.6688, "mean_token_accuracy": 0.6340645924210548, "num_tokens": 15996379.0, "step": 19882 }, { "epoch": 5.265889830508475, "grad_norm": 2.146207094192505, "learning_rate": 7.3671875e-06, "loss": 1.2752, "mean_token_accuracy": 0.7098641470074654, "num_tokens": 15998044.0, "step": 19884 }, { "epoch": 5.266419491525424, "grad_norm": 2.739701271057129, "learning_rate": 7.366922669491526e-06, "loss": 1.2327, "mean_token_accuracy": 0.7560867853462696, "num_tokens": 15999507.0, "step": 19886 }, { "epoch": 5.266949152542373, "grad_norm": 2.748927354812622, "learning_rate": 7.366657838983051e-06, "loss": 1.3456, "mean_token_accuracy": 0.7104908004403114, "num_tokens": 16001118.0, "step": 19888 }, { "epoch": 5.267478813559322, "grad_norm": 2.3094658851623535, "learning_rate": 7.366393008474577e-06, "loss": 1.3278, "mean_token_accuracy": 0.7099316716194153, "num_tokens": 16002795.0, "step": 19890 }, { "epoch": 5.268008474576272, "grad_norm": 2.4743876457214355, "learning_rate": 7.366128177966102e-06, "loss": 1.4059, "mean_token_accuracy": 0.6847660206258297, "num_tokens": 16004509.0, "step": 19892 }, { "epoch": 5.268538135593221, "grad_norm": 2.1250975131988525, "learning_rate": 7.365863347457628e-06, "loss": 1.2265, "mean_token_accuracy": 0.7218897342681885, "num_tokens": 16005777.0, "step": 19894 }, { "epoch": 5.2690677966101696, "grad_norm": 2.412362575531006, "learning_rate": 7.3655985169491525e-06, "loss": 1.2035, "mean_token_accuracy": 0.7160244584083557, "num_tokens": 16007469.0, "step": 19896 }, { "epoch": 5.2695974576271185, "grad_norm": 2.3843629360198975, "learning_rate": 7.365333686440679e-06, "loss": 1.3997, "mean_token_accuracy": 0.6922706179320812, "num_tokens": 16008938.0, "step": 19898 }, { "epoch": 5.2701271186440675, "grad_norm": 1.9857358932495117, "learning_rate": 7.365068855932204e-06, "loss": 1.1948, "mean_token_accuracy": 0.6983971744775772, "num_tokens": 16010597.0, "step": 19900 }, { "epoch": 5.270656779661017, "grad_norm": 1.8767545223236084, "learning_rate": 7.36480402542373e-06, "loss": 0.9067, "mean_token_accuracy": 0.7472834438085556, "num_tokens": 16012344.0, "step": 19902 }, { "epoch": 5.271186440677966, "grad_norm": 1.839335560798645, "learning_rate": 7.364539194915255e-06, "loss": 1.4284, "mean_token_accuracy": 0.6863556951284409, "num_tokens": 16013999.0, "step": 19904 }, { "epoch": 5.271716101694915, "grad_norm": 1.8791295289993286, "learning_rate": 7.3642743644067805e-06, "loss": 0.816, "mean_token_accuracy": 0.7836445197463036, "num_tokens": 16015701.0, "step": 19906 }, { "epoch": 5.272245762711864, "grad_norm": 1.9934710264205933, "learning_rate": 7.3640095338983054e-06, "loss": 1.124, "mean_token_accuracy": 0.7213821336627007, "num_tokens": 16017402.0, "step": 19908 }, { "epoch": 5.272775423728813, "grad_norm": 1.9577912092208862, "learning_rate": 7.363744703389831e-06, "loss": 1.0412, "mean_token_accuracy": 0.747919887304306, "num_tokens": 16019109.0, "step": 19910 }, { "epoch": 5.273305084745763, "grad_norm": 1.9906158447265625, "learning_rate": 7.363479872881356e-06, "loss": 0.8505, "mean_token_accuracy": 0.8049613833427429, "num_tokens": 16020543.0, "step": 19912 }, { "epoch": 5.273834745762712, "grad_norm": 3.0964181423187256, "learning_rate": 7.363215042372882e-06, "loss": 1.0359, "mean_token_accuracy": 0.7456923127174377, "num_tokens": 16021986.0, "step": 19914 }, { "epoch": 5.274364406779661, "grad_norm": 2.461613416671753, "learning_rate": 7.362950211864407e-06, "loss": 1.4246, "mean_token_accuracy": 0.6724108532071114, "num_tokens": 16023378.0, "step": 19916 }, { "epoch": 5.27489406779661, "grad_norm": 2.117724895477295, "learning_rate": 7.362685381355933e-06, "loss": 1.0506, "mean_token_accuracy": 0.7631954923272133, "num_tokens": 16024751.0, "step": 19918 }, { "epoch": 5.27542372881356, "grad_norm": 1.9736531972885132, "learning_rate": 7.3624205508474575e-06, "loss": 1.208, "mean_token_accuracy": 0.7276200503110886, "num_tokens": 16026331.0, "step": 19920 }, { "epoch": 5.275953389830509, "grad_norm": 2.2459139823913574, "learning_rate": 7.362155720338983e-06, "loss": 0.8867, "mean_token_accuracy": 0.8039781972765923, "num_tokens": 16027640.0, "step": 19922 }, { "epoch": 5.276483050847458, "grad_norm": 1.6511021852493286, "learning_rate": 7.361890889830508e-06, "loss": 1.2552, "mean_token_accuracy": 0.7309471294283867, "num_tokens": 16029086.0, "step": 19924 }, { "epoch": 5.277012711864407, "grad_norm": 2.6303865909576416, "learning_rate": 7.361626059322035e-06, "loss": 1.3238, "mean_token_accuracy": 0.6973485499620438, "num_tokens": 16030674.0, "step": 19926 }, { "epoch": 5.2775423728813555, "grad_norm": 1.815771222114563, "learning_rate": 7.36136122881356e-06, "loss": 0.9333, "mean_token_accuracy": 0.7804180309176445, "num_tokens": 16032418.0, "step": 19928 }, { "epoch": 5.278072033898305, "grad_norm": 2.027541399002075, "learning_rate": 7.361096398305086e-06, "loss": 1.1186, "mean_token_accuracy": 0.6974921748042107, "num_tokens": 16034171.0, "step": 19930 }, { "epoch": 5.278601694915254, "grad_norm": 2.1134262084960938, "learning_rate": 7.3608315677966105e-06, "loss": 0.965, "mean_token_accuracy": 0.7709761932492256, "num_tokens": 16035633.0, "step": 19932 }, { "epoch": 5.279131355932203, "grad_norm": 2.3950469493865967, "learning_rate": 7.360566737288136e-06, "loss": 1.4861, "mean_token_accuracy": 0.6704376935958862, "num_tokens": 16037281.0, "step": 19934 }, { "epoch": 5.279661016949152, "grad_norm": 1.8663511276245117, "learning_rate": 7.360301906779662e-06, "loss": 1.2033, "mean_token_accuracy": 0.7280586063861847, "num_tokens": 16038820.0, "step": 19936 }, { "epoch": 5.280190677966102, "grad_norm": 2.2266595363616943, "learning_rate": 7.360037076271187e-06, "loss": 0.873, "mean_token_accuracy": 0.7977796494960785, "num_tokens": 16040212.0, "step": 19938 }, { "epoch": 5.280720338983051, "grad_norm": 1.898698329925537, "learning_rate": 7.359772245762713e-06, "loss": 1.0125, "mean_token_accuracy": 0.7485016360878944, "num_tokens": 16042167.0, "step": 19940 }, { "epoch": 5.28125, "grad_norm": 2.0997869968414307, "learning_rate": 7.359507415254238e-06, "loss": 1.0006, "mean_token_accuracy": 0.7535978779196739, "num_tokens": 16043514.0, "step": 19942 }, { "epoch": 5.281779661016949, "grad_norm": 1.8654671907424927, "learning_rate": 7.3592425847457635e-06, "loss": 1.3457, "mean_token_accuracy": 0.6903621256351471, "num_tokens": 16045337.0, "step": 19944 }, { "epoch": 5.282309322033898, "grad_norm": 2.4981026649475098, "learning_rate": 7.358977754237288e-06, "loss": 1.1705, "mean_token_accuracy": 0.7178790643811226, "num_tokens": 16047005.0, "step": 19946 }, { "epoch": 5.282838983050848, "grad_norm": 1.8535277843475342, "learning_rate": 7.358712923728815e-06, "loss": 0.9907, "mean_token_accuracy": 0.7530353516340256, "num_tokens": 16048680.0, "step": 19948 }, { "epoch": 5.283368644067797, "grad_norm": 1.9320358037948608, "learning_rate": 7.358448093220339e-06, "loss": 1.1571, "mean_token_accuracy": 0.7269687801599503, "num_tokens": 16050421.0, "step": 19950 }, { "epoch": 5.283898305084746, "grad_norm": 2.298659563064575, "learning_rate": 7.358183262711866e-06, "loss": 1.3967, "mean_token_accuracy": 0.6943622529506683, "num_tokens": 16051923.0, "step": 19952 }, { "epoch": 5.284427966101695, "grad_norm": 2.229435443878174, "learning_rate": 7.357918432203391e-06, "loss": 1.2762, "mean_token_accuracy": 0.7090995982289314, "num_tokens": 16053490.0, "step": 19954 }, { "epoch": 5.2849576271186445, "grad_norm": 1.9565907716751099, "learning_rate": 7.3576536016949164e-06, "loss": 0.9012, "mean_token_accuracy": 0.7928441911935806, "num_tokens": 16054880.0, "step": 19956 }, { "epoch": 5.285487288135593, "grad_norm": 2.4083261489868164, "learning_rate": 7.357388771186441e-06, "loss": 1.1082, "mean_token_accuracy": 0.7365803495049477, "num_tokens": 16056746.0, "step": 19958 }, { "epoch": 5.286016949152542, "grad_norm": 2.3102457523345947, "learning_rate": 7.357123940677967e-06, "loss": 1.2521, "mean_token_accuracy": 0.7083065658807755, "num_tokens": 16058592.0, "step": 19960 }, { "epoch": 5.286546610169491, "grad_norm": 2.598665714263916, "learning_rate": 7.356859110169492e-06, "loss": 1.7173, "mean_token_accuracy": 0.621159166097641, "num_tokens": 16060149.0, "step": 19962 }, { "epoch": 5.28707627118644, "grad_norm": 2.0229101181030273, "learning_rate": 7.356594279661018e-06, "loss": 0.8279, "mean_token_accuracy": 0.773656040430069, "num_tokens": 16061704.0, "step": 19964 }, { "epoch": 5.28760593220339, "grad_norm": 2.3926339149475098, "learning_rate": 7.356329449152543e-06, "loss": 1.5184, "mean_token_accuracy": 0.6523252353072166, "num_tokens": 16063254.0, "step": 19966 }, { "epoch": 5.288135593220339, "grad_norm": 2.434752941131592, "learning_rate": 7.3560646186440686e-06, "loss": 1.762, "mean_token_accuracy": 0.6442612558603287, "num_tokens": 16064783.0, "step": 19968 }, { "epoch": 5.288665254237288, "grad_norm": 1.5424884557724, "learning_rate": 7.3557997881355935e-06, "loss": 1.186, "mean_token_accuracy": 0.6985049992799759, "num_tokens": 16067347.0, "step": 19970 }, { "epoch": 5.289194915254237, "grad_norm": 2.314635992050171, "learning_rate": 7.355534957627119e-06, "loss": 1.4581, "mean_token_accuracy": 0.6802644208073616, "num_tokens": 16068920.0, "step": 19972 }, { "epoch": 5.289724576271187, "grad_norm": 2.6046881675720215, "learning_rate": 7.355270127118644e-06, "loss": 1.4256, "mean_token_accuracy": 0.6885782480239868, "num_tokens": 16070358.0, "step": 19974 }, { "epoch": 5.290254237288136, "grad_norm": 2.3730781078338623, "learning_rate": 7.35500529661017e-06, "loss": 0.9728, "mean_token_accuracy": 0.7734759747982025, "num_tokens": 16071755.0, "step": 19976 }, { "epoch": 5.290783898305085, "grad_norm": 2.063840866088867, "learning_rate": 7.354740466101695e-06, "loss": 1.3981, "mean_token_accuracy": 0.6705505549907684, "num_tokens": 16073261.0, "step": 19978 }, { "epoch": 5.291313559322034, "grad_norm": 2.08799409866333, "learning_rate": 7.3544756355932215e-06, "loss": 0.8974, "mean_token_accuracy": 0.7692779079079628, "num_tokens": 16074937.0, "step": 19980 }, { "epoch": 5.291843220338983, "grad_norm": 2.1046175956726074, "learning_rate": 7.3542108050847464e-06, "loss": 1.4112, "mean_token_accuracy": 0.6799125149846077, "num_tokens": 16076583.0, "step": 19982 }, { "epoch": 5.2923728813559325, "grad_norm": 2.355555295944214, "learning_rate": 7.353945974576272e-06, "loss": 1.3761, "mean_token_accuracy": 0.6987292058765888, "num_tokens": 16078194.0, "step": 19984 }, { "epoch": 5.2929025423728815, "grad_norm": 1.9613860845565796, "learning_rate": 7.353681144067797e-06, "loss": 0.9086, "mean_token_accuracy": 0.7562067434191704, "num_tokens": 16079779.0, "step": 19986 }, { "epoch": 5.2934322033898304, "grad_norm": 2.4319310188293457, "learning_rate": 7.353416313559323e-06, "loss": 1.4777, "mean_token_accuracy": 0.6682273894548416, "num_tokens": 16081255.0, "step": 19988 }, { "epoch": 5.293961864406779, "grad_norm": 1.7423310279846191, "learning_rate": 7.353151483050848e-06, "loss": 1.278, "mean_token_accuracy": 0.7005666419863701, "num_tokens": 16082935.0, "step": 19990 }, { "epoch": 5.294491525423728, "grad_norm": 1.9651532173156738, "learning_rate": 7.352886652542374e-06, "loss": 1.2738, "mean_token_accuracy": 0.7175884619355202, "num_tokens": 16084740.0, "step": 19992 }, { "epoch": 5.295021186440678, "grad_norm": 2.437441825866699, "learning_rate": 7.3526218220338986e-06, "loss": 1.0668, "mean_token_accuracy": 0.7331491187214851, "num_tokens": 16086268.0, "step": 19994 }, { "epoch": 5.295550847457627, "grad_norm": 1.8298375606536865, "learning_rate": 7.352356991525424e-06, "loss": 1.0929, "mean_token_accuracy": 0.7301054671406746, "num_tokens": 16087927.0, "step": 19996 }, { "epoch": 5.296080508474576, "grad_norm": 1.9584522247314453, "learning_rate": 7.352092161016949e-06, "loss": 1.366, "mean_token_accuracy": 0.693336583673954, "num_tokens": 16089651.0, "step": 19998 }, { "epoch": 5.296610169491525, "grad_norm": 1.9625942707061768, "learning_rate": 7.351827330508475e-06, "loss": 1.0927, "step": 20000 }, { "epoch": 5.296610169491525, "eval_loss": 1.3192505836486816, "eval_mean_token_accuracy": 0.7010013078907867, "eval_num_tokens": 16091373.0, "eval_runtime": 48.2377, "eval_samples_per_second": 6.385, "eval_steps_per_second": 6.385, "step": 20000 }, { "epoch": 5.297139830508475, "grad_norm": 2.075916051864624, "learning_rate": 7.3515625e-06, "loss": 1.0215, "mean_token_accuracy": 0.7497573420405388, "num_tokens": 16093021.0, "step": 20002 }, { "epoch": 5.297669491525424, "grad_norm": 1.795533299446106, "learning_rate": 7.351297669491526e-06, "loss": 1.1273, "mean_token_accuracy": 0.7387767098844051, "num_tokens": 16094711.0, "step": 20004 }, { "epoch": 5.298199152542373, "grad_norm": 1.5643647909164429, "learning_rate": 7.351032838983051e-06, "loss": 1.3465, "mean_token_accuracy": 0.7094603925943375, "num_tokens": 16096545.0, "step": 20006 }, { "epoch": 5.298728813559322, "grad_norm": 2.3984735012054443, "learning_rate": 7.350768008474577e-06, "loss": 1.1424, "mean_token_accuracy": 0.7253849878907204, "num_tokens": 16098095.0, "step": 20008 }, { "epoch": 5.299258474576272, "grad_norm": 2.248533010482788, "learning_rate": 7.350503177966101e-06, "loss": 1.3081, "mean_token_accuracy": 0.6942978799343109, "num_tokens": 16099558.0, "step": 20010 }, { "epoch": 5.299788135593221, "grad_norm": 1.8523683547973633, "learning_rate": 7.350238347457628e-06, "loss": 1.0743, "mean_token_accuracy": 0.7342484444379807, "num_tokens": 16101172.0, "step": 20012 }, { "epoch": 5.3003177966101696, "grad_norm": 2.4411728382110596, "learning_rate": 7.349973516949153e-06, "loss": 1.1361, "mean_token_accuracy": 0.7512761726975441, "num_tokens": 16102945.0, "step": 20014 }, { "epoch": 5.3008474576271185, "grad_norm": 2.4507782459259033, "learning_rate": 7.349708686440679e-06, "loss": 1.3514, "mean_token_accuracy": 0.6859547942876816, "num_tokens": 16104648.0, "step": 20016 }, { "epoch": 5.3013771186440675, "grad_norm": 2.2194950580596924, "learning_rate": 7.349443855932204e-06, "loss": 0.8541, "mean_token_accuracy": 0.7741219326853752, "num_tokens": 16106381.0, "step": 20018 }, { "epoch": 5.301906779661017, "grad_norm": 2.3988606929779053, "learning_rate": 7.349179025423729e-06, "loss": 1.4321, "mean_token_accuracy": 0.6967284306883812, "num_tokens": 16107991.0, "step": 20020 }, { "epoch": 5.302436440677966, "grad_norm": 2.165658950805664, "learning_rate": 7.348914194915255e-06, "loss": 1.0755, "mean_token_accuracy": 0.727384127676487, "num_tokens": 16109588.0, "step": 20022 }, { "epoch": 5.302966101694915, "grad_norm": 1.8089964389801025, "learning_rate": 7.34864936440678e-06, "loss": 1.4341, "mean_token_accuracy": 0.6510502845048904, "num_tokens": 16111212.0, "step": 20024 }, { "epoch": 5.303495762711864, "grad_norm": 2.042056083679199, "learning_rate": 7.348384533898306e-06, "loss": 1.3854, "mean_token_accuracy": 0.6737713813781738, "num_tokens": 16112943.0, "step": 20026 }, { "epoch": 5.304025423728813, "grad_norm": 2.2178549766540527, "learning_rate": 7.348119703389831e-06, "loss": 1.1589, "mean_token_accuracy": 0.7340965047478676, "num_tokens": 16114549.0, "step": 20028 }, { "epoch": 5.304555084745763, "grad_norm": 2.223019599914551, "learning_rate": 7.347854872881357e-06, "loss": 1.4842, "mean_token_accuracy": 0.699055552482605, "num_tokens": 16116130.0, "step": 20030 }, { "epoch": 5.305084745762712, "grad_norm": 2.1837141513824463, "learning_rate": 7.3475900423728815e-06, "loss": 1.3668, "mean_token_accuracy": 0.6764830127358437, "num_tokens": 16117673.0, "step": 20032 }, { "epoch": 5.305614406779661, "grad_norm": 2.6349618434906006, "learning_rate": 7.347325211864408e-06, "loss": 1.3359, "mean_token_accuracy": 0.6805906370282173, "num_tokens": 16119254.0, "step": 20034 }, { "epoch": 5.30614406779661, "grad_norm": 2.1554691791534424, "learning_rate": 7.347060381355933e-06, "loss": 0.7468, "mean_token_accuracy": 0.7868416234850883, "num_tokens": 16120835.0, "step": 20036 }, { "epoch": 5.30667372881356, "grad_norm": 2.294034242630005, "learning_rate": 7.346795550847459e-06, "loss": 1.3006, "mean_token_accuracy": 0.6980474144220352, "num_tokens": 16122756.0, "step": 20038 }, { "epoch": 5.307203389830509, "grad_norm": 2.429102897644043, "learning_rate": 7.346530720338984e-06, "loss": 1.1103, "mean_token_accuracy": 0.7467395886778831, "num_tokens": 16124355.0, "step": 20040 }, { "epoch": 5.307733050847458, "grad_norm": 2.0144505500793457, "learning_rate": 7.3462658898305096e-06, "loss": 0.9362, "mean_token_accuracy": 0.7630531117320061, "num_tokens": 16125908.0, "step": 20042 }, { "epoch": 5.308262711864407, "grad_norm": 2.3144164085388184, "learning_rate": 7.3460010593220345e-06, "loss": 1.1708, "mean_token_accuracy": 0.7300219163298607, "num_tokens": 16127255.0, "step": 20044 }, { "epoch": 5.3087923728813555, "grad_norm": 2.452277898788452, "learning_rate": 7.34573622881356e-06, "loss": 0.9088, "mean_token_accuracy": 0.7779732197523117, "num_tokens": 16128546.0, "step": 20046 }, { "epoch": 5.309322033898305, "grad_norm": 1.7653120756149292, "learning_rate": 7.345471398305085e-06, "loss": 0.9583, "mean_token_accuracy": 0.7640651389956474, "num_tokens": 16130333.0, "step": 20048 }, { "epoch": 5.309851694915254, "grad_norm": 1.5056809186935425, "learning_rate": 7.345206567796611e-06, "loss": 0.9293, "mean_token_accuracy": 0.7564639821648598, "num_tokens": 16132060.0, "step": 20050 }, { "epoch": 5.310381355932203, "grad_norm": 2.1918981075286865, "learning_rate": 7.344941737288136e-06, "loss": 1.1034, "mean_token_accuracy": 0.7506833747029305, "num_tokens": 16133387.0, "step": 20052 }, { "epoch": 5.310911016949152, "grad_norm": 2.2292609214782715, "learning_rate": 7.344676906779662e-06, "loss": 1.6874, "mean_token_accuracy": 0.6426575630903244, "num_tokens": 16135024.0, "step": 20054 }, { "epoch": 5.311440677966102, "grad_norm": 2.1688332557678223, "learning_rate": 7.344412076271187e-06, "loss": 1.3168, "mean_token_accuracy": 0.7079777419567108, "num_tokens": 16136691.0, "step": 20056 }, { "epoch": 5.311970338983051, "grad_norm": 2.2867090702056885, "learning_rate": 7.344147245762712e-06, "loss": 1.6332, "mean_token_accuracy": 0.6381418630480766, "num_tokens": 16138447.0, "step": 20058 }, { "epoch": 5.3125, "grad_norm": 2.5092644691467285, "learning_rate": 7.343882415254237e-06, "loss": 1.7058, "mean_token_accuracy": 0.6169495731592178, "num_tokens": 16140160.0, "step": 20060 }, { "epoch": 5.313029661016949, "grad_norm": 2.348554849624634, "learning_rate": 7.343617584745764e-06, "loss": 1.4422, "mean_token_accuracy": 0.7032334953546524, "num_tokens": 16141597.0, "step": 20062 }, { "epoch": 5.313559322033898, "grad_norm": 2.1537444591522217, "learning_rate": 7.343352754237288e-06, "loss": 1.6728, "mean_token_accuracy": 0.6192451231181622, "num_tokens": 16143238.0, "step": 20064 }, { "epoch": 5.314088983050848, "grad_norm": 1.881313443183899, "learning_rate": 7.343087923728815e-06, "loss": 1.0107, "mean_token_accuracy": 0.7407799661159515, "num_tokens": 16144832.0, "step": 20066 }, { "epoch": 5.314618644067797, "grad_norm": 2.3568668365478516, "learning_rate": 7.3428230932203396e-06, "loss": 1.2054, "mean_token_accuracy": 0.7290723323822021, "num_tokens": 16146074.0, "step": 20068 }, { "epoch": 5.315148305084746, "grad_norm": 2.3013157844543457, "learning_rate": 7.342558262711865e-06, "loss": 1.0946, "mean_token_accuracy": 0.7271734848618507, "num_tokens": 16147811.0, "step": 20070 }, { "epoch": 5.315677966101695, "grad_norm": 2.5105979442596436, "learning_rate": 7.34229343220339e-06, "loss": 1.2822, "mean_token_accuracy": 0.7035906985402107, "num_tokens": 16149488.0, "step": 20072 }, { "epoch": 5.3162076271186445, "grad_norm": 2.542085886001587, "learning_rate": 7.342028601694916e-06, "loss": 1.4782, "mean_token_accuracy": 0.6709574162960052, "num_tokens": 16151069.0, "step": 20074 }, { "epoch": 5.316737288135593, "grad_norm": 2.555666446685791, "learning_rate": 7.341763771186441e-06, "loss": 1.3978, "mean_token_accuracy": 0.7074404433369637, "num_tokens": 16152586.0, "step": 20076 }, { "epoch": 5.317266949152542, "grad_norm": 1.7050753831863403, "learning_rate": 7.341498940677967e-06, "loss": 1.3095, "mean_token_accuracy": 0.6933433711528778, "num_tokens": 16154606.0, "step": 20078 }, { "epoch": 5.317796610169491, "grad_norm": 2.311330795288086, "learning_rate": 7.341234110169492e-06, "loss": 1.1426, "mean_token_accuracy": 0.7352366372942924, "num_tokens": 16156168.0, "step": 20080 }, { "epoch": 5.31832627118644, "grad_norm": 1.6975125074386597, "learning_rate": 7.3409692796610175e-06, "loss": 0.9101, "mean_token_accuracy": 0.7772964462637901, "num_tokens": 16157715.0, "step": 20082 }, { "epoch": 5.31885593220339, "grad_norm": 2.9102110862731934, "learning_rate": 7.340704449152542e-06, "loss": 0.8436, "mean_token_accuracy": 0.7905043810606003, "num_tokens": 16159339.0, "step": 20084 }, { "epoch": 5.319385593220339, "grad_norm": 2.1820483207702637, "learning_rate": 7.340439618644068e-06, "loss": 1.1993, "mean_token_accuracy": 0.7347153052687645, "num_tokens": 16161016.0, "step": 20086 }, { "epoch": 5.319915254237288, "grad_norm": 2.2411086559295654, "learning_rate": 7.340174788135593e-06, "loss": 1.4776, "mean_token_accuracy": 0.6745846904814243, "num_tokens": 16162668.0, "step": 20088 }, { "epoch": 5.320444915254237, "grad_norm": 2.020136833190918, "learning_rate": 7.33990995762712e-06, "loss": 1.0011, "mean_token_accuracy": 0.7348015531897545, "num_tokens": 16164321.0, "step": 20090 }, { "epoch": 5.320974576271187, "grad_norm": 2.32568621635437, "learning_rate": 7.339645127118644e-06, "loss": 1.0541, "mean_token_accuracy": 0.74540626257658, "num_tokens": 16165959.0, "step": 20092 }, { "epoch": 5.321504237288136, "grad_norm": 1.953705906867981, "learning_rate": 7.3393802966101704e-06, "loss": 1.2683, "mean_token_accuracy": 0.6915549784898758, "num_tokens": 16167650.0, "step": 20094 }, { "epoch": 5.322033898305085, "grad_norm": 2.2218592166900635, "learning_rate": 7.339115466101695e-06, "loss": 1.1813, "mean_token_accuracy": 0.733308732509613, "num_tokens": 16169035.0, "step": 20096 }, { "epoch": 5.322563559322034, "grad_norm": 2.61336088180542, "learning_rate": 7.338850635593221e-06, "loss": 1.4871, "mean_token_accuracy": 0.6663430705666542, "num_tokens": 16170541.0, "step": 20098 }, { "epoch": 5.323093220338983, "grad_norm": 2.0791423320770264, "learning_rate": 7.338585805084746e-06, "loss": 0.9874, "mean_token_accuracy": 0.7535531520843506, "num_tokens": 16172232.0, "step": 20100 }, { "epoch": 5.3236228813559325, "grad_norm": 2.345405340194702, "learning_rate": 7.338320974576272e-06, "loss": 1.1199, "mean_token_accuracy": 0.7385035902261734, "num_tokens": 16173929.0, "step": 20102 }, { "epoch": 5.3241525423728815, "grad_norm": 1.8986320495605469, "learning_rate": 7.338056144067798e-06, "loss": 1.1065, "mean_token_accuracy": 0.7347486540675163, "num_tokens": 16175518.0, "step": 20104 }, { "epoch": 5.3246822033898304, "grad_norm": 1.9158446788787842, "learning_rate": 7.3377913135593225e-06, "loss": 0.8866, "mean_token_accuracy": 0.7831480279564857, "num_tokens": 16176987.0, "step": 20106 }, { "epoch": 5.325211864406779, "grad_norm": 2.3024046421051025, "learning_rate": 7.337526483050848e-06, "loss": 1.4159, "mean_token_accuracy": 0.6774830669164658, "num_tokens": 16178647.0, "step": 20108 }, { "epoch": 5.325741525423728, "grad_norm": 2.154832601547241, "learning_rate": 7.337261652542373e-06, "loss": 1.3836, "mean_token_accuracy": 0.711165577173233, "num_tokens": 16180239.0, "step": 20110 }, { "epoch": 5.326271186440678, "grad_norm": 1.7998868227005005, "learning_rate": 7.336996822033899e-06, "loss": 0.7238, "mean_token_accuracy": 0.809543713927269, "num_tokens": 16181748.0, "step": 20112 }, { "epoch": 5.326800847457627, "grad_norm": 1.8472046852111816, "learning_rate": 7.336731991525424e-06, "loss": 1.0011, "mean_token_accuracy": 0.7444690689444542, "num_tokens": 16183697.0, "step": 20114 }, { "epoch": 5.327330508474576, "grad_norm": 2.217529058456421, "learning_rate": 7.336467161016951e-06, "loss": 0.8511, "mean_token_accuracy": 0.7743188664317131, "num_tokens": 16185149.0, "step": 20116 }, { "epoch": 5.327860169491525, "grad_norm": 2.0985450744628906, "learning_rate": 7.336202330508475e-06, "loss": 1.0261, "mean_token_accuracy": 0.7569302618503571, "num_tokens": 16186864.0, "step": 20118 }, { "epoch": 5.328389830508475, "grad_norm": 2.046948194503784, "learning_rate": 7.335937500000001e-06, "loss": 0.93, "mean_token_accuracy": 0.7699044570326805, "num_tokens": 16188217.0, "step": 20120 }, { "epoch": 5.328919491525424, "grad_norm": 2.147505760192871, "learning_rate": 7.335672669491526e-06, "loss": 1.1396, "mean_token_accuracy": 0.6981754824519157, "num_tokens": 16190605.0, "step": 20122 }, { "epoch": 5.329449152542373, "grad_norm": 2.8096814155578613, "learning_rate": 7.335407838983052e-06, "loss": 1.1755, "mean_token_accuracy": 0.7213925719261169, "num_tokens": 16191951.0, "step": 20124 }, { "epoch": 5.329978813559322, "grad_norm": 2.483177661895752, "learning_rate": 7.335143008474577e-06, "loss": 1.2746, "mean_token_accuracy": 0.6786698624491692, "num_tokens": 16193361.0, "step": 20126 }, { "epoch": 5.330508474576272, "grad_norm": 2.1367998123168945, "learning_rate": 7.334878177966103e-06, "loss": 1.0389, "mean_token_accuracy": 0.7626582309603691, "num_tokens": 16195043.0, "step": 20128 }, { "epoch": 5.331038135593221, "grad_norm": 2.1850666999816895, "learning_rate": 7.334613347457628e-06, "loss": 1.6091, "mean_token_accuracy": 0.6586113385856152, "num_tokens": 16196644.0, "step": 20130 }, { "epoch": 5.3315677966101696, "grad_norm": 2.163466453552246, "learning_rate": 7.334348516949153e-06, "loss": 1.5564, "mean_token_accuracy": 0.667020320892334, "num_tokens": 16198248.0, "step": 20132 }, { "epoch": 5.3320974576271185, "grad_norm": 1.8981916904449463, "learning_rate": 7.334083686440678e-06, "loss": 1.1317, "mean_token_accuracy": 0.7160368263721466, "num_tokens": 16199946.0, "step": 20134 }, { "epoch": 5.3326271186440675, "grad_norm": 2.3314621448516846, "learning_rate": 7.333818855932204e-06, "loss": 0.8722, "mean_token_accuracy": 0.7856775671243668, "num_tokens": 16201365.0, "step": 20136 }, { "epoch": 5.333156779661017, "grad_norm": 1.8752362728118896, "learning_rate": 7.333554025423729e-06, "loss": 1.1547, "mean_token_accuracy": 0.7396951913833618, "num_tokens": 16202954.0, "step": 20138 }, { "epoch": 5.333686440677966, "grad_norm": 2.2993004322052, "learning_rate": 7.333289194915255e-06, "loss": 1.3658, "mean_token_accuracy": 0.6830096989870071, "num_tokens": 16204487.0, "step": 20140 }, { "epoch": 5.334216101694915, "grad_norm": 1.8956801891326904, "learning_rate": 7.33302436440678e-06, "loss": 0.9422, "mean_token_accuracy": 0.7677347287535667, "num_tokens": 16206201.0, "step": 20142 }, { "epoch": 5.334745762711864, "grad_norm": 1.8779921531677246, "learning_rate": 7.332759533898306e-06, "loss": 1.1212, "mean_token_accuracy": 0.7394026815891266, "num_tokens": 16208193.0, "step": 20144 }, { "epoch": 5.335275423728813, "grad_norm": 2.0919692516326904, "learning_rate": 7.3324947033898304e-06, "loss": 1.1603, "mean_token_accuracy": 0.7273555397987366, "num_tokens": 16209851.0, "step": 20146 }, { "epoch": 5.335805084745763, "grad_norm": 1.868586540222168, "learning_rate": 7.332229872881357e-06, "loss": 1.238, "mean_token_accuracy": 0.6987104192376137, "num_tokens": 16211441.0, "step": 20148 }, { "epoch": 5.336334745762712, "grad_norm": 1.9635136127471924, "learning_rate": 7.331965042372882e-06, "loss": 0.9171, "mean_token_accuracy": 0.763537235558033, "num_tokens": 16212814.0, "step": 20150 }, { "epoch": 5.336864406779661, "grad_norm": 2.1541149616241455, "learning_rate": 7.331700211864408e-06, "loss": 1.0371, "mean_token_accuracy": 0.748231053352356, "num_tokens": 16214459.0, "step": 20152 }, { "epoch": 5.33739406779661, "grad_norm": 2.524080753326416, "learning_rate": 7.331435381355933e-06, "loss": 1.4732, "mean_token_accuracy": 0.6691538244485855, "num_tokens": 16215833.0, "step": 20154 }, { "epoch": 5.33792372881356, "grad_norm": 2.3539764881134033, "learning_rate": 7.3311705508474585e-06, "loss": 1.4341, "mean_token_accuracy": 0.676802970468998, "num_tokens": 16217768.0, "step": 20156 }, { "epoch": 5.338453389830509, "grad_norm": 2.423649787902832, "learning_rate": 7.330905720338983e-06, "loss": 1.2198, "mean_token_accuracy": 0.7088637351989746, "num_tokens": 16219237.0, "step": 20158 }, { "epoch": 5.338983050847458, "grad_norm": 2.0031676292419434, "learning_rate": 7.330640889830509e-06, "loss": 1.0809, "mean_token_accuracy": 0.7229005470871925, "num_tokens": 16221024.0, "step": 20160 }, { "epoch": 5.339512711864407, "grad_norm": 2.132629632949829, "learning_rate": 7.330376059322034e-06, "loss": 1.4013, "mean_token_accuracy": 0.7112872526049614, "num_tokens": 16222636.0, "step": 20162 }, { "epoch": 5.3400423728813555, "grad_norm": 1.956208348274231, "learning_rate": 7.33011122881356e-06, "loss": 0.9523, "mean_token_accuracy": 0.7668873369693756, "num_tokens": 16224293.0, "step": 20164 }, { "epoch": 5.340572033898305, "grad_norm": 2.1321592330932617, "learning_rate": 7.329846398305085e-06, "loss": 1.7131, "mean_token_accuracy": 0.6311028674244881, "num_tokens": 16225988.0, "step": 20166 }, { "epoch": 5.341101694915254, "grad_norm": 2.159867763519287, "learning_rate": 7.329581567796611e-06, "loss": 1.2309, "mean_token_accuracy": 0.695551685988903, "num_tokens": 16227581.0, "step": 20168 }, { "epoch": 5.341631355932203, "grad_norm": 2.4350531101226807, "learning_rate": 7.3293167372881355e-06, "loss": 1.7865, "mean_token_accuracy": 0.6057430654764175, "num_tokens": 16229206.0, "step": 20170 }, { "epoch": 5.342161016949152, "grad_norm": 1.8451184034347534, "learning_rate": 7.329051906779661e-06, "loss": 0.9959, "mean_token_accuracy": 0.7596136331558228, "num_tokens": 16230679.0, "step": 20172 }, { "epoch": 5.342690677966102, "grad_norm": 2.3742458820343018, "learning_rate": 7.328787076271186e-06, "loss": 1.5555, "mean_token_accuracy": 0.6703148372471333, "num_tokens": 16232270.0, "step": 20174 }, { "epoch": 5.343220338983051, "grad_norm": 2.575451612472534, "learning_rate": 7.328522245762713e-06, "loss": 1.625, "mean_token_accuracy": 0.6587567739188671, "num_tokens": 16234095.0, "step": 20176 }, { "epoch": 5.34375, "grad_norm": 2.1991653442382812, "learning_rate": 7.328257415254238e-06, "loss": 1.4168, "mean_token_accuracy": 0.6906446814537048, "num_tokens": 16235570.0, "step": 20178 }, { "epoch": 5.344279661016949, "grad_norm": 2.1866226196289062, "learning_rate": 7.3279925847457636e-06, "loss": 1.3397, "mean_token_accuracy": 0.6951371282339096, "num_tokens": 16236845.0, "step": 20180 }, { "epoch": 5.344809322033898, "grad_norm": 2.5998778343200684, "learning_rate": 7.3277277542372885e-06, "loss": 1.407, "mean_token_accuracy": 0.6917030178010464, "num_tokens": 16238280.0, "step": 20182 }, { "epoch": 5.345338983050848, "grad_norm": 1.872769832611084, "learning_rate": 7.327462923728814e-06, "loss": 0.9666, "mean_token_accuracy": 0.7703470662236214, "num_tokens": 16239749.0, "step": 20184 }, { "epoch": 5.345868644067797, "grad_norm": 2.8370954990386963, "learning_rate": 7.327198093220339e-06, "loss": 1.2088, "mean_token_accuracy": 0.7194001376628876, "num_tokens": 16241225.0, "step": 20186 }, { "epoch": 5.346398305084746, "grad_norm": 2.697047710418701, "learning_rate": 7.326933262711865e-06, "loss": 1.4966, "mean_token_accuracy": 0.6789742335677147, "num_tokens": 16242782.0, "step": 20188 }, { "epoch": 5.346927966101695, "grad_norm": 2.0903234481811523, "learning_rate": 7.326668432203391e-06, "loss": 0.9679, "mean_token_accuracy": 0.7654585316777229, "num_tokens": 16244185.0, "step": 20190 }, { "epoch": 5.3474576271186445, "grad_norm": 1.9657174348831177, "learning_rate": 7.326403601694916e-06, "loss": 1.3068, "mean_token_accuracy": 0.7040540054440498, "num_tokens": 16245903.0, "step": 20192 }, { "epoch": 5.347987288135593, "grad_norm": 2.021125316619873, "learning_rate": 7.3261387711864415e-06, "loss": 1.1772, "mean_token_accuracy": 0.7304525226354599, "num_tokens": 16247264.0, "step": 20194 }, { "epoch": 5.348516949152542, "grad_norm": 1.8254939317703247, "learning_rate": 7.325873940677966e-06, "loss": 1.0059, "mean_token_accuracy": 0.7486254796385765, "num_tokens": 16249051.0, "step": 20196 }, { "epoch": 5.349046610169491, "grad_norm": 1.7056952714920044, "learning_rate": 7.325609110169493e-06, "loss": 0.9672, "mean_token_accuracy": 0.7424894869327545, "num_tokens": 16250875.0, "step": 20198 }, { "epoch": 5.34957627118644, "grad_norm": 2.4185574054718018, "learning_rate": 7.325344279661017e-06, "loss": 1.6381, "mean_token_accuracy": 0.6319563463330269, "num_tokens": 16252453.0, "step": 20200 }, { "epoch": 5.35010593220339, "grad_norm": 2.3212106227874756, "learning_rate": 7.325079449152544e-06, "loss": 0.8958, "mean_token_accuracy": 0.7693294808268547, "num_tokens": 16253881.0, "step": 20202 }, { "epoch": 5.350635593220339, "grad_norm": 2.150040626525879, "learning_rate": 7.324814618644069e-06, "loss": 1.0138, "mean_token_accuracy": 0.7683482766151428, "num_tokens": 16255309.0, "step": 20204 }, { "epoch": 5.351165254237288, "grad_norm": 1.6064443588256836, "learning_rate": 7.324549788135594e-06, "loss": 1.0636, "mean_token_accuracy": 0.7346600517630577, "num_tokens": 16256959.0, "step": 20206 }, { "epoch": 5.351694915254237, "grad_norm": 2.1290602684020996, "learning_rate": 7.324284957627119e-06, "loss": 1.5673, "mean_token_accuracy": 0.6456954628229141, "num_tokens": 16258704.0, "step": 20208 }, { "epoch": 5.352224576271187, "grad_norm": 2.051177740097046, "learning_rate": 7.324020127118645e-06, "loss": 1.4016, "mean_token_accuracy": 0.6935143247246742, "num_tokens": 16260139.0, "step": 20210 }, { "epoch": 5.352754237288136, "grad_norm": 2.4107913970947266, "learning_rate": 7.32375529661017e-06, "loss": 0.946, "mean_token_accuracy": 0.774183489382267, "num_tokens": 16261570.0, "step": 20212 }, { "epoch": 5.353283898305085, "grad_norm": 2.085162878036499, "learning_rate": 7.323490466101696e-06, "loss": 0.6944, "mean_token_accuracy": 0.8251702040433884, "num_tokens": 16263163.0, "step": 20214 }, { "epoch": 5.353813559322034, "grad_norm": 2.3242762088775635, "learning_rate": 7.323225635593221e-06, "loss": 1.3725, "mean_token_accuracy": 0.6819996535778046, "num_tokens": 16264667.0, "step": 20216 }, { "epoch": 5.354343220338983, "grad_norm": 2.4093523025512695, "learning_rate": 7.3229608050847465e-06, "loss": 1.1801, "mean_token_accuracy": 0.7485297620296478, "num_tokens": 16266329.0, "step": 20218 }, { "epoch": 5.3548728813559325, "grad_norm": 1.6415855884552002, "learning_rate": 7.3226959745762715e-06, "loss": 0.9835, "mean_token_accuracy": 0.7660426422953606, "num_tokens": 16268277.0, "step": 20220 }, { "epoch": 5.3554025423728815, "grad_norm": 2.274632692337036, "learning_rate": 7.322431144067797e-06, "loss": 0.9149, "mean_token_accuracy": 0.7423791289329529, "num_tokens": 16269785.0, "step": 20222 }, { "epoch": 5.3559322033898304, "grad_norm": 2.106412649154663, "learning_rate": 7.322166313559322e-06, "loss": 0.9402, "mean_token_accuracy": 0.7795155271887779, "num_tokens": 16271453.0, "step": 20224 }, { "epoch": 5.356461864406779, "grad_norm": 1.6730122566223145, "learning_rate": 7.321901483050848e-06, "loss": 0.8062, "mean_token_accuracy": 0.8005995526909828, "num_tokens": 16273173.0, "step": 20226 }, { "epoch": 5.356991525423728, "grad_norm": 3.0262374877929688, "learning_rate": 7.321636652542373e-06, "loss": 1.4435, "mean_token_accuracy": 0.6642847582697868, "num_tokens": 16274634.0, "step": 20228 }, { "epoch": 5.357521186440678, "grad_norm": 2.2412073612213135, "learning_rate": 7.3213718220338995e-06, "loss": 1.2135, "mean_token_accuracy": 0.7198351323604584, "num_tokens": 16276084.0, "step": 20230 }, { "epoch": 5.358050847457627, "grad_norm": 1.8392255306243896, "learning_rate": 7.321106991525424e-06, "loss": 0.9884, "mean_token_accuracy": 0.7492872029542923, "num_tokens": 16277542.0, "step": 20232 }, { "epoch": 5.358580508474576, "grad_norm": 2.285656690597534, "learning_rate": 7.32084216101695e-06, "loss": 1.0449, "mean_token_accuracy": 0.7451512217521667, "num_tokens": 16278989.0, "step": 20234 }, { "epoch": 5.359110169491525, "grad_norm": 2.2912771701812744, "learning_rate": 7.320577330508475e-06, "loss": 1.1841, "mean_token_accuracy": 0.7176805585622787, "num_tokens": 16280503.0, "step": 20236 }, { "epoch": 5.359639830508475, "grad_norm": 2.1257708072662354, "learning_rate": 7.320312500000001e-06, "loss": 1.2283, "mean_token_accuracy": 0.7626183554530144, "num_tokens": 16282197.0, "step": 20238 }, { "epoch": 5.360169491525424, "grad_norm": 1.4963182210922241, "learning_rate": 7.320047669491526e-06, "loss": 0.9044, "mean_token_accuracy": 0.7734221369028091, "num_tokens": 16284057.0, "step": 20240 }, { "epoch": 5.360699152542373, "grad_norm": 2.3909292221069336, "learning_rate": 7.319782838983052e-06, "loss": 1.2679, "mean_token_accuracy": 0.7013892307877541, "num_tokens": 16285403.0, "step": 20242 }, { "epoch": 5.361228813559322, "grad_norm": 2.3654086589813232, "learning_rate": 7.3195180084745765e-06, "loss": 0.9379, "mean_token_accuracy": 0.7679660841822624, "num_tokens": 16287001.0, "step": 20244 }, { "epoch": 5.361758474576272, "grad_norm": 1.8191120624542236, "learning_rate": 7.319253177966102e-06, "loss": 1.0841, "mean_token_accuracy": 0.7339030280709267, "num_tokens": 16288768.0, "step": 20246 }, { "epoch": 5.362288135593221, "grad_norm": 2.6895527839660645, "learning_rate": 7.318988347457627e-06, "loss": 1.1698, "mean_token_accuracy": 0.7188672125339508, "num_tokens": 16290236.0, "step": 20248 }, { "epoch": 5.3628177966101696, "grad_norm": 1.8148181438446045, "learning_rate": 7.318723516949153e-06, "loss": 0.9445, "step": 20250 }, { "epoch": 5.3628177966101696, "eval_loss": 1.3200453519821167, "eval_mean_token_accuracy": 0.7000935583726152, "eval_num_tokens": 16291634.0, "eval_runtime": 48.2983, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 20250 }, { "epoch": 5.3633474576271185, "grad_norm": 2.3759231567382812, "learning_rate": 7.318458686440678e-06, "loss": 1.2866, "mean_token_accuracy": 0.7428971864283085, "num_tokens": 16293289.0, "step": 20252 }, { "epoch": 5.3638771186440675, "grad_norm": 2.334146738052368, "learning_rate": 7.318193855932204e-06, "loss": 1.2635, "mean_token_accuracy": 0.707380436360836, "num_tokens": 16295136.0, "step": 20254 }, { "epoch": 5.364406779661017, "grad_norm": 2.4715466499328613, "learning_rate": 7.317929025423729e-06, "loss": 1.2355, "mean_token_accuracy": 0.7097470238804817, "num_tokens": 16296765.0, "step": 20256 }, { "epoch": 5.364936440677966, "grad_norm": 2.395942211151123, "learning_rate": 7.317664194915255e-06, "loss": 1.2731, "mean_token_accuracy": 0.7109387814998627, "num_tokens": 16298318.0, "step": 20258 }, { "epoch": 5.365466101694915, "grad_norm": 2.2681641578674316, "learning_rate": 7.317399364406779e-06, "loss": 1.0847, "mean_token_accuracy": 0.7394709512591362, "num_tokens": 16299941.0, "step": 20260 }, { "epoch": 5.365995762711864, "grad_norm": 2.342125177383423, "learning_rate": 7.317134533898306e-06, "loss": 1.1817, "mean_token_accuracy": 0.7285058796405792, "num_tokens": 16301544.0, "step": 20262 }, { "epoch": 5.366525423728813, "grad_norm": 2.0517077445983887, "learning_rate": 7.316869703389831e-06, "loss": 1.2418, "mean_token_accuracy": 0.710011251270771, "num_tokens": 16303221.0, "step": 20264 }, { "epoch": 5.367055084745763, "grad_norm": 2.446476936340332, "learning_rate": 7.316604872881357e-06, "loss": 1.0786, "mean_token_accuracy": 0.7569676227867603, "num_tokens": 16304663.0, "step": 20266 }, { "epoch": 5.367584745762712, "grad_norm": 1.9036946296691895, "learning_rate": 7.316340042372882e-06, "loss": 0.6172, "mean_token_accuracy": 0.8314977884292603, "num_tokens": 16306135.0, "step": 20268 }, { "epoch": 5.368114406779661, "grad_norm": 2.1152594089508057, "learning_rate": 7.316075211864407e-06, "loss": 1.1972, "mean_token_accuracy": 0.7181534171104431, "num_tokens": 16307797.0, "step": 20270 }, { "epoch": 5.36864406779661, "grad_norm": 1.8185774087905884, "learning_rate": 7.315810381355933e-06, "loss": 0.9959, "mean_token_accuracy": 0.7485488206148148, "num_tokens": 16309404.0, "step": 20272 }, { "epoch": 5.36917372881356, "grad_norm": 1.614372968673706, "learning_rate": 7.315545550847458e-06, "loss": 0.8819, "mean_token_accuracy": 0.7463415861129761, "num_tokens": 16311290.0, "step": 20274 }, { "epoch": 5.369703389830509, "grad_norm": 2.681356191635132, "learning_rate": 7.315280720338984e-06, "loss": 1.4571, "mean_token_accuracy": 0.6883005127310753, "num_tokens": 16312718.0, "step": 20276 }, { "epoch": 5.370233050847458, "grad_norm": 2.172929048538208, "learning_rate": 7.315015889830509e-06, "loss": 1.3229, "mean_token_accuracy": 0.7148624509572983, "num_tokens": 16314251.0, "step": 20278 }, { "epoch": 5.370762711864407, "grad_norm": 1.8067060708999634, "learning_rate": 7.314751059322035e-06, "loss": 0.8975, "mean_token_accuracy": 0.7731766551733017, "num_tokens": 16315761.0, "step": 20280 }, { "epoch": 5.3712923728813555, "grad_norm": 1.7720756530761719, "learning_rate": 7.3144862288135595e-06, "loss": 1.4192, "mean_token_accuracy": 0.6712600067257881, "num_tokens": 16317407.0, "step": 20282 }, { "epoch": 5.371822033898305, "grad_norm": 2.706197738647461, "learning_rate": 7.314221398305086e-06, "loss": 1.2366, "mean_token_accuracy": 0.7024243175983429, "num_tokens": 16318962.0, "step": 20284 }, { "epoch": 5.372351694915254, "grad_norm": 2.483898401260376, "learning_rate": 7.313956567796611e-06, "loss": 1.0039, "mean_token_accuracy": 0.7479526773095131, "num_tokens": 16320857.0, "step": 20286 }, { "epoch": 5.372881355932203, "grad_norm": 2.1237809658050537, "learning_rate": 7.313691737288137e-06, "loss": 1.154, "mean_token_accuracy": 0.7074446752667427, "num_tokens": 16322367.0, "step": 20288 }, { "epoch": 5.373411016949152, "grad_norm": 1.984139323234558, "learning_rate": 7.313426906779662e-06, "loss": 0.8632, "mean_token_accuracy": 0.7848859280347824, "num_tokens": 16324080.0, "step": 20290 }, { "epoch": 5.373940677966102, "grad_norm": 1.8377118110656738, "learning_rate": 7.3131620762711875e-06, "loss": 1.1361, "mean_token_accuracy": 0.7263264693319798, "num_tokens": 16325816.0, "step": 20292 }, { "epoch": 5.374470338983051, "grad_norm": 2.402700901031494, "learning_rate": 7.3128972457627125e-06, "loss": 1.5097, "mean_token_accuracy": 0.6713954582810402, "num_tokens": 16327368.0, "step": 20294 }, { "epoch": 5.375, "grad_norm": 2.318526268005371, "learning_rate": 7.312632415254238e-06, "loss": 1.3708, "mean_token_accuracy": 0.6760357022285461, "num_tokens": 16329349.0, "step": 20296 }, { "epoch": 5.375529661016949, "grad_norm": 2.3127264976501465, "learning_rate": 7.312367584745763e-06, "loss": 1.505, "mean_token_accuracy": 0.6805401295423508, "num_tokens": 16330835.0, "step": 20298 }, { "epoch": 5.376059322033898, "grad_norm": 2.3227953910827637, "learning_rate": 7.312102754237289e-06, "loss": 1.3325, "mean_token_accuracy": 0.6864764615893364, "num_tokens": 16332549.0, "step": 20300 }, { "epoch": 5.376588983050848, "grad_norm": 1.6343165636062622, "learning_rate": 7.311837923728814e-06, "loss": 0.9662, "mean_token_accuracy": 0.7507064566016197, "num_tokens": 16334343.0, "step": 20302 }, { "epoch": 5.377118644067797, "grad_norm": 2.155636787414551, "learning_rate": 7.31157309322034e-06, "loss": 0.7332, "mean_token_accuracy": 0.802703931927681, "num_tokens": 16335760.0, "step": 20304 }, { "epoch": 5.377648305084746, "grad_norm": 1.9659608602523804, "learning_rate": 7.311308262711865e-06, "loss": 1.1481, "mean_token_accuracy": 0.7311660125851631, "num_tokens": 16337415.0, "step": 20306 }, { "epoch": 5.378177966101695, "grad_norm": 2.110079050064087, "learning_rate": 7.31104343220339e-06, "loss": 1.4398, "mean_token_accuracy": 0.6879005283117294, "num_tokens": 16339150.0, "step": 20308 }, { "epoch": 5.3787076271186445, "grad_norm": 2.315059185028076, "learning_rate": 7.310778601694915e-06, "loss": 1.2182, "mean_token_accuracy": 0.7373615279793739, "num_tokens": 16340862.0, "step": 20310 }, { "epoch": 5.379237288135593, "grad_norm": 2.1852266788482666, "learning_rate": 7.310513771186442e-06, "loss": 1.2526, "mean_token_accuracy": 0.7327246591448784, "num_tokens": 16342490.0, "step": 20312 }, { "epoch": 5.379766949152542, "grad_norm": 2.3351330757141113, "learning_rate": 7.310248940677966e-06, "loss": 1.3288, "mean_token_accuracy": 0.7067956626415253, "num_tokens": 16343962.0, "step": 20314 }, { "epoch": 5.380296610169491, "grad_norm": 2.222334384918213, "learning_rate": 7.309984110169493e-06, "loss": 1.3776, "mean_token_accuracy": 0.697897881269455, "num_tokens": 16345652.0, "step": 20316 }, { "epoch": 5.38082627118644, "grad_norm": 2.515578269958496, "learning_rate": 7.3097192796610176e-06, "loss": 1.3499, "mean_token_accuracy": 0.6857071071863174, "num_tokens": 16347386.0, "step": 20318 }, { "epoch": 5.38135593220339, "grad_norm": 2.3256311416625977, "learning_rate": 7.309454449152543e-06, "loss": 1.191, "mean_token_accuracy": 0.7473171725869179, "num_tokens": 16349032.0, "step": 20320 }, { "epoch": 5.381885593220339, "grad_norm": 2.2267301082611084, "learning_rate": 7.309189618644068e-06, "loss": 1.4856, "mean_token_accuracy": 0.6528586633503437, "num_tokens": 16350861.0, "step": 20322 }, { "epoch": 5.382415254237288, "grad_norm": 2.577669858932495, "learning_rate": 7.308924788135594e-06, "loss": 1.7081, "mean_token_accuracy": 0.6256306022405624, "num_tokens": 16352665.0, "step": 20324 }, { "epoch": 5.382944915254237, "grad_norm": 2.2183213233947754, "learning_rate": 7.308659957627119e-06, "loss": 1.2792, "mean_token_accuracy": 0.680941067636013, "num_tokens": 16354350.0, "step": 20326 }, { "epoch": 5.383474576271187, "grad_norm": 2.399961471557617, "learning_rate": 7.308395127118645e-06, "loss": 1.4884, "mean_token_accuracy": 0.688807662576437, "num_tokens": 16355847.0, "step": 20328 }, { "epoch": 5.384004237288136, "grad_norm": 2.0404281616210938, "learning_rate": 7.30813029661017e-06, "loss": 1.1191, "mean_token_accuracy": 0.7488311603665352, "num_tokens": 16357324.0, "step": 20330 }, { "epoch": 5.384533898305085, "grad_norm": 1.994710922241211, "learning_rate": 7.3078654661016954e-06, "loss": 1.6962, "mean_token_accuracy": 0.6563901528716087, "num_tokens": 16358976.0, "step": 20332 }, { "epoch": 5.385063559322034, "grad_norm": 2.1966047286987305, "learning_rate": 7.30760063559322e-06, "loss": 1.0505, "mean_token_accuracy": 0.7482728511095047, "num_tokens": 16360503.0, "step": 20334 }, { "epoch": 5.385593220338983, "grad_norm": 2.6854119300842285, "learning_rate": 7.307335805084746e-06, "loss": 1.3446, "mean_token_accuracy": 0.7117393463850021, "num_tokens": 16361993.0, "step": 20336 }, { "epoch": 5.3861228813559325, "grad_norm": 2.3877110481262207, "learning_rate": 7.307070974576271e-06, "loss": 1.3157, "mean_token_accuracy": 0.6987144127488136, "num_tokens": 16363315.0, "step": 20338 }, { "epoch": 5.3866525423728815, "grad_norm": 2.3838253021240234, "learning_rate": 7.306806144067798e-06, "loss": 1.3681, "mean_token_accuracy": 0.6929144933819771, "num_tokens": 16364893.0, "step": 20340 }, { "epoch": 5.3871822033898304, "grad_norm": 2.4458513259887695, "learning_rate": 7.306541313559322e-06, "loss": 1.2822, "mean_token_accuracy": 0.7224450558423996, "num_tokens": 16366311.0, "step": 20342 }, { "epoch": 5.387711864406779, "grad_norm": 2.611701726913452, "learning_rate": 7.306276483050848e-06, "loss": 1.486, "mean_token_accuracy": 0.6788466274738312, "num_tokens": 16367837.0, "step": 20344 }, { "epoch": 5.388241525423728, "grad_norm": 2.215562105178833, "learning_rate": 7.306011652542373e-06, "loss": 1.5147, "mean_token_accuracy": 0.6492459215223789, "num_tokens": 16369484.0, "step": 20346 }, { "epoch": 5.388771186440678, "grad_norm": 1.792700171470642, "learning_rate": 7.305746822033899e-06, "loss": 1.0513, "mean_token_accuracy": 0.7623699605464935, "num_tokens": 16370743.0, "step": 20348 }, { "epoch": 5.389300847457627, "grad_norm": 2.475522756576538, "learning_rate": 7.305481991525424e-06, "loss": 1.2337, "mean_token_accuracy": 0.7272417843341827, "num_tokens": 16372413.0, "step": 20350 }, { "epoch": 5.389830508474576, "grad_norm": 2.172941207885742, "learning_rate": 7.30521716101695e-06, "loss": 1.2552, "mean_token_accuracy": 0.7062276303768158, "num_tokens": 16374160.0, "step": 20352 }, { "epoch": 5.390360169491525, "grad_norm": 1.9903976917266846, "learning_rate": 7.304952330508475e-06, "loss": 1.3244, "mean_token_accuracy": 0.6751456782221794, "num_tokens": 16375912.0, "step": 20354 }, { "epoch": 5.390889830508475, "grad_norm": 2.078695297241211, "learning_rate": 7.3046875000000005e-06, "loss": 1.3211, "mean_token_accuracy": 0.667637899518013, "num_tokens": 16377568.0, "step": 20356 }, { "epoch": 5.391419491525424, "grad_norm": 2.504012107849121, "learning_rate": 7.304422669491526e-06, "loss": 1.3945, "mean_token_accuracy": 0.6845457851886749, "num_tokens": 16379100.0, "step": 20358 }, { "epoch": 5.391949152542373, "grad_norm": 2.218082904815674, "learning_rate": 7.304157838983051e-06, "loss": 0.9984, "mean_token_accuracy": 0.7615512236952782, "num_tokens": 16380728.0, "step": 20360 }, { "epoch": 5.392478813559322, "grad_norm": 1.9492042064666748, "learning_rate": 7.303893008474577e-06, "loss": 0.8037, "mean_token_accuracy": 0.7840084433555603, "num_tokens": 16382408.0, "step": 20362 }, { "epoch": 5.393008474576272, "grad_norm": 2.2370004653930664, "learning_rate": 7.303628177966102e-06, "loss": 1.2069, "mean_token_accuracy": 0.7098169773817062, "num_tokens": 16384167.0, "step": 20364 }, { "epoch": 5.393538135593221, "grad_norm": 2.2598941326141357, "learning_rate": 7.3033633474576286e-06, "loss": 1.0632, "mean_token_accuracy": 0.7321010679006577, "num_tokens": 16386016.0, "step": 20366 }, { "epoch": 5.3940677966101696, "grad_norm": 1.9336744546890259, "learning_rate": 7.303098516949153e-06, "loss": 1.3143, "mean_token_accuracy": 0.7179071754217148, "num_tokens": 16387687.0, "step": 20368 }, { "epoch": 5.3945974576271185, "grad_norm": 1.8378721475601196, "learning_rate": 7.302833686440679e-06, "loss": 1.1317, "mean_token_accuracy": 0.7287060096859932, "num_tokens": 16390243.0, "step": 20370 }, { "epoch": 5.3951271186440675, "grad_norm": 2.4306366443634033, "learning_rate": 7.302568855932204e-06, "loss": 1.4058, "mean_token_accuracy": 0.6528586447238922, "num_tokens": 16392007.0, "step": 20372 }, { "epoch": 5.395656779661017, "grad_norm": 2.365070104598999, "learning_rate": 7.30230402542373e-06, "loss": 1.2453, "mean_token_accuracy": 0.711399756371975, "num_tokens": 16393470.0, "step": 20374 }, { "epoch": 5.396186440677966, "grad_norm": 2.143552541732788, "learning_rate": 7.302039194915255e-06, "loss": 1.363, "mean_token_accuracy": 0.7081584557890892, "num_tokens": 16394923.0, "step": 20376 }, { "epoch": 5.396716101694915, "grad_norm": 2.1552205085754395, "learning_rate": 7.301774364406781e-06, "loss": 1.0661, "mean_token_accuracy": 0.7581203579902649, "num_tokens": 16396617.0, "step": 20378 }, { "epoch": 5.397245762711864, "grad_norm": 1.91434907913208, "learning_rate": 7.301509533898306e-06, "loss": 1.4161, "mean_token_accuracy": 0.6829769536852837, "num_tokens": 16398482.0, "step": 20380 }, { "epoch": 5.397775423728813, "grad_norm": 2.050745725631714, "learning_rate": 7.301244703389831e-06, "loss": 0.8936, "mean_token_accuracy": 0.7693339735269547, "num_tokens": 16399963.0, "step": 20382 }, { "epoch": 5.398305084745763, "grad_norm": 2.280500888824463, "learning_rate": 7.300979872881356e-06, "loss": 1.2744, "mean_token_accuracy": 0.712689720094204, "num_tokens": 16401827.0, "step": 20384 }, { "epoch": 5.398834745762712, "grad_norm": 2.1904337406158447, "learning_rate": 7.300715042372882e-06, "loss": 1.2932, "mean_token_accuracy": 0.7089649215340614, "num_tokens": 16403433.0, "step": 20386 }, { "epoch": 5.399364406779661, "grad_norm": 2.4560863971710205, "learning_rate": 7.300450211864407e-06, "loss": 1.0712, "mean_token_accuracy": 0.7496760338544846, "num_tokens": 16404859.0, "step": 20388 }, { "epoch": 5.39989406779661, "grad_norm": 2.098916530609131, "learning_rate": 7.300185381355933e-06, "loss": 1.0933, "mean_token_accuracy": 0.7485375702381134, "num_tokens": 16406538.0, "step": 20390 }, { "epoch": 5.40042372881356, "grad_norm": 1.5978996753692627, "learning_rate": 7.299920550847458e-06, "loss": 1.0712, "mean_token_accuracy": 0.7527976632118225, "num_tokens": 16408369.0, "step": 20392 }, { "epoch": 5.400953389830509, "grad_norm": 2.121504068374634, "learning_rate": 7.299655720338984e-06, "loss": 1.5223, "mean_token_accuracy": 0.6672214791178703, "num_tokens": 16410024.0, "step": 20394 }, { "epoch": 5.401483050847458, "grad_norm": 2.5405571460723877, "learning_rate": 7.2993908898305084e-06, "loss": 1.6042, "mean_token_accuracy": 0.6507923230528831, "num_tokens": 16411616.0, "step": 20396 }, { "epoch": 5.402012711864407, "grad_norm": 2.603200674057007, "learning_rate": 7.299126059322035e-06, "loss": 1.3386, "mean_token_accuracy": 0.7083009108901024, "num_tokens": 16413021.0, "step": 20398 }, { "epoch": 5.4025423728813555, "grad_norm": 2.5953826904296875, "learning_rate": 7.29886122881356e-06, "loss": 1.3082, "mean_token_accuracy": 0.7050704285502434, "num_tokens": 16414629.0, "step": 20400 }, { "epoch": 5.403072033898305, "grad_norm": 2.8171164989471436, "learning_rate": 7.298596398305086e-06, "loss": 1.7297, "mean_token_accuracy": 0.646635890007019, "num_tokens": 16416189.0, "step": 20402 }, { "epoch": 5.403601694915254, "grad_norm": 2.206479549407959, "learning_rate": 7.298331567796611e-06, "loss": 1.2592, "mean_token_accuracy": 0.7017704322934151, "num_tokens": 16417726.0, "step": 20404 }, { "epoch": 5.404131355932203, "grad_norm": 2.1887803077697754, "learning_rate": 7.2980667372881365e-06, "loss": 1.0144, "mean_token_accuracy": 0.7539114952087402, "num_tokens": 16419392.0, "step": 20406 }, { "epoch": 5.404661016949152, "grad_norm": 2.5921685695648193, "learning_rate": 7.297801906779661e-06, "loss": 1.0861, "mean_token_accuracy": 0.7550502866506577, "num_tokens": 16420799.0, "step": 20408 }, { "epoch": 5.405190677966102, "grad_norm": 2.1858150959014893, "learning_rate": 7.297537076271187e-06, "loss": 0.8937, "mean_token_accuracy": 0.7821315079927444, "num_tokens": 16422220.0, "step": 20410 }, { "epoch": 5.405720338983051, "grad_norm": 2.4433743953704834, "learning_rate": 7.297272245762712e-06, "loss": 1.3896, "mean_token_accuracy": 0.6890036761760712, "num_tokens": 16423880.0, "step": 20412 }, { "epoch": 5.40625, "grad_norm": 2.535219430923462, "learning_rate": 7.297007415254238e-06, "loss": 1.222, "mean_token_accuracy": 0.7186720371246338, "num_tokens": 16425285.0, "step": 20414 }, { "epoch": 5.406779661016949, "grad_norm": 2.5212740898132324, "learning_rate": 7.296742584745763e-06, "loss": 1.139, "mean_token_accuracy": 0.7367008626461029, "num_tokens": 16426655.0, "step": 20416 }, { "epoch": 5.407309322033898, "grad_norm": 2.6431944370269775, "learning_rate": 7.2964777542372886e-06, "loss": 0.9425, "mean_token_accuracy": 0.7679663151502609, "num_tokens": 16428092.0, "step": 20418 }, { "epoch": 5.407838983050848, "grad_norm": 2.548933982849121, "learning_rate": 7.2962129237288135e-06, "loss": 1.231, "mean_token_accuracy": 0.7196787670254707, "num_tokens": 16429758.0, "step": 20420 }, { "epoch": 5.408368644067797, "grad_norm": 2.7087888717651367, "learning_rate": 7.295948093220339e-06, "loss": 1.2698, "mean_token_accuracy": 0.7195518836379051, "num_tokens": 16431201.0, "step": 20422 }, { "epoch": 5.408898305084746, "grad_norm": 2.1876373291015625, "learning_rate": 7.295683262711864e-06, "loss": 1.3163, "mean_token_accuracy": 0.7065581977367401, "num_tokens": 16432896.0, "step": 20424 }, { "epoch": 5.409427966101695, "grad_norm": 1.9482648372650146, "learning_rate": 7.295418432203391e-06, "loss": 1.1152, "mean_token_accuracy": 0.7419458776712418, "num_tokens": 16434380.0, "step": 20426 }, { "epoch": 5.4099576271186445, "grad_norm": 2.4129693508148193, "learning_rate": 7.295153601694916e-06, "loss": 0.88, "mean_token_accuracy": 0.7988357990980148, "num_tokens": 16435821.0, "step": 20428 }, { "epoch": 5.410487288135593, "grad_norm": 2.2418062686920166, "learning_rate": 7.2948887711864415e-06, "loss": 1.2107, "mean_token_accuracy": 0.7084772288799286, "num_tokens": 16437331.0, "step": 20430 }, { "epoch": 5.411016949152542, "grad_norm": 1.9915140867233276, "learning_rate": 7.2946239406779665e-06, "loss": 1.3232, "mean_token_accuracy": 0.7229564264416695, "num_tokens": 16438858.0, "step": 20432 }, { "epoch": 5.411546610169491, "grad_norm": 1.864814043045044, "learning_rate": 7.294359110169492e-06, "loss": 0.9757, "mean_token_accuracy": 0.7541156485676765, "num_tokens": 16440596.0, "step": 20434 }, { "epoch": 5.41207627118644, "grad_norm": 2.1907472610473633, "learning_rate": 7.294094279661017e-06, "loss": 1.0422, "mean_token_accuracy": 0.7425516545772552, "num_tokens": 16442145.0, "step": 20436 }, { "epoch": 5.41260593220339, "grad_norm": 2.7629294395446777, "learning_rate": 7.293829449152543e-06, "loss": 1.3793, "mean_token_accuracy": 0.7027052640914917, "num_tokens": 16443681.0, "step": 20438 }, { "epoch": 5.413135593220339, "grad_norm": 2.28365421295166, "learning_rate": 7.293564618644068e-06, "loss": 1.2296, "mean_token_accuracy": 0.7041827253997326, "num_tokens": 16445420.0, "step": 20440 }, { "epoch": 5.413665254237288, "grad_norm": 2.46105694770813, "learning_rate": 7.293299788135594e-06, "loss": 1.4558, "mean_token_accuracy": 0.6876535192131996, "num_tokens": 16446846.0, "step": 20442 }, { "epoch": 5.414194915254237, "grad_norm": 2.454599618911743, "learning_rate": 7.2930349576271194e-06, "loss": 1.4203, "mean_token_accuracy": 0.6612526476383209, "num_tokens": 16449057.0, "step": 20444 }, { "epoch": 5.414724576271187, "grad_norm": 2.018721342086792, "learning_rate": 7.292770127118644e-06, "loss": 0.7372, "mean_token_accuracy": 0.8195840790867805, "num_tokens": 16450420.0, "step": 20446 }, { "epoch": 5.415254237288136, "grad_norm": 2.2331249713897705, "learning_rate": 7.292505296610171e-06, "loss": 1.1742, "mean_token_accuracy": 0.714270330965519, "num_tokens": 16452127.0, "step": 20448 }, { "epoch": 5.415783898305085, "grad_norm": 2.1824512481689453, "learning_rate": 7.292240466101695e-06, "loss": 1.1216, "mean_token_accuracy": 0.7342356294393539, "num_tokens": 16453926.0, "step": 20450 }, { "epoch": 5.416313559322034, "grad_norm": 2.2841744422912598, "learning_rate": 7.291975635593222e-06, "loss": 1.2745, "mean_token_accuracy": 0.7125886902213097, "num_tokens": 16455711.0, "step": 20452 }, { "epoch": 5.416843220338983, "grad_norm": 2.2641196250915527, "learning_rate": 7.291710805084747e-06, "loss": 1.5834, "mean_token_accuracy": 0.6643470153212547, "num_tokens": 16457313.0, "step": 20454 }, { "epoch": 5.4173728813559325, "grad_norm": 1.8942034244537354, "learning_rate": 7.291445974576272e-06, "loss": 0.7534, "mean_token_accuracy": 0.8155529871582985, "num_tokens": 16458701.0, "step": 20456 }, { "epoch": 5.4179025423728815, "grad_norm": 2.3739876747131348, "learning_rate": 7.291181144067797e-06, "loss": 1.2577, "mean_token_accuracy": 0.7051874101161957, "num_tokens": 16461031.0, "step": 20458 }, { "epoch": 5.4184322033898304, "grad_norm": 2.1445672512054443, "learning_rate": 7.290916313559323e-06, "loss": 1.6352, "mean_token_accuracy": 0.6378199867904186, "num_tokens": 16462771.0, "step": 20460 }, { "epoch": 5.418961864406779, "grad_norm": 1.7696077823638916, "learning_rate": 7.290651483050848e-06, "loss": 0.8902, "mean_token_accuracy": 0.7830328643321991, "num_tokens": 16464548.0, "step": 20462 }, { "epoch": 5.419491525423728, "grad_norm": 2.200793981552124, "learning_rate": 7.290386652542374e-06, "loss": 0.8414, "mean_token_accuracy": 0.7892171666026115, "num_tokens": 16466188.0, "step": 20464 }, { "epoch": 5.420021186440678, "grad_norm": 2.1388401985168457, "learning_rate": 7.290121822033899e-06, "loss": 0.9263, "mean_token_accuracy": 0.7791878581047058, "num_tokens": 16467752.0, "step": 20466 }, { "epoch": 5.420550847457627, "grad_norm": 2.3529460430145264, "learning_rate": 7.2898569915254245e-06, "loss": 1.469, "mean_token_accuracy": 0.6847915500402451, "num_tokens": 16468997.0, "step": 20468 }, { "epoch": 5.421080508474576, "grad_norm": 1.9170629978179932, "learning_rate": 7.2895921610169494e-06, "loss": 0.9175, "mean_token_accuracy": 0.7781631126999855, "num_tokens": 16470627.0, "step": 20470 }, { "epoch": 5.421610169491525, "grad_norm": 2.33107852935791, "learning_rate": 7.289327330508475e-06, "loss": 0.9827, "mean_token_accuracy": 0.7500293403863907, "num_tokens": 16472413.0, "step": 20472 }, { "epoch": 5.422139830508475, "grad_norm": 2.6963372230529785, "learning_rate": 7.2890625e-06, "loss": 1.2613, "mean_token_accuracy": 0.6959065422415733, "num_tokens": 16473788.0, "step": 20474 }, { "epoch": 5.422669491525424, "grad_norm": 2.0411531925201416, "learning_rate": 7.288797669491526e-06, "loss": 1.329, "mean_token_accuracy": 0.7040688395500183, "num_tokens": 16475345.0, "step": 20476 }, { "epoch": 5.423199152542373, "grad_norm": 2.2730553150177, "learning_rate": 7.288532838983051e-06, "loss": 1.2252, "mean_token_accuracy": 0.7238830402493477, "num_tokens": 16476963.0, "step": 20478 }, { "epoch": 5.423728813559322, "grad_norm": 2.134420871734619, "learning_rate": 7.2882680084745775e-06, "loss": 0.8515, "mean_token_accuracy": 0.7701215296983719, "num_tokens": 16478406.0, "step": 20480 }, { "epoch": 5.424258474576272, "grad_norm": 2.1681816577911377, "learning_rate": 7.288003177966102e-06, "loss": 1.147, "mean_token_accuracy": 0.7225664407014847, "num_tokens": 16480036.0, "step": 20482 }, { "epoch": 5.424788135593221, "grad_norm": 2.11525297164917, "learning_rate": 7.287738347457628e-06, "loss": 1.593, "mean_token_accuracy": 0.6603561118245125, "num_tokens": 16481723.0, "step": 20484 }, { "epoch": 5.4253177966101696, "grad_norm": 1.997835397720337, "learning_rate": 7.287473516949153e-06, "loss": 1.1252, "mean_token_accuracy": 0.7403716631233692, "num_tokens": 16483241.0, "step": 20486 }, { "epoch": 5.4258474576271185, "grad_norm": 1.915389895439148, "learning_rate": 7.287208686440679e-06, "loss": 0.9811, "mean_token_accuracy": 0.7499656826257706, "num_tokens": 16484650.0, "step": 20488 }, { "epoch": 5.4263771186440675, "grad_norm": 2.02799391746521, "learning_rate": 7.286943855932204e-06, "loss": 1.6616, "mean_token_accuracy": 0.6425575204193592, "num_tokens": 16486447.0, "step": 20490 }, { "epoch": 5.426906779661017, "grad_norm": 2.333785057067871, "learning_rate": 7.28667902542373e-06, "loss": 1.4156, "mean_token_accuracy": 0.6883651688694954, "num_tokens": 16488044.0, "step": 20492 }, { "epoch": 5.427436440677966, "grad_norm": 2.0352020263671875, "learning_rate": 7.2864141949152545e-06, "loss": 0.8362, "mean_token_accuracy": 0.7698131576180458, "num_tokens": 16489711.0, "step": 20494 }, { "epoch": 5.427966101694915, "grad_norm": 2.104898452758789, "learning_rate": 7.28614936440678e-06, "loss": 1.3556, "mean_token_accuracy": 0.6849567294120789, "num_tokens": 16491428.0, "step": 20496 }, { "epoch": 5.428495762711864, "grad_norm": 2.368288278579712, "learning_rate": 7.285884533898305e-06, "loss": 1.5228, "mean_token_accuracy": 0.6728431843221188, "num_tokens": 16493403.0, "step": 20498 }, { "epoch": 5.429025423728813, "grad_norm": 2.5436017513275146, "learning_rate": 7.285619703389831e-06, "loss": 1.2035, "step": 20500 }, { "epoch": 5.429025423728813, "eval_loss": 1.3183705806732178, "eval_mean_token_accuracy": 0.6999604318823133, "eval_num_tokens": 16494951.0, "eval_runtime": 48.2634, "eval_samples_per_second": 6.382, "eval_steps_per_second": 6.382, "step": 20500 }, { "epoch": 5.429555084745763, "grad_norm": 2.5186259746551514, "learning_rate": 7.285354872881356e-06, "loss": 1.1838, "mean_token_accuracy": 0.7187970206141472, "num_tokens": 16496280.0, "step": 20502 }, { "epoch": 5.430084745762712, "grad_norm": 2.1558618545532227, "learning_rate": 7.285090042372882e-06, "loss": 1.497, "mean_token_accuracy": 0.6511394530534744, "num_tokens": 16497781.0, "step": 20504 }, { "epoch": 5.430614406779661, "grad_norm": 2.5995211601257324, "learning_rate": 7.284825211864407e-06, "loss": 1.6168, "mean_token_accuracy": 0.6486008167266846, "num_tokens": 16499228.0, "step": 20506 }, { "epoch": 5.43114406779661, "grad_norm": 2.2155303955078125, "learning_rate": 7.284560381355933e-06, "loss": 1.3813, "mean_token_accuracy": 0.6970654502511024, "num_tokens": 16500894.0, "step": 20508 }, { "epoch": 5.43167372881356, "grad_norm": 2.261772871017456, "learning_rate": 7.284295550847457e-06, "loss": 1.5817, "mean_token_accuracy": 0.6210664063692093, "num_tokens": 16502388.0, "step": 20510 }, { "epoch": 5.432203389830509, "grad_norm": 2.1014697551727295, "learning_rate": 7.284030720338984e-06, "loss": 1.3093, "mean_token_accuracy": 0.6932150945067406, "num_tokens": 16504157.0, "step": 20512 }, { "epoch": 5.432733050847458, "grad_norm": 2.1855275630950928, "learning_rate": 7.283765889830509e-06, "loss": 1.0803, "mean_token_accuracy": 0.7456738278269768, "num_tokens": 16505715.0, "step": 20514 }, { "epoch": 5.433262711864407, "grad_norm": 2.3437294960021973, "learning_rate": 7.283501059322035e-06, "loss": 1.3239, "mean_token_accuracy": 0.6977940872311592, "num_tokens": 16507058.0, "step": 20516 }, { "epoch": 5.4337923728813555, "grad_norm": 2.221346378326416, "learning_rate": 7.28323622881356e-06, "loss": 1.3488, "mean_token_accuracy": 0.6883370280265808, "num_tokens": 16508754.0, "step": 20518 }, { "epoch": 5.434322033898305, "grad_norm": 2.415334463119507, "learning_rate": 7.282971398305085e-06, "loss": 0.9451, "mean_token_accuracy": 0.754246212542057, "num_tokens": 16510133.0, "step": 20520 }, { "epoch": 5.434851694915254, "grad_norm": 2.2857210636138916, "learning_rate": 7.28270656779661e-06, "loss": 1.5069, "mean_token_accuracy": 0.6328911334276199, "num_tokens": 16511911.0, "step": 20522 }, { "epoch": 5.435381355932203, "grad_norm": 1.8606209754943848, "learning_rate": 7.282441737288136e-06, "loss": 1.4925, "mean_token_accuracy": 0.6531450524926186, "num_tokens": 16513664.0, "step": 20524 }, { "epoch": 5.435911016949152, "grad_norm": 1.8693093061447144, "learning_rate": 7.282176906779662e-06, "loss": 0.7912, "mean_token_accuracy": 0.7949018254876137, "num_tokens": 16515392.0, "step": 20526 }, { "epoch": 5.436440677966102, "grad_norm": 2.306105136871338, "learning_rate": 7.281912076271187e-06, "loss": 1.7127, "mean_token_accuracy": 0.6217541918158531, "num_tokens": 16517113.0, "step": 20528 }, { "epoch": 5.436970338983051, "grad_norm": 1.8658885955810547, "learning_rate": 7.2816472457627126e-06, "loss": 0.9547, "mean_token_accuracy": 0.7634756490588188, "num_tokens": 16518835.0, "step": 20530 }, { "epoch": 5.4375, "grad_norm": 2.9632155895233154, "learning_rate": 7.2813824152542375e-06, "loss": 1.2327, "mean_token_accuracy": 0.7155073434114456, "num_tokens": 16520151.0, "step": 20532 }, { "epoch": 5.438029661016949, "grad_norm": 2.312838554382324, "learning_rate": 7.281117584745764e-06, "loss": 1.1465, "mean_token_accuracy": 0.7352554351091385, "num_tokens": 16521454.0, "step": 20534 }, { "epoch": 5.438559322033898, "grad_norm": 2.0937955379486084, "learning_rate": 7.280852754237289e-06, "loss": 1.3882, "mean_token_accuracy": 0.6807350963354111, "num_tokens": 16522955.0, "step": 20536 }, { "epoch": 5.439088983050848, "grad_norm": 2.2353315353393555, "learning_rate": 7.280587923728815e-06, "loss": 1.1375, "mean_token_accuracy": 0.7170980870723724, "num_tokens": 16524584.0, "step": 20538 }, { "epoch": 5.439618644067797, "grad_norm": 2.440096616744995, "learning_rate": 7.28032309322034e-06, "loss": 1.482, "mean_token_accuracy": 0.6675298884510994, "num_tokens": 16526471.0, "step": 20540 }, { "epoch": 5.440148305084746, "grad_norm": 2.349841833114624, "learning_rate": 7.2800582627118655e-06, "loss": 1.5561, "mean_token_accuracy": 0.6647959649562836, "num_tokens": 16528344.0, "step": 20542 }, { "epoch": 5.440677966101695, "grad_norm": 2.1073615550994873, "learning_rate": 7.2797934322033905e-06, "loss": 0.9881, "mean_token_accuracy": 0.7489150390028954, "num_tokens": 16529857.0, "step": 20544 }, { "epoch": 5.4412076271186445, "grad_norm": 2.5102479457855225, "learning_rate": 7.279528601694916e-06, "loss": 1.2437, "mean_token_accuracy": 0.7160807698965073, "num_tokens": 16531303.0, "step": 20546 }, { "epoch": 5.441737288135593, "grad_norm": 2.1205480098724365, "learning_rate": 7.279263771186441e-06, "loss": 0.85, "mean_token_accuracy": 0.7785231173038483, "num_tokens": 16532738.0, "step": 20548 }, { "epoch": 5.442266949152542, "grad_norm": 2.309002161026001, "learning_rate": 7.278998940677967e-06, "loss": 1.1428, "mean_token_accuracy": 0.7254326120018959, "num_tokens": 16534631.0, "step": 20550 }, { "epoch": 5.442796610169491, "grad_norm": 2.0786640644073486, "learning_rate": 7.278734110169492e-06, "loss": 1.1327, "mean_token_accuracy": 0.74297746270895, "num_tokens": 16536086.0, "step": 20552 }, { "epoch": 5.44332627118644, "grad_norm": 2.167436361312866, "learning_rate": 7.278469279661018e-06, "loss": 1.2349, "mean_token_accuracy": 0.7336914464831352, "num_tokens": 16537735.0, "step": 20554 }, { "epoch": 5.44385593220339, "grad_norm": 2.3265836238861084, "learning_rate": 7.2782044491525426e-06, "loss": 1.1833, "mean_token_accuracy": 0.7183124870061874, "num_tokens": 16539214.0, "step": 20556 }, { "epoch": 5.444385593220339, "grad_norm": 2.4273791313171387, "learning_rate": 7.277939618644068e-06, "loss": 1.6987, "mean_token_accuracy": 0.6460024788975716, "num_tokens": 16540687.0, "step": 20558 }, { "epoch": 5.444915254237288, "grad_norm": 2.4470577239990234, "learning_rate": 7.277674788135593e-06, "loss": 1.0959, "mean_token_accuracy": 0.760899618268013, "num_tokens": 16542390.0, "step": 20560 }, { "epoch": 5.445444915254237, "grad_norm": 2.260984420776367, "learning_rate": 7.27740995762712e-06, "loss": 0.8603, "mean_token_accuracy": 0.7863893285393715, "num_tokens": 16543888.0, "step": 20562 }, { "epoch": 5.445974576271187, "grad_norm": 2.6393966674804688, "learning_rate": 7.277145127118644e-06, "loss": 1.7765, "mean_token_accuracy": 0.6564352735877037, "num_tokens": 16545457.0, "step": 20564 }, { "epoch": 5.446504237288136, "grad_norm": 1.9816612005233765, "learning_rate": 7.276880296610171e-06, "loss": 1.069, "mean_token_accuracy": 0.759238138794899, "num_tokens": 16546928.0, "step": 20566 }, { "epoch": 5.447033898305085, "grad_norm": 2.0590920448303223, "learning_rate": 7.2766154661016955e-06, "loss": 1.229, "mean_token_accuracy": 0.7153305411338806, "num_tokens": 16548579.0, "step": 20568 }, { "epoch": 5.447563559322034, "grad_norm": 2.1319682598114014, "learning_rate": 7.276350635593221e-06, "loss": 1.175, "mean_token_accuracy": 0.7125164940953255, "num_tokens": 16551166.0, "step": 20570 }, { "epoch": 5.448093220338983, "grad_norm": 2.538898468017578, "learning_rate": 7.276085805084746e-06, "loss": 1.1251, "mean_token_accuracy": 0.7338294833898544, "num_tokens": 16552690.0, "step": 20572 }, { "epoch": 5.4486228813559325, "grad_norm": 2.6834182739257812, "learning_rate": 7.275820974576272e-06, "loss": 1.7715, "mean_token_accuracy": 0.6264381520450115, "num_tokens": 16554341.0, "step": 20574 }, { "epoch": 5.4491525423728815, "grad_norm": 2.2182466983795166, "learning_rate": 7.275556144067797e-06, "loss": 1.0191, "mean_token_accuracy": 0.747993640601635, "num_tokens": 16556160.0, "step": 20576 }, { "epoch": 5.4496822033898304, "grad_norm": 2.096047878265381, "learning_rate": 7.275291313559323e-06, "loss": 1.0766, "mean_token_accuracy": 0.7371773645281792, "num_tokens": 16558060.0, "step": 20578 }, { "epoch": 5.450211864406779, "grad_norm": 1.8384296894073486, "learning_rate": 7.275026483050848e-06, "loss": 1.1104, "mean_token_accuracy": 0.7147852070629597, "num_tokens": 16559880.0, "step": 20580 }, { "epoch": 5.450741525423728, "grad_norm": 1.9069774150848389, "learning_rate": 7.2747616525423734e-06, "loss": 0.8368, "mean_token_accuracy": 0.7803240343928337, "num_tokens": 16561302.0, "step": 20582 }, { "epoch": 5.451271186440678, "grad_norm": 2.668036460876465, "learning_rate": 7.274496822033898e-06, "loss": 1.2905, "mean_token_accuracy": 0.7349220812320709, "num_tokens": 16562518.0, "step": 20584 }, { "epoch": 5.451800847457627, "grad_norm": 2.2482447624206543, "learning_rate": 7.274231991525424e-06, "loss": 1.1247, "mean_token_accuracy": 0.739480160176754, "num_tokens": 16564177.0, "step": 20586 }, { "epoch": 5.452330508474576, "grad_norm": 2.241053581237793, "learning_rate": 7.273967161016949e-06, "loss": 1.398, "mean_token_accuracy": 0.68697115406394, "num_tokens": 16565685.0, "step": 20588 }, { "epoch": 5.452860169491525, "grad_norm": 2.333341121673584, "learning_rate": 7.273702330508476e-06, "loss": 1.7463, "mean_token_accuracy": 0.6123416945338249, "num_tokens": 16567327.0, "step": 20590 }, { "epoch": 5.453389830508475, "grad_norm": 2.4119319915771484, "learning_rate": 7.2734375e-06, "loss": 1.5302, "mean_token_accuracy": 0.6582394018769264, "num_tokens": 16568832.0, "step": 20592 }, { "epoch": 5.453919491525424, "grad_norm": 2.256983518600464, "learning_rate": 7.273172669491526e-06, "loss": 1.4789, "mean_token_accuracy": 0.6561588421463966, "num_tokens": 16570506.0, "step": 20594 }, { "epoch": 5.454449152542373, "grad_norm": 1.634479284286499, "learning_rate": 7.272907838983051e-06, "loss": 0.7243, "mean_token_accuracy": 0.8042491003870964, "num_tokens": 16572130.0, "step": 20596 }, { "epoch": 5.454978813559322, "grad_norm": 2.1448683738708496, "learning_rate": 7.272643008474577e-06, "loss": 1.2153, "mean_token_accuracy": 0.7046518176794052, "num_tokens": 16573901.0, "step": 20598 }, { "epoch": 5.455508474576272, "grad_norm": 2.506999969482422, "learning_rate": 7.272378177966102e-06, "loss": 1.4706, "mean_token_accuracy": 0.7099886387586594, "num_tokens": 16575438.0, "step": 20600 }, { "epoch": 5.456038135593221, "grad_norm": 2.0378036499023438, "learning_rate": 7.272113347457628e-06, "loss": 0.9749, "mean_token_accuracy": 0.7633658275008202, "num_tokens": 16577110.0, "step": 20602 }, { "epoch": 5.4565677966101696, "grad_norm": 2.4326295852661133, "learning_rate": 7.271848516949153e-06, "loss": 1.4148, "mean_token_accuracy": 0.714009128510952, "num_tokens": 16578704.0, "step": 20604 }, { "epoch": 5.4570974576271185, "grad_norm": 2.4174225330352783, "learning_rate": 7.2715836864406785e-06, "loss": 1.0457, "mean_token_accuracy": 0.759360134601593, "num_tokens": 16580290.0, "step": 20606 }, { "epoch": 5.4576271186440675, "grad_norm": 2.3777403831481934, "learning_rate": 7.2713188559322034e-06, "loss": 1.1831, "mean_token_accuracy": 0.7272465452551842, "num_tokens": 16581852.0, "step": 20608 }, { "epoch": 5.458156779661017, "grad_norm": 2.4056472778320312, "learning_rate": 7.271054025423729e-06, "loss": 1.3192, "mean_token_accuracy": 0.7160631939768791, "num_tokens": 16583269.0, "step": 20610 }, { "epoch": 5.458686440677966, "grad_norm": 2.1096529960632324, "learning_rate": 7.270789194915255e-06, "loss": 1.2843, "mean_token_accuracy": 0.7025194689631462, "num_tokens": 16584885.0, "step": 20612 }, { "epoch": 5.459216101694915, "grad_norm": 2.461343288421631, "learning_rate": 7.27052436440678e-06, "loss": 1.5745, "mean_token_accuracy": 0.6565456911921501, "num_tokens": 16586529.0, "step": 20614 }, { "epoch": 5.459745762711864, "grad_norm": 2.345078468322754, "learning_rate": 7.2702595338983065e-06, "loss": 0.9895, "mean_token_accuracy": 0.7772925198078156, "num_tokens": 16588068.0, "step": 20616 }, { "epoch": 5.460275423728813, "grad_norm": 2.8293375968933105, "learning_rate": 7.269994703389831e-06, "loss": 1.3522, "mean_token_accuracy": 0.7035553008317947, "num_tokens": 16589697.0, "step": 20618 }, { "epoch": 5.460805084745763, "grad_norm": 1.8318251371383667, "learning_rate": 7.269729872881357e-06, "loss": 0.895, "mean_token_accuracy": 0.7847145050764084, "num_tokens": 16591324.0, "step": 20620 }, { "epoch": 5.461334745762712, "grad_norm": 1.7390244007110596, "learning_rate": 7.269465042372882e-06, "loss": 1.0665, "mean_token_accuracy": 0.7501460835337639, "num_tokens": 16592997.0, "step": 20622 }, { "epoch": 5.461864406779661, "grad_norm": 1.9219403266906738, "learning_rate": 7.269200211864408e-06, "loss": 1.2239, "mean_token_accuracy": 0.7104324847459793, "num_tokens": 16594595.0, "step": 20624 }, { "epoch": 5.46239406779661, "grad_norm": 2.2544612884521484, "learning_rate": 7.268935381355933e-06, "loss": 1.4648, "mean_token_accuracy": 0.6704114750027657, "num_tokens": 16596078.0, "step": 20626 }, { "epoch": 5.46292372881356, "grad_norm": 2.040116310119629, "learning_rate": 7.268670550847459e-06, "loss": 1.1581, "mean_token_accuracy": 0.7402101159095764, "num_tokens": 16597750.0, "step": 20628 }, { "epoch": 5.463453389830509, "grad_norm": 2.3439345359802246, "learning_rate": 7.268405720338984e-06, "loss": 1.1633, "mean_token_accuracy": 0.7147140353918076, "num_tokens": 16599305.0, "step": 20630 }, { "epoch": 5.463983050847458, "grad_norm": 1.9290266036987305, "learning_rate": 7.268140889830509e-06, "loss": 0.6691, "mean_token_accuracy": 0.8207328170537949, "num_tokens": 16600769.0, "step": 20632 }, { "epoch": 5.464512711864407, "grad_norm": 2.4031245708465576, "learning_rate": 7.267876059322034e-06, "loss": 1.0311, "mean_token_accuracy": 0.7419156357645988, "num_tokens": 16602132.0, "step": 20634 }, { "epoch": 5.4650423728813555, "grad_norm": 1.8670166730880737, "learning_rate": 7.26761122881356e-06, "loss": 1.3123, "mean_token_accuracy": 0.7273875921964645, "num_tokens": 16603981.0, "step": 20636 }, { "epoch": 5.465572033898305, "grad_norm": 1.9781949520111084, "learning_rate": 7.267346398305085e-06, "loss": 1.1385, "mean_token_accuracy": 0.7532273456454277, "num_tokens": 16605627.0, "step": 20638 }, { "epoch": 5.466101694915254, "grad_norm": 1.962679386138916, "learning_rate": 7.267081567796611e-06, "loss": 0.985, "mean_token_accuracy": 0.7714373841881752, "num_tokens": 16607388.0, "step": 20640 }, { "epoch": 5.466631355932203, "grad_norm": 2.4739394187927246, "learning_rate": 7.266816737288136e-06, "loss": 1.4498, "mean_token_accuracy": 0.7015482373535633, "num_tokens": 16608975.0, "step": 20642 }, { "epoch": 5.467161016949152, "grad_norm": 2.060274124145508, "learning_rate": 7.266551906779662e-06, "loss": 1.4074, "mean_token_accuracy": 0.6598645225167274, "num_tokens": 16610750.0, "step": 20644 }, { "epoch": 5.467690677966102, "grad_norm": 2.285639524459839, "learning_rate": 7.266287076271186e-06, "loss": 1.5967, "mean_token_accuracy": 0.6379996798932552, "num_tokens": 16612378.0, "step": 20646 }, { "epoch": 5.468220338983051, "grad_norm": 2.2790305614471436, "learning_rate": 7.266022245762713e-06, "loss": 1.2706, "mean_token_accuracy": 0.7106687650084496, "num_tokens": 16613953.0, "step": 20648 }, { "epoch": 5.46875, "grad_norm": 2.1821534633636475, "learning_rate": 7.265757415254238e-06, "loss": 1.367, "mean_token_accuracy": 0.6846985220909119, "num_tokens": 16615643.0, "step": 20650 }, { "epoch": 5.469279661016949, "grad_norm": 2.2888340950012207, "learning_rate": 7.265492584745764e-06, "loss": 1.318, "mean_token_accuracy": 0.7121654227375984, "num_tokens": 16617234.0, "step": 20652 }, { "epoch": 5.469809322033898, "grad_norm": 2.7870233058929443, "learning_rate": 7.265227754237289e-06, "loss": 1.5136, "mean_token_accuracy": 0.6529113948345184, "num_tokens": 16618783.0, "step": 20654 }, { "epoch": 5.470338983050848, "grad_norm": 2.906586170196533, "learning_rate": 7.2649629237288144e-06, "loss": 1.3529, "mean_token_accuracy": 0.6908981874585152, "num_tokens": 16620305.0, "step": 20656 }, { "epoch": 5.470868644067797, "grad_norm": 2.106198787689209, "learning_rate": 7.264698093220339e-06, "loss": 1.447, "mean_token_accuracy": 0.6670949906110764, "num_tokens": 16622021.0, "step": 20658 }, { "epoch": 5.471398305084746, "grad_norm": 1.7505606412887573, "learning_rate": 7.264433262711865e-06, "loss": 1.0216, "mean_token_accuracy": 0.7555007487535477, "num_tokens": 16623863.0, "step": 20660 }, { "epoch": 5.471927966101695, "grad_norm": 2.575606346130371, "learning_rate": 7.26416843220339e-06, "loss": 1.1832, "mean_token_accuracy": 0.7386470139026642, "num_tokens": 16625333.0, "step": 20662 }, { "epoch": 5.4724576271186445, "grad_norm": 2.141263961791992, "learning_rate": 7.263903601694916e-06, "loss": 1.1447, "mean_token_accuracy": 0.7242446616292, "num_tokens": 16627094.0, "step": 20664 }, { "epoch": 5.472987288135593, "grad_norm": 2.189728021621704, "learning_rate": 7.263638771186441e-06, "loss": 0.9471, "mean_token_accuracy": 0.766917310655117, "num_tokens": 16628283.0, "step": 20666 }, { "epoch": 5.473516949152542, "grad_norm": 2.2348411083221436, "learning_rate": 7.2633739406779666e-06, "loss": 1.5587, "mean_token_accuracy": 0.6558495834469795, "num_tokens": 16629942.0, "step": 20668 }, { "epoch": 5.474046610169491, "grad_norm": 1.884181261062622, "learning_rate": 7.2631091101694915e-06, "loss": 1.1449, "mean_token_accuracy": 0.7364080473780632, "num_tokens": 16631386.0, "step": 20670 }, { "epoch": 5.47457627118644, "grad_norm": 2.281075954437256, "learning_rate": 7.262844279661017e-06, "loss": 1.1945, "mean_token_accuracy": 0.7130686864256859, "num_tokens": 16632958.0, "step": 20672 }, { "epoch": 5.47510593220339, "grad_norm": 2.4513778686523438, "learning_rate": 7.262579449152542e-06, "loss": 1.2248, "mean_token_accuracy": 0.7500422447919846, "num_tokens": 16634449.0, "step": 20674 }, { "epoch": 5.475635593220339, "grad_norm": 2.3439109325408936, "learning_rate": 7.262314618644069e-06, "loss": 1.0125, "mean_token_accuracy": 0.7485788911581039, "num_tokens": 16636010.0, "step": 20676 }, { "epoch": 5.476165254237288, "grad_norm": 2.2478065490722656, "learning_rate": 7.262049788135594e-06, "loss": 1.3106, "mean_token_accuracy": 0.6872932314872742, "num_tokens": 16637723.0, "step": 20678 }, { "epoch": 5.476694915254237, "grad_norm": 2.7915942668914795, "learning_rate": 7.2617849576271195e-06, "loss": 1.5085, "mean_token_accuracy": 0.6799611449241638, "num_tokens": 16639211.0, "step": 20680 }, { "epoch": 5.477224576271187, "grad_norm": 1.8099381923675537, "learning_rate": 7.2615201271186444e-06, "loss": 1.1501, "mean_token_accuracy": 0.7260334119200706, "num_tokens": 16641107.0, "step": 20682 }, { "epoch": 5.477754237288136, "grad_norm": 2.43194580078125, "learning_rate": 7.26125529661017e-06, "loss": 1.3227, "mean_token_accuracy": 0.6976564303040504, "num_tokens": 16642797.0, "step": 20684 }, { "epoch": 5.478283898305085, "grad_norm": 2.295125961303711, "learning_rate": 7.260990466101695e-06, "loss": 1.523, "mean_token_accuracy": 0.6757384315133095, "num_tokens": 16644192.0, "step": 20686 }, { "epoch": 5.478813559322034, "grad_norm": 1.8112818002700806, "learning_rate": 7.260725635593221e-06, "loss": 1.1866, "mean_token_accuracy": 0.7216749042272568, "num_tokens": 16646285.0, "step": 20688 }, { "epoch": 5.479343220338983, "grad_norm": 2.8023409843444824, "learning_rate": 7.260460805084746e-06, "loss": 1.3169, "mean_token_accuracy": 0.7003970965743065, "num_tokens": 16647770.0, "step": 20690 }, { "epoch": 5.4798728813559325, "grad_norm": 2.2753968238830566, "learning_rate": 7.260195974576272e-06, "loss": 0.9403, "mean_token_accuracy": 0.7540487051010132, "num_tokens": 16649947.0, "step": 20692 }, { "epoch": 5.4804025423728815, "grad_norm": 2.1074914932250977, "learning_rate": 7.259931144067797e-06, "loss": 1.6373, "mean_token_accuracy": 0.6376981511712074, "num_tokens": 16651612.0, "step": 20694 }, { "epoch": 5.4809322033898304, "grad_norm": 2.132943630218506, "learning_rate": 7.259666313559322e-06, "loss": 1.0545, "mean_token_accuracy": 0.7344926819205284, "num_tokens": 16653142.0, "step": 20696 }, { "epoch": 5.481461864406779, "grad_norm": 2.250981330871582, "learning_rate": 7.259401483050849e-06, "loss": 1.4185, "mean_token_accuracy": 0.6747802719473839, "num_tokens": 16654789.0, "step": 20698 }, { "epoch": 5.481991525423728, "grad_norm": 2.3223650455474854, "learning_rate": 7.259136652542373e-06, "loss": 1.0315, "mean_token_accuracy": 0.751108705997467, "num_tokens": 16656309.0, "step": 20700 }, { "epoch": 5.482521186440678, "grad_norm": 1.9204074144363403, "learning_rate": 7.2588718220339e-06, "loss": 0.657, "mean_token_accuracy": 0.8237502053380013, "num_tokens": 16657874.0, "step": 20702 }, { "epoch": 5.483050847457627, "grad_norm": 2.1209630966186523, "learning_rate": 7.258606991525425e-06, "loss": 1.2397, "mean_token_accuracy": 0.7062187530100346, "num_tokens": 16659585.0, "step": 20704 }, { "epoch": 5.483580508474576, "grad_norm": 2.2836971282958984, "learning_rate": 7.25834216101695e-06, "loss": 1.0761, "mean_token_accuracy": 0.7011897787451744, "num_tokens": 16661820.0, "step": 20706 }, { "epoch": 5.484110169491525, "grad_norm": 2.732746124267578, "learning_rate": 7.258077330508475e-06, "loss": 1.3119, "mean_token_accuracy": 0.7139852344989777, "num_tokens": 16663007.0, "step": 20708 }, { "epoch": 5.484639830508475, "grad_norm": 2.9393537044525146, "learning_rate": 7.257812500000001e-06, "loss": 1.4441, "mean_token_accuracy": 0.7155895233154297, "num_tokens": 16664312.0, "step": 20710 }, { "epoch": 5.485169491525424, "grad_norm": 2.543553590774536, "learning_rate": 7.257547669491526e-06, "loss": 1.2415, "mean_token_accuracy": 0.7183052971959114, "num_tokens": 16665790.0, "step": 20712 }, { "epoch": 5.485699152542373, "grad_norm": 1.7186797857284546, "learning_rate": 7.257282838983052e-06, "loss": 0.8906, "mean_token_accuracy": 0.7620891407132149, "num_tokens": 16667354.0, "step": 20714 }, { "epoch": 5.486228813559322, "grad_norm": 2.620783567428589, "learning_rate": 7.257018008474577e-06, "loss": 1.4159, "mean_token_accuracy": 0.6610546708106995, "num_tokens": 16668850.0, "step": 20716 }, { "epoch": 5.486758474576272, "grad_norm": 2.1136419773101807, "learning_rate": 7.2567531779661025e-06, "loss": 1.742, "mean_token_accuracy": 0.6209631413221359, "num_tokens": 16670436.0, "step": 20718 }, { "epoch": 5.487288135593221, "grad_norm": 2.1375513076782227, "learning_rate": 7.256488347457627e-06, "loss": 1.6862, "mean_token_accuracy": 0.6082155480980873, "num_tokens": 16672309.0, "step": 20720 }, { "epoch": 5.4878177966101696, "grad_norm": 2.0881099700927734, "learning_rate": 7.256223516949153e-06, "loss": 1.0803, "mean_token_accuracy": 0.7287362068891525, "num_tokens": 16674142.0, "step": 20722 }, { "epoch": 5.4883474576271185, "grad_norm": 2.2532765865325928, "learning_rate": 7.255958686440678e-06, "loss": 1.206, "mean_token_accuracy": 0.7061334922909737, "num_tokens": 16675673.0, "step": 20724 }, { "epoch": 5.4888771186440675, "grad_norm": 1.9537384510040283, "learning_rate": 7.255693855932204e-06, "loss": 1.2669, "mean_token_accuracy": 0.7172282189130783, "num_tokens": 16677399.0, "step": 20726 }, { "epoch": 5.489406779661017, "grad_norm": 1.6086461544036865, "learning_rate": 7.255429025423729e-06, "loss": 0.6042, "mean_token_accuracy": 0.8216031789779663, "num_tokens": 16679122.0, "step": 20728 }, { "epoch": 5.489936440677966, "grad_norm": 2.308600425720215, "learning_rate": 7.2551641949152555e-06, "loss": 1.4044, "mean_token_accuracy": 0.6736759953200817, "num_tokens": 16680800.0, "step": 20730 }, { "epoch": 5.490466101694915, "grad_norm": 2.1904900074005127, "learning_rate": 7.25489936440678e-06, "loss": 1.2746, "mean_token_accuracy": 0.7077111601829529, "num_tokens": 16682441.0, "step": 20732 }, { "epoch": 5.490995762711864, "grad_norm": 2.0020127296447754, "learning_rate": 7.254634533898306e-06, "loss": 1.2167, "mean_token_accuracy": 0.7133559957146645, "num_tokens": 16683809.0, "step": 20734 }, { "epoch": 5.491525423728813, "grad_norm": 2.1777119636535645, "learning_rate": 7.254369703389831e-06, "loss": 1.3981, "mean_token_accuracy": 0.6780505701899529, "num_tokens": 16685834.0, "step": 20736 }, { "epoch": 5.492055084745763, "grad_norm": 1.805289626121521, "learning_rate": 7.254104872881357e-06, "loss": 0.9576, "mean_token_accuracy": 0.7628805935382843, "num_tokens": 16687232.0, "step": 20738 }, { "epoch": 5.492584745762712, "grad_norm": 2.115424633026123, "learning_rate": 7.253840042372882e-06, "loss": 1.1649, "mean_token_accuracy": 0.7307906001806259, "num_tokens": 16688902.0, "step": 20740 }, { "epoch": 5.493114406779661, "grad_norm": 2.570840358734131, "learning_rate": 7.2535752118644076e-06, "loss": 1.4665, "mean_token_accuracy": 0.6671725809574127, "num_tokens": 16690517.0, "step": 20742 }, { "epoch": 5.49364406779661, "grad_norm": 2.0890512466430664, "learning_rate": 7.2533103813559325e-06, "loss": 1.4082, "mean_token_accuracy": 0.7108738794922829, "num_tokens": 16691992.0, "step": 20744 }, { "epoch": 5.49417372881356, "grad_norm": 2.0521085262298584, "learning_rate": 7.253045550847458e-06, "loss": 1.6995, "mean_token_accuracy": 0.6547083668410778, "num_tokens": 16693869.0, "step": 20746 }, { "epoch": 5.494703389830509, "grad_norm": 2.470889091491699, "learning_rate": 7.252780720338983e-06, "loss": 1.3866, "mean_token_accuracy": 0.7018113434314728, "num_tokens": 16695500.0, "step": 20748 }, { "epoch": 5.495233050847458, "grad_norm": 2.1977739334106445, "learning_rate": 7.252515889830509e-06, "loss": 1.3301, "step": 20750 }, { "epoch": 5.495233050847458, "eval_loss": 1.3175956010818481, "eval_mean_token_accuracy": 0.6999997264379031, "eval_num_tokens": 16697093.0, "eval_runtime": 48.2756, "eval_samples_per_second": 6.38, "eval_steps_per_second": 6.38, "step": 20750 }, { "epoch": 5.495762711864407, "grad_norm": 2.0288326740264893, "learning_rate": 7.252251059322034e-06, "loss": 0.9717, "mean_token_accuracy": 0.7287415377795696, "num_tokens": 16698767.0, "step": 20752 }, { "epoch": 5.4962923728813555, "grad_norm": 1.9446032047271729, "learning_rate": 7.25198622881356e-06, "loss": 1.0849, "mean_token_accuracy": 0.7480712607502937, "num_tokens": 16700758.0, "step": 20754 }, { "epoch": 5.496822033898305, "grad_norm": 2.02179217338562, "learning_rate": 7.251721398305085e-06, "loss": 1.1132, "mean_token_accuracy": 0.7438345775008202, "num_tokens": 16702522.0, "step": 20756 }, { "epoch": 5.497351694915254, "grad_norm": 2.273297071456909, "learning_rate": 7.251456567796611e-06, "loss": 1.0154, "mean_token_accuracy": 0.7364387214183807, "num_tokens": 16704510.0, "step": 20758 }, { "epoch": 5.497881355932203, "grad_norm": 2.299410343170166, "learning_rate": 7.251191737288136e-06, "loss": 0.6641, "mean_token_accuracy": 0.8291557356715202, "num_tokens": 16705992.0, "step": 20760 }, { "epoch": 5.498411016949152, "grad_norm": 1.8544549942016602, "learning_rate": 7.250926906779662e-06, "loss": 0.9248, "mean_token_accuracy": 0.7866217344999313, "num_tokens": 16707913.0, "step": 20762 }, { "epoch": 5.498940677966102, "grad_norm": 2.052290439605713, "learning_rate": 7.250662076271187e-06, "loss": 1.3553, "mean_token_accuracy": 0.6883139759302139, "num_tokens": 16709765.0, "step": 20764 }, { "epoch": 5.499470338983051, "grad_norm": 2.09899640083313, "learning_rate": 7.250397245762713e-06, "loss": 1.2037, "mean_token_accuracy": 0.7399456650018692, "num_tokens": 16711281.0, "step": 20766 }, { "epoch": 5.5, "grad_norm": 1.941867709159851, "learning_rate": 7.250132415254238e-06, "loss": 1.1697, "mean_token_accuracy": 0.7395598217844963, "num_tokens": 16712748.0, "step": 20768 }, { "epoch": 5.500529661016949, "grad_norm": 2.4136438369750977, "learning_rate": 7.249867584745763e-06, "loss": 1.5377, "mean_token_accuracy": 0.6734738796949387, "num_tokens": 16714366.0, "step": 20770 }, { "epoch": 5.501059322033898, "grad_norm": 1.632087230682373, "learning_rate": 7.249602754237288e-06, "loss": 0.9637, "mean_token_accuracy": 0.7679935321211815, "num_tokens": 16715936.0, "step": 20772 }, { "epoch": 5.501588983050848, "grad_norm": 1.872898817062378, "learning_rate": 7.249337923728814e-06, "loss": 1.4102, "mean_token_accuracy": 0.6842779442667961, "num_tokens": 16717651.0, "step": 20774 }, { "epoch": 5.502118644067797, "grad_norm": 2.3025119304656982, "learning_rate": 7.249073093220339e-06, "loss": 1.1729, "mean_token_accuracy": 0.6958335302770138, "num_tokens": 16719273.0, "step": 20776 }, { "epoch": 5.502648305084746, "grad_norm": 2.007080316543579, "learning_rate": 7.248808262711865e-06, "loss": 1.044, "mean_token_accuracy": 0.7320975065231323, "num_tokens": 16720945.0, "step": 20778 }, { "epoch": 5.503177966101695, "grad_norm": 2.400331735610962, "learning_rate": 7.2485434322033905e-06, "loss": 1.2006, "mean_token_accuracy": 0.7056451737880707, "num_tokens": 16722485.0, "step": 20780 }, { "epoch": 5.503707627118644, "grad_norm": 2.4136879444122314, "learning_rate": 7.2482786016949155e-06, "loss": 1.0878, "mean_token_accuracy": 0.7413025051355362, "num_tokens": 16724215.0, "step": 20782 }, { "epoch": 5.504237288135593, "grad_norm": 1.9773640632629395, "learning_rate": 7.248013771186442e-06, "loss": 0.8714, "mean_token_accuracy": 0.7788233757019043, "num_tokens": 16725745.0, "step": 20784 }, { "epoch": 5.504766949152542, "grad_norm": 2.047598361968994, "learning_rate": 7.247748940677967e-06, "loss": 1.3881, "mean_token_accuracy": 0.6939719095826149, "num_tokens": 16727543.0, "step": 20786 }, { "epoch": 5.505296610169491, "grad_norm": 2.6189348697662354, "learning_rate": 7.247484110169493e-06, "loss": 1.1021, "mean_token_accuracy": 0.7286197543144226, "num_tokens": 16729035.0, "step": 20788 }, { "epoch": 5.50582627118644, "grad_norm": 2.1992416381835938, "learning_rate": 7.247219279661018e-06, "loss": 1.0821, "mean_token_accuracy": 0.7384100034832954, "num_tokens": 16730484.0, "step": 20790 }, { "epoch": 5.50635593220339, "grad_norm": 2.149517059326172, "learning_rate": 7.2469544491525435e-06, "loss": 1.3809, "mean_token_accuracy": 0.6715726181864738, "num_tokens": 16732087.0, "step": 20792 }, { "epoch": 5.506885593220339, "grad_norm": 2.823646306991577, "learning_rate": 7.2466896186440684e-06, "loss": 1.5153, "mean_token_accuracy": 0.67580346763134, "num_tokens": 16733302.0, "step": 20794 }, { "epoch": 5.507415254237288, "grad_norm": 2.2778289318084717, "learning_rate": 7.246424788135594e-06, "loss": 1.1903, "mean_token_accuracy": 0.7283270582556725, "num_tokens": 16734883.0, "step": 20796 }, { "epoch": 5.507944915254237, "grad_norm": 2.517132043838501, "learning_rate": 7.246159957627119e-06, "loss": 1.6521, "mean_token_accuracy": 0.6520582139492035, "num_tokens": 16736577.0, "step": 20798 }, { "epoch": 5.508474576271187, "grad_norm": 2.41684889793396, "learning_rate": 7.245895127118645e-06, "loss": 0.9917, "mean_token_accuracy": 0.7411763444542885, "num_tokens": 16738325.0, "step": 20800 }, { "epoch": 5.509004237288136, "grad_norm": 2.001208543777466, "learning_rate": 7.24563029661017e-06, "loss": 1.4197, "mean_token_accuracy": 0.7015884295105934, "num_tokens": 16739939.0, "step": 20802 }, { "epoch": 5.509533898305085, "grad_norm": 2.017862319946289, "learning_rate": 7.245365466101696e-06, "loss": 1.4417, "mean_token_accuracy": 0.6646317839622498, "num_tokens": 16741356.0, "step": 20804 }, { "epoch": 5.510063559322034, "grad_norm": 1.8229753971099854, "learning_rate": 7.2451006355932205e-06, "loss": 1.021, "mean_token_accuracy": 0.740434505045414, "num_tokens": 16742833.0, "step": 20806 }, { "epoch": 5.510593220338983, "grad_norm": 1.8613803386688232, "learning_rate": 7.244835805084746e-06, "loss": 0.9088, "mean_token_accuracy": 0.7906756401062012, "num_tokens": 16744475.0, "step": 20808 }, { "epoch": 5.5111228813559325, "grad_norm": 2.463595151901245, "learning_rate": 7.244570974576271e-06, "loss": 1.3101, "mean_token_accuracy": 0.691007237881422, "num_tokens": 16745906.0, "step": 20810 }, { "epoch": 5.5116525423728815, "grad_norm": 1.7811062335968018, "learning_rate": 7.244306144067798e-06, "loss": 1.283, "mean_token_accuracy": 0.7101449742913246, "num_tokens": 16747707.0, "step": 20812 }, { "epoch": 5.5121822033898304, "grad_norm": 2.020265579223633, "learning_rate": 7.244041313559323e-06, "loss": 1.4541, "mean_token_accuracy": 0.668570838868618, "num_tokens": 16749198.0, "step": 20814 }, { "epoch": 5.512711864406779, "grad_norm": 1.7735579013824463, "learning_rate": 7.243776483050849e-06, "loss": 0.8778, "mean_token_accuracy": 0.7819741368293762, "num_tokens": 16750586.0, "step": 20816 }, { "epoch": 5.513241525423728, "grad_norm": 2.0460383892059326, "learning_rate": 7.2435116525423735e-06, "loss": 1.2616, "mean_token_accuracy": 0.7175598740577698, "num_tokens": 16752357.0, "step": 20818 }, { "epoch": 5.513771186440678, "grad_norm": 2.0065245628356934, "learning_rate": 7.243246822033899e-06, "loss": 1.0053, "mean_token_accuracy": 0.7665422931313515, "num_tokens": 16753570.0, "step": 20820 }, { "epoch": 5.514300847457627, "grad_norm": 2.1465723514556885, "learning_rate": 7.242981991525424e-06, "loss": 1.0337, "mean_token_accuracy": 0.75044946372509, "num_tokens": 16755200.0, "step": 20822 }, { "epoch": 5.514830508474576, "grad_norm": 2.2438013553619385, "learning_rate": 7.24271716101695e-06, "loss": 1.0567, "mean_token_accuracy": 0.7485030069947243, "num_tokens": 16756693.0, "step": 20824 }, { "epoch": 5.515360169491525, "grad_norm": 2.267982006072998, "learning_rate": 7.242452330508475e-06, "loss": 1.3147, "mean_token_accuracy": 0.6855993419885635, "num_tokens": 16758302.0, "step": 20826 }, { "epoch": 5.515889830508475, "grad_norm": 2.1584866046905518, "learning_rate": 7.242187500000001e-06, "loss": 1.3287, "mean_token_accuracy": 0.6985074505209923, "num_tokens": 16759896.0, "step": 20828 }, { "epoch": 5.516419491525424, "grad_norm": 2.370776653289795, "learning_rate": 7.241922669491526e-06, "loss": 1.3421, "mean_token_accuracy": 0.6904264912009239, "num_tokens": 16761623.0, "step": 20830 }, { "epoch": 5.516949152542373, "grad_norm": 1.5420371294021606, "learning_rate": 7.241657838983051e-06, "loss": 1.0842, "mean_token_accuracy": 0.7556407079100609, "num_tokens": 16763558.0, "step": 20832 }, { "epoch": 5.517478813559322, "grad_norm": 2.6218676567077637, "learning_rate": 7.241393008474576e-06, "loss": 1.6012, "mean_token_accuracy": 0.6869529038667679, "num_tokens": 16765047.0, "step": 20834 }, { "epoch": 5.518008474576272, "grad_norm": 2.2028489112854004, "learning_rate": 7.241128177966102e-06, "loss": 1.0802, "mean_token_accuracy": 0.7652016431093216, "num_tokens": 16766652.0, "step": 20836 }, { "epoch": 5.518538135593221, "grad_norm": 2.4191529750823975, "learning_rate": 7.240863347457627e-06, "loss": 1.7063, "mean_token_accuracy": 0.6368048191070557, "num_tokens": 16768197.0, "step": 20838 }, { "epoch": 5.5190677966101696, "grad_norm": 2.1698405742645264, "learning_rate": 7.240598516949154e-06, "loss": 1.13, "mean_token_accuracy": 0.7421534806489944, "num_tokens": 16769743.0, "step": 20840 }, { "epoch": 5.5195974576271185, "grad_norm": 2.0128893852233887, "learning_rate": 7.240333686440678e-06, "loss": 1.2251, "mean_token_accuracy": 0.7162475138902664, "num_tokens": 16771746.0, "step": 20842 }, { "epoch": 5.5201271186440675, "grad_norm": 2.6601951122283936, "learning_rate": 7.240068855932204e-06, "loss": 1.6633, "mean_token_accuracy": 0.6712678223848343, "num_tokens": 16773258.0, "step": 20844 }, { "epoch": 5.520656779661017, "grad_norm": 2.3965723514556885, "learning_rate": 7.239804025423729e-06, "loss": 1.5097, "mean_token_accuracy": 0.6576958000659943, "num_tokens": 16774879.0, "step": 20846 }, { "epoch": 5.521186440677966, "grad_norm": 1.9408785104751587, "learning_rate": 7.239539194915255e-06, "loss": 1.2936, "mean_token_accuracy": 0.6865323930978775, "num_tokens": 16776407.0, "step": 20848 }, { "epoch": 5.521716101694915, "grad_norm": 2.378145933151245, "learning_rate": 7.23927436440678e-06, "loss": 1.5133, "mean_token_accuracy": 0.6704293191432953, "num_tokens": 16778152.0, "step": 20850 }, { "epoch": 5.522245762711864, "grad_norm": 2.3739523887634277, "learning_rate": 7.239009533898306e-06, "loss": 1.1871, "mean_token_accuracy": 0.7178256511688232, "num_tokens": 16779468.0, "step": 20852 }, { "epoch": 5.522775423728813, "grad_norm": 2.18515682220459, "learning_rate": 7.238744703389831e-06, "loss": 1.4023, "mean_token_accuracy": 0.6750001460313797, "num_tokens": 16781147.0, "step": 20854 }, { "epoch": 5.523305084745763, "grad_norm": 3.5776753425598145, "learning_rate": 7.2384798728813565e-06, "loss": 1.0678, "mean_token_accuracy": 0.7330899834632874, "num_tokens": 16782576.0, "step": 20856 }, { "epoch": 5.523834745762712, "grad_norm": 2.4228875637054443, "learning_rate": 7.238215042372881e-06, "loss": 1.1084, "mean_token_accuracy": 0.7342707365751266, "num_tokens": 16784316.0, "step": 20858 }, { "epoch": 5.524364406779661, "grad_norm": 2.741760730743408, "learning_rate": 7.237950211864407e-06, "loss": 1.2795, "mean_token_accuracy": 0.7186920568346977, "num_tokens": 16785859.0, "step": 20860 }, { "epoch": 5.52489406779661, "grad_norm": 1.963929295539856, "learning_rate": 7.237685381355933e-06, "loss": 1.3307, "mean_token_accuracy": 0.6915397495031357, "num_tokens": 16787642.0, "step": 20862 }, { "epoch": 5.52542372881356, "grad_norm": 2.3258509635925293, "learning_rate": 7.237420550847458e-06, "loss": 1.3648, "mean_token_accuracy": 0.7033788338303566, "num_tokens": 16789113.0, "step": 20864 }, { "epoch": 5.525953389830509, "grad_norm": 2.6241629123687744, "learning_rate": 7.2371557203389845e-06, "loss": 1.05, "mean_token_accuracy": 0.7473729848861694, "num_tokens": 16790479.0, "step": 20866 }, { "epoch": 5.526483050847458, "grad_norm": 2.587186813354492, "learning_rate": 7.2368908898305094e-06, "loss": 1.4299, "mean_token_accuracy": 0.7039291337132454, "num_tokens": 16792327.0, "step": 20868 }, { "epoch": 5.527012711864407, "grad_norm": 2.1417057514190674, "learning_rate": 7.236626059322035e-06, "loss": 1.6537, "mean_token_accuracy": 0.6369957774877548, "num_tokens": 16794100.0, "step": 20870 }, { "epoch": 5.527542372881356, "grad_norm": 2.1114389896392822, "learning_rate": 7.23636122881356e-06, "loss": 1.1192, "mean_token_accuracy": 0.7100357264280319, "num_tokens": 16795619.0, "step": 20872 }, { "epoch": 5.528072033898305, "grad_norm": 2.0722150802612305, "learning_rate": 7.236096398305086e-06, "loss": 1.2487, "mean_token_accuracy": 0.7124202698469162, "num_tokens": 16797162.0, "step": 20874 }, { "epoch": 5.528601694915254, "grad_norm": 2.2220709323883057, "learning_rate": 7.235831567796611e-06, "loss": 1.1317, "mean_token_accuracy": 0.7096172124147415, "num_tokens": 16798873.0, "step": 20876 }, { "epoch": 5.529131355932203, "grad_norm": 2.6996002197265625, "learning_rate": 7.235566737288137e-06, "loss": 1.5821, "mean_token_accuracy": 0.6641651540994644, "num_tokens": 16800344.0, "step": 20878 }, { "epoch": 5.529661016949152, "grad_norm": 2.15112042427063, "learning_rate": 7.2353019067796616e-06, "loss": 1.5119, "mean_token_accuracy": 0.6621092781424522, "num_tokens": 16802043.0, "step": 20880 }, { "epoch": 5.530190677966102, "grad_norm": 2.659740447998047, "learning_rate": 7.235037076271187e-06, "loss": 1.0981, "mean_token_accuracy": 0.7251229807734489, "num_tokens": 16803722.0, "step": 20882 }, { "epoch": 5.530720338983051, "grad_norm": 2.578752040863037, "learning_rate": 7.234772245762712e-06, "loss": 1.5788, "mean_token_accuracy": 0.6410973817110062, "num_tokens": 16805173.0, "step": 20884 }, { "epoch": 5.53125, "grad_norm": 2.2603704929351807, "learning_rate": 7.234507415254238e-06, "loss": 1.3073, "mean_token_accuracy": 0.6844790652394295, "num_tokens": 16806978.0, "step": 20886 }, { "epoch": 5.531779661016949, "grad_norm": 2.3829660415649414, "learning_rate": 7.234242584745763e-06, "loss": 1.2334, "mean_token_accuracy": 0.7079079747200012, "num_tokens": 16808680.0, "step": 20888 }, { "epoch": 5.532309322033898, "grad_norm": 2.4725708961486816, "learning_rate": 7.233977754237289e-06, "loss": 1.4057, "mean_token_accuracy": 0.6935820542275906, "num_tokens": 16810345.0, "step": 20890 }, { "epoch": 5.532838983050848, "grad_norm": 2.1653759479522705, "learning_rate": 7.233712923728814e-06, "loss": 0.906, "mean_token_accuracy": 0.7772476077079773, "num_tokens": 16811931.0, "step": 20892 }, { "epoch": 5.533368644067797, "grad_norm": 2.3251397609710693, "learning_rate": 7.23344809322034e-06, "loss": 0.945, "mean_token_accuracy": 0.7684296071529388, "num_tokens": 16814501.0, "step": 20894 }, { "epoch": 5.533898305084746, "grad_norm": 2.155698537826538, "learning_rate": 7.233183262711864e-06, "loss": 1.1438, "mean_token_accuracy": 0.7338792979717255, "num_tokens": 16815767.0, "step": 20896 }, { "epoch": 5.534427966101695, "grad_norm": 2.5008816719055176, "learning_rate": 7.232918432203391e-06, "loss": 0.9752, "mean_token_accuracy": 0.7675952017307281, "num_tokens": 16817213.0, "step": 20898 }, { "epoch": 5.534957627118644, "grad_norm": 2.2515010833740234, "learning_rate": 7.232653601694916e-06, "loss": 1.0566, "mean_token_accuracy": 0.7310512438416481, "num_tokens": 16818650.0, "step": 20900 }, { "epoch": 5.535487288135593, "grad_norm": 2.2935569286346436, "learning_rate": 7.232388771186442e-06, "loss": 0.9583, "mean_token_accuracy": 0.7701913267374039, "num_tokens": 16820254.0, "step": 20902 }, { "epoch": 5.536016949152542, "grad_norm": 2.023516893386841, "learning_rate": 7.232123940677967e-06, "loss": 0.9086, "mean_token_accuracy": 0.7861398458480835, "num_tokens": 16821768.0, "step": 20904 }, { "epoch": 5.536546610169491, "grad_norm": 1.963831901550293, "learning_rate": 7.231859110169492e-06, "loss": 1.5602, "mean_token_accuracy": 0.6707828119397163, "num_tokens": 16823332.0, "step": 20906 }, { "epoch": 5.53707627118644, "grad_norm": 1.954052448272705, "learning_rate": 7.231594279661017e-06, "loss": 1.0347, "mean_token_accuracy": 0.767696239054203, "num_tokens": 16824665.0, "step": 20908 }, { "epoch": 5.53760593220339, "grad_norm": 2.3857927322387695, "learning_rate": 7.231329449152543e-06, "loss": 1.2965, "mean_token_accuracy": 0.7002133727073669, "num_tokens": 16826302.0, "step": 20910 }, { "epoch": 5.538135593220339, "grad_norm": 2.155677080154419, "learning_rate": 7.231064618644068e-06, "loss": 1.3834, "mean_token_accuracy": 0.6756806522607803, "num_tokens": 16827859.0, "step": 20912 }, { "epoch": 5.538665254237288, "grad_norm": 2.850654125213623, "learning_rate": 7.230799788135594e-06, "loss": 1.2181, "mean_token_accuracy": 0.7344706058502197, "num_tokens": 16829399.0, "step": 20914 }, { "epoch": 5.539194915254237, "grad_norm": 2.336298704147339, "learning_rate": 7.230534957627119e-06, "loss": 1.2766, "mean_token_accuracy": 0.6867895573377609, "num_tokens": 16831066.0, "step": 20916 }, { "epoch": 5.539724576271187, "grad_norm": 2.3116629123687744, "learning_rate": 7.2302701271186445e-06, "loss": 1.2411, "mean_token_accuracy": 0.7096700370311737, "num_tokens": 16832723.0, "step": 20918 }, { "epoch": 5.540254237288136, "grad_norm": 2.0318892002105713, "learning_rate": 7.2300052966101695e-06, "loss": 0.7996, "mean_token_accuracy": 0.7807902470231056, "num_tokens": 16834315.0, "step": 20920 }, { "epoch": 5.540783898305085, "grad_norm": 2.1274797916412354, "learning_rate": 7.229740466101696e-06, "loss": 1.5571, "mean_token_accuracy": 0.644388236105442, "num_tokens": 16835965.0, "step": 20922 }, { "epoch": 5.541313559322034, "grad_norm": 2.6638095378875732, "learning_rate": 7.22947563559322e-06, "loss": 1.1019, "mean_token_accuracy": 0.7578825652599335, "num_tokens": 16837382.0, "step": 20924 }, { "epoch": 5.541843220338983, "grad_norm": 2.0934104919433594, "learning_rate": 7.229210805084747e-06, "loss": 0.9632, "mean_token_accuracy": 0.7622932493686676, "num_tokens": 16839027.0, "step": 20926 }, { "epoch": 5.5423728813559325, "grad_norm": 2.8924059867858887, "learning_rate": 7.228945974576272e-06, "loss": 1.3961, "mean_token_accuracy": 0.6891991719603539, "num_tokens": 16840509.0, "step": 20928 }, { "epoch": 5.5429025423728815, "grad_norm": 2.448249578475952, "learning_rate": 7.2286811440677975e-06, "loss": 1.0444, "mean_token_accuracy": 0.7472534626722336, "num_tokens": 16841956.0, "step": 20930 }, { "epoch": 5.5434322033898304, "grad_norm": 2.078444719314575, "learning_rate": 7.2284163135593224e-06, "loss": 0.9358, "mean_token_accuracy": 0.7650540396571159, "num_tokens": 16843574.0, "step": 20932 }, { "epoch": 5.543961864406779, "grad_norm": 1.914650797843933, "learning_rate": 7.228151483050848e-06, "loss": 1.2569, "mean_token_accuracy": 0.731216162443161, "num_tokens": 16845098.0, "step": 20934 }, { "epoch": 5.544491525423728, "grad_norm": 2.402777671813965, "learning_rate": 7.227886652542373e-06, "loss": 1.0416, "mean_token_accuracy": 0.7461813241243362, "num_tokens": 16846471.0, "step": 20936 }, { "epoch": 5.545021186440678, "grad_norm": 2.519252061843872, "learning_rate": 7.227621822033899e-06, "loss": 1.2871, "mean_token_accuracy": 0.7181105092167854, "num_tokens": 16848002.0, "step": 20938 }, { "epoch": 5.545550847457627, "grad_norm": 2.2362852096557617, "learning_rate": 7.227356991525424e-06, "loss": 0.8536, "mean_token_accuracy": 0.7753078490495682, "num_tokens": 16849413.0, "step": 20940 }, { "epoch": 5.546080508474576, "grad_norm": 2.4315571784973145, "learning_rate": 7.22709216101695e-06, "loss": 1.3197, "mean_token_accuracy": 0.6945287883281708, "num_tokens": 16850945.0, "step": 20942 }, { "epoch": 5.546610169491525, "grad_norm": 2.0969274044036865, "learning_rate": 7.2268273305084745e-06, "loss": 1.2071, "mean_token_accuracy": 0.7361594438552856, "num_tokens": 16852535.0, "step": 20944 }, { "epoch": 5.547139830508475, "grad_norm": 1.9598908424377441, "learning_rate": 7.2265625e-06, "loss": 1.072, "mean_token_accuracy": 0.7186509221792221, "num_tokens": 16854161.0, "step": 20946 }, { "epoch": 5.547669491525424, "grad_norm": 2.6695032119750977, "learning_rate": 7.226297669491527e-06, "loss": 0.9784, "mean_token_accuracy": 0.7622647657990456, "num_tokens": 16855738.0, "step": 20948 }, { "epoch": 5.548199152542373, "grad_norm": 2.6933679580688477, "learning_rate": 7.226032838983051e-06, "loss": 1.0115, "mean_token_accuracy": 0.7702633142471313, "num_tokens": 16857294.0, "step": 20950 }, { "epoch": 5.548728813559322, "grad_norm": 2.2655959129333496, "learning_rate": 7.225768008474578e-06, "loss": 1.4397, "mean_token_accuracy": 0.6838752701878548, "num_tokens": 16858693.0, "step": 20952 }, { "epoch": 5.549258474576272, "grad_norm": 1.7766880989074707, "learning_rate": 7.225503177966103e-06, "loss": 1.0263, "mean_token_accuracy": 0.751365102827549, "num_tokens": 16860495.0, "step": 20954 }, { "epoch": 5.549788135593221, "grad_norm": 1.8210715055465698, "learning_rate": 7.225238347457628e-06, "loss": 0.9578, "mean_token_accuracy": 0.7723656222224236, "num_tokens": 16861936.0, "step": 20956 }, { "epoch": 5.5503177966101696, "grad_norm": 2.3759608268737793, "learning_rate": 7.224973516949153e-06, "loss": 1.542, "mean_token_accuracy": 0.6576103866100311, "num_tokens": 16863802.0, "step": 20958 }, { "epoch": 5.5508474576271185, "grad_norm": 2.042372941970825, "learning_rate": 7.224708686440679e-06, "loss": 1.1704, "mean_token_accuracy": 0.7194869369268417, "num_tokens": 16865245.0, "step": 20960 }, { "epoch": 5.5513771186440675, "grad_norm": 2.1254026889801025, "learning_rate": 7.224443855932204e-06, "loss": 1.3987, "mean_token_accuracy": 0.6886944249272346, "num_tokens": 16866748.0, "step": 20962 }, { "epoch": 5.551906779661017, "grad_norm": 2.236417055130005, "learning_rate": 7.22417902542373e-06, "loss": 1.0579, "mean_token_accuracy": 0.7323341518640518, "num_tokens": 16868550.0, "step": 20964 }, { "epoch": 5.552436440677966, "grad_norm": 1.9893901348114014, "learning_rate": 7.223914194915255e-06, "loss": 1.0594, "mean_token_accuracy": 0.7503474205732346, "num_tokens": 16869980.0, "step": 20966 }, { "epoch": 5.552966101694915, "grad_norm": 2.0060436725616455, "learning_rate": 7.2236493644067805e-06, "loss": 0.9585, "mean_token_accuracy": 0.7798751518130302, "num_tokens": 16871478.0, "step": 20968 }, { "epoch": 5.553495762711864, "grad_norm": 2.5835118293762207, "learning_rate": 7.223384533898305e-06, "loss": 1.477, "mean_token_accuracy": 0.6649125404655933, "num_tokens": 16872964.0, "step": 20970 }, { "epoch": 5.554025423728813, "grad_norm": 2.185173749923706, "learning_rate": 7.223119703389831e-06, "loss": 1.3253, "mean_token_accuracy": 0.7111420035362244, "num_tokens": 16874736.0, "step": 20972 }, { "epoch": 5.554555084745763, "grad_norm": 2.91804838180542, "learning_rate": 7.222854872881356e-06, "loss": 1.5605, "mean_token_accuracy": 0.6678318157792091, "num_tokens": 16876032.0, "step": 20974 }, { "epoch": 5.555084745762712, "grad_norm": 2.1701266765594482, "learning_rate": 7.222590042372883e-06, "loss": 1.455, "mean_token_accuracy": 0.6674831360578537, "num_tokens": 16877675.0, "step": 20976 }, { "epoch": 5.555614406779661, "grad_norm": 1.965697169303894, "learning_rate": 7.222325211864407e-06, "loss": 1.0814, "mean_token_accuracy": 0.7427324131131172, "num_tokens": 16879319.0, "step": 20978 }, { "epoch": 5.55614406779661, "grad_norm": 1.9980733394622803, "learning_rate": 7.2220603813559334e-06, "loss": 0.9079, "mean_token_accuracy": 0.7615073025226593, "num_tokens": 16881075.0, "step": 20980 }, { "epoch": 5.55667372881356, "grad_norm": 2.5870144367218018, "learning_rate": 7.221795550847458e-06, "loss": 1.6524, "mean_token_accuracy": 0.6846399530768394, "num_tokens": 16882725.0, "step": 20982 }, { "epoch": 5.557203389830509, "grad_norm": 2.1765787601470947, "learning_rate": 7.221530720338984e-06, "loss": 1.6777, "mean_token_accuracy": 0.6525142043828964, "num_tokens": 16884274.0, "step": 20984 }, { "epoch": 5.557733050847458, "grad_norm": 2.5220937728881836, "learning_rate": 7.221265889830509e-06, "loss": 1.5645, "mean_token_accuracy": 0.6511016711592674, "num_tokens": 16885642.0, "step": 20986 }, { "epoch": 5.558262711864407, "grad_norm": 2.3449137210845947, "learning_rate": 7.221001059322035e-06, "loss": 1.0235, "mean_token_accuracy": 0.7265091687440872, "num_tokens": 16887040.0, "step": 20988 }, { "epoch": 5.558792372881356, "grad_norm": 2.214247703552246, "learning_rate": 7.22073622881356e-06, "loss": 1.8318, "mean_token_accuracy": 0.6125221215188503, "num_tokens": 16888804.0, "step": 20990 }, { "epoch": 5.559322033898305, "grad_norm": 2.2263383865356445, "learning_rate": 7.2204713983050855e-06, "loss": 1.1125, "mean_token_accuracy": 0.7291310131549835, "num_tokens": 16890430.0, "step": 20992 }, { "epoch": 5.559851694915254, "grad_norm": 2.050830841064453, "learning_rate": 7.2202065677966105e-06, "loss": 0.7159, "mean_token_accuracy": 0.8030832037329674, "num_tokens": 16891755.0, "step": 20994 }, { "epoch": 5.560381355932203, "grad_norm": 2.162041425704956, "learning_rate": 7.219941737288136e-06, "loss": 1.2279, "mean_token_accuracy": 0.7086667343974113, "num_tokens": 16893476.0, "step": 20996 }, { "epoch": 5.560911016949152, "grad_norm": 2.2519915103912354, "learning_rate": 7.219676906779661e-06, "loss": 1.1283, "mean_token_accuracy": 0.7234947681427002, "num_tokens": 16895006.0, "step": 20998 }, { "epoch": 5.561440677966102, "grad_norm": 1.7347749471664429, "learning_rate": 7.219412076271187e-06, "loss": 0.9118, "step": 21000 }, { "epoch": 5.561440677966102, "eval_loss": 1.3172645568847656, "eval_mean_token_accuracy": 0.700793400600359, "eval_num_tokens": 16896599.0, "eval_runtime": 48.2553, "eval_samples_per_second": 6.383, "eval_steps_per_second": 6.383, "step": 21000 }, { "epoch": 5.561970338983051, "grad_norm": 2.5141470432281494, "learning_rate": 7.219147245762712e-06, "loss": 1.3087, "mean_token_accuracy": 0.7323395609855652, "num_tokens": 16898225.0, "step": 21002 }, { "epoch": 5.5625, "grad_norm": 1.8625234365463257, "learning_rate": 7.218882415254238e-06, "loss": 0.8856, "mean_token_accuracy": 0.7790225222706795, "num_tokens": 16899888.0, "step": 21004 }, { "epoch": 5.563029661016949, "grad_norm": 2.4311940670013428, "learning_rate": 7.218617584745763e-06, "loss": 1.5359, "mean_token_accuracy": 0.6589605063199997, "num_tokens": 16901504.0, "step": 21006 }, { "epoch": 5.563559322033898, "grad_norm": 2.0134804248809814, "learning_rate": 7.218352754237289e-06, "loss": 1.0858, "mean_token_accuracy": 0.7579639032483101, "num_tokens": 16903060.0, "step": 21008 }, { "epoch": 5.564088983050848, "grad_norm": 2.4437906742095947, "learning_rate": 7.218087923728814e-06, "loss": 1.1711, "mean_token_accuracy": 0.7425892874598503, "num_tokens": 16904683.0, "step": 21010 }, { "epoch": 5.564618644067797, "grad_norm": 1.8041349649429321, "learning_rate": 7.21782309322034e-06, "loss": 0.8675, "mean_token_accuracy": 0.7717098444700241, "num_tokens": 16906499.0, "step": 21012 }, { "epoch": 5.565148305084746, "grad_norm": 2.057478427886963, "learning_rate": 7.217558262711865e-06, "loss": 1.0903, "mean_token_accuracy": 0.7350077629089355, "num_tokens": 16908395.0, "step": 21014 }, { "epoch": 5.565677966101695, "grad_norm": 2.4208502769470215, "learning_rate": 7.217293432203391e-06, "loss": 1.292, "mean_token_accuracy": 0.7027581036090851, "num_tokens": 16910091.0, "step": 21016 }, { "epoch": 5.566207627118644, "grad_norm": 2.1469485759735107, "learning_rate": 7.2170286016949156e-06, "loss": 1.1166, "mean_token_accuracy": 0.7195527255535126, "num_tokens": 16911435.0, "step": 21018 }, { "epoch": 5.566737288135593, "grad_norm": 2.1457934379577637, "learning_rate": 7.216763771186441e-06, "loss": 1.4156, "mean_token_accuracy": 0.6605450659990311, "num_tokens": 16912865.0, "step": 21020 }, { "epoch": 5.567266949152542, "grad_norm": 2.6000449657440186, "learning_rate": 7.216498940677966e-06, "loss": 1.0918, "mean_token_accuracy": 0.7455973103642464, "num_tokens": 16914527.0, "step": 21022 }, { "epoch": 5.567796610169491, "grad_norm": 2.1572108268737793, "learning_rate": 7.216234110169492e-06, "loss": 1.3747, "mean_token_accuracy": 0.6707174703478813, "num_tokens": 16916260.0, "step": 21024 }, { "epoch": 5.56832627118644, "grad_norm": 2.526632070541382, "learning_rate": 7.215969279661017e-06, "loss": 1.2091, "mean_token_accuracy": 0.7085998132824898, "num_tokens": 16917721.0, "step": 21026 }, { "epoch": 5.56885593220339, "grad_norm": 2.1875574588775635, "learning_rate": 7.215704449152543e-06, "loss": 1.2324, "mean_token_accuracy": 0.7041555568575859, "num_tokens": 16919370.0, "step": 21028 }, { "epoch": 5.569385593220339, "grad_norm": 2.7645976543426514, "learning_rate": 7.215439618644068e-06, "loss": 1.0514, "mean_token_accuracy": 0.7416963577270508, "num_tokens": 16920935.0, "step": 21030 }, { "epoch": 5.569915254237288, "grad_norm": 1.6294636726379395, "learning_rate": 7.2151747881355935e-06, "loss": 0.9209, "mean_token_accuracy": 0.763018935918808, "num_tokens": 16922360.0, "step": 21032 }, { "epoch": 5.570444915254237, "grad_norm": 2.5608644485473633, "learning_rate": 7.21490995762712e-06, "loss": 1.5886, "mean_token_accuracy": 0.645286962389946, "num_tokens": 16923965.0, "step": 21034 }, { "epoch": 5.570974576271187, "grad_norm": 2.015197992324829, "learning_rate": 7.214645127118645e-06, "loss": 0.9806, "mean_token_accuracy": 0.7521624565124512, "num_tokens": 16925452.0, "step": 21036 }, { "epoch": 5.571504237288136, "grad_norm": 1.92123544216156, "learning_rate": 7.214380296610171e-06, "loss": 1.0429, "mean_token_accuracy": 0.754909560084343, "num_tokens": 16927389.0, "step": 21038 }, { "epoch": 5.572033898305085, "grad_norm": 1.6046459674835205, "learning_rate": 7.214115466101696e-06, "loss": 1.0517, "mean_token_accuracy": 0.7572695910930634, "num_tokens": 16929243.0, "step": 21040 }, { "epoch": 5.572563559322034, "grad_norm": 2.1886422634124756, "learning_rate": 7.2138506355932215e-06, "loss": 1.6172, "mean_token_accuracy": 0.640655592083931, "num_tokens": 16930911.0, "step": 21042 }, { "epoch": 5.573093220338983, "grad_norm": 2.255249500274658, "learning_rate": 7.213585805084746e-06, "loss": 1.0315, "mean_token_accuracy": 0.7492527365684509, "num_tokens": 16932540.0, "step": 21044 }, { "epoch": 5.5736228813559325, "grad_norm": 2.243372678756714, "learning_rate": 7.213320974576272e-06, "loss": 1.1563, "mean_token_accuracy": 0.7350902408361435, "num_tokens": 16933954.0, "step": 21046 }, { "epoch": 5.5741525423728815, "grad_norm": 2.1539270877838135, "learning_rate": 7.213056144067797e-06, "loss": 1.1492, "mean_token_accuracy": 0.728570893406868, "num_tokens": 16936127.0, "step": 21048 }, { "epoch": 5.5746822033898304, "grad_norm": 2.854027032852173, "learning_rate": 7.212791313559323e-06, "loss": 1.2464, "mean_token_accuracy": 0.7211744338274002, "num_tokens": 16937379.0, "step": 21050 }, { "epoch": 5.575211864406779, "grad_norm": 2.2760860919952393, "learning_rate": 7.212526483050848e-06, "loss": 1.1033, "mean_token_accuracy": 0.727075956761837, "num_tokens": 16938869.0, "step": 21052 }, { "epoch": 5.575741525423728, "grad_norm": 2.028646945953369, "learning_rate": 7.212261652542374e-06, "loss": 1.5602, "mean_token_accuracy": 0.6619957610964775, "num_tokens": 16940681.0, "step": 21054 }, { "epoch": 5.576271186440678, "grad_norm": 2.3497700691223145, "learning_rate": 7.2119968220338985e-06, "loss": 1.2749, "mean_token_accuracy": 0.7281959168612957, "num_tokens": 16942043.0, "step": 21056 }, { "epoch": 5.576800847457627, "grad_norm": 2.3103854656219482, "learning_rate": 7.211731991525424e-06, "loss": 1.1205, "mean_token_accuracy": 0.7282119393348694, "num_tokens": 16943608.0, "step": 21058 }, { "epoch": 5.577330508474576, "grad_norm": 2.633681297302246, "learning_rate": 7.211467161016949e-06, "loss": 1.2084, "mean_token_accuracy": 0.7374311685562134, "num_tokens": 16945246.0, "step": 21060 }, { "epoch": 5.577860169491525, "grad_norm": 2.398007392883301, "learning_rate": 7.211202330508476e-06, "loss": 1.2586, "mean_token_accuracy": 0.7219731211662292, "num_tokens": 16946685.0, "step": 21062 }, { "epoch": 5.578389830508475, "grad_norm": 2.418597459793091, "learning_rate": 7.210937500000001e-06, "loss": 1.3652, "mean_token_accuracy": 0.6949001550674438, "num_tokens": 16948265.0, "step": 21064 }, { "epoch": 5.578919491525424, "grad_norm": 2.1151368618011475, "learning_rate": 7.2106726694915266e-06, "loss": 1.6176, "mean_token_accuracy": 0.6390082389116287, "num_tokens": 16949861.0, "step": 21066 }, { "epoch": 5.579449152542373, "grad_norm": 2.1598494052886963, "learning_rate": 7.2104078389830515e-06, "loss": 0.8558, "mean_token_accuracy": 0.7753361277282238, "num_tokens": 16951498.0, "step": 21068 }, { "epoch": 5.579978813559322, "grad_norm": 2.0531985759735107, "learning_rate": 7.210143008474577e-06, "loss": 1.2035, "mean_token_accuracy": 0.7006282210350037, "num_tokens": 16953324.0, "step": 21070 }, { "epoch": 5.580508474576272, "grad_norm": 2.6064627170562744, "learning_rate": 7.209878177966102e-06, "loss": 1.3768, "mean_token_accuracy": 0.7104769498109818, "num_tokens": 16954790.0, "step": 21072 }, { "epoch": 5.581038135593221, "grad_norm": 2.184521436691284, "learning_rate": 7.209613347457628e-06, "loss": 1.3277, "mean_token_accuracy": 0.7125556878745556, "num_tokens": 16956299.0, "step": 21074 }, { "epoch": 5.5815677966101696, "grad_norm": 2.5644307136535645, "learning_rate": 7.209348516949153e-06, "loss": 1.5066, "mean_token_accuracy": 0.6655248552560806, "num_tokens": 16958026.0, "step": 21076 }, { "epoch": 5.5820974576271185, "grad_norm": 1.8420528173446655, "learning_rate": 7.209083686440679e-06, "loss": 0.9268, "mean_token_accuracy": 0.7427747473120689, "num_tokens": 16959723.0, "step": 21078 }, { "epoch": 5.5826271186440675, "grad_norm": 2.5825581550598145, "learning_rate": 7.208818855932204e-06, "loss": 1.5185, "mean_token_accuracy": 0.6623985841870308, "num_tokens": 16961132.0, "step": 21080 }, { "epoch": 5.583156779661017, "grad_norm": 1.9232103824615479, "learning_rate": 7.208554025423729e-06, "loss": 1.2722, "mean_token_accuracy": 0.6992102637887001, "num_tokens": 16963036.0, "step": 21082 }, { "epoch": 5.583686440677966, "grad_norm": 2.6672849655151367, "learning_rate": 7.208289194915254e-06, "loss": 1.2537, "mean_token_accuracy": 0.7116900831460953, "num_tokens": 16964465.0, "step": 21084 }, { "epoch": 5.584216101694915, "grad_norm": 1.848363995552063, "learning_rate": 7.20802436440678e-06, "loss": 1.1192, "mean_token_accuracy": 0.7174076959490776, "num_tokens": 16966089.0, "step": 21086 }, { "epoch": 5.584745762711864, "grad_norm": 2.314516067504883, "learning_rate": 7.207759533898305e-06, "loss": 1.1368, "mean_token_accuracy": 0.7248854339122772, "num_tokens": 16967636.0, "step": 21088 }, { "epoch": 5.585275423728813, "grad_norm": 2.6493616104125977, "learning_rate": 7.207494703389832e-06, "loss": 1.7109, "mean_token_accuracy": 0.6220962032675743, "num_tokens": 16969273.0, "step": 21090 }, { "epoch": 5.585805084745763, "grad_norm": 2.0557138919830322, "learning_rate": 7.207229872881356e-06, "loss": 1.1324, "mean_token_accuracy": 0.71379554271698, "num_tokens": 16970969.0, "step": 21092 }, { "epoch": 5.586334745762712, "grad_norm": 2.215386152267456, "learning_rate": 7.206965042372882e-06, "loss": 1.5858, "mean_token_accuracy": 0.6740773990750313, "num_tokens": 16972819.0, "step": 21094 }, { "epoch": 5.586864406779661, "grad_norm": 1.8259153366088867, "learning_rate": 7.206700211864407e-06, "loss": 1.1794, "mean_token_accuracy": 0.7138200327754021, "num_tokens": 16974906.0, "step": 21096 }, { "epoch": 5.58739406779661, "grad_norm": 2.106851577758789, "learning_rate": 7.206435381355933e-06, "loss": 1.0098, "mean_token_accuracy": 0.7751251235604286, "num_tokens": 16976573.0, "step": 21098 }, { "epoch": 5.58792372881356, "grad_norm": 2.2599246501922607, "learning_rate": 7.206170550847458e-06, "loss": 1.2765, "mean_token_accuracy": 0.7126503065228462, "num_tokens": 16977945.0, "step": 21100 }, { "epoch": 5.588453389830509, "grad_norm": 2.168045997619629, "learning_rate": 7.205905720338984e-06, "loss": 1.1333, "mean_token_accuracy": 0.7310282960534096, "num_tokens": 16979710.0, "step": 21102 }, { "epoch": 5.588983050847458, "grad_norm": 1.887089729309082, "learning_rate": 7.205640889830509e-06, "loss": 1.0052, "mean_token_accuracy": 0.7412324696779251, "num_tokens": 16981590.0, "step": 21104 }, { "epoch": 5.589512711864407, "grad_norm": 2.2460055351257324, "learning_rate": 7.2053760593220345e-06, "loss": 1.2573, "mean_token_accuracy": 0.7018911764025688, "num_tokens": 16983205.0, "step": 21106 }, { "epoch": 5.590042372881356, "grad_norm": 2.2244722843170166, "learning_rate": 7.205111228813559e-06, "loss": 1.5238, "mean_token_accuracy": 0.6756496503949165, "num_tokens": 16984724.0, "step": 21108 }, { "epoch": 5.590572033898305, "grad_norm": 2.1672730445861816, "learning_rate": 7.204846398305085e-06, "loss": 0.9561, "mean_token_accuracy": 0.7752713784575462, "num_tokens": 16986330.0, "step": 21110 }, { "epoch": 5.591101694915254, "grad_norm": 2.4072556495666504, "learning_rate": 7.20458156779661e-06, "loss": 1.4734, "mean_token_accuracy": 0.6999585255980492, "num_tokens": 16987650.0, "step": 21112 }, { "epoch": 5.591631355932203, "grad_norm": 1.8801871538162231, "learning_rate": 7.204316737288136e-06, "loss": 0.9886, "mean_token_accuracy": 0.7558837234973907, "num_tokens": 16989341.0, "step": 21114 }, { "epoch": 5.592161016949152, "grad_norm": 2.243349552154541, "learning_rate": 7.2040519067796625e-06, "loss": 1.2748, "mean_token_accuracy": 0.7038437947630882, "num_tokens": 16990778.0, "step": 21116 }, { "epoch": 5.592690677966102, "grad_norm": 2.2735722064971924, "learning_rate": 7.2037870762711874e-06, "loss": 1.3008, "mean_token_accuracy": 0.7265843860805035, "num_tokens": 16992369.0, "step": 21118 }, { "epoch": 5.593220338983051, "grad_norm": 2.524372100830078, "learning_rate": 7.203522245762713e-06, "loss": 1.5238, "mean_token_accuracy": 0.6651896275579929, "num_tokens": 16993998.0, "step": 21120 }, { "epoch": 5.59375, "grad_norm": 2.481346845626831, "learning_rate": 7.203257415254238e-06, "loss": 1.1625, "mean_token_accuracy": 0.7303434386849403, "num_tokens": 16995332.0, "step": 21122 }, { "epoch": 5.594279661016949, "grad_norm": 2.108062982559204, "learning_rate": 7.202992584745764e-06, "loss": 1.0507, "mean_token_accuracy": 0.722443588078022, "num_tokens": 16996912.0, "step": 21124 }, { "epoch": 5.594809322033898, "grad_norm": 2.6628949642181396, "learning_rate": 7.202727754237289e-06, "loss": 1.3328, "mean_token_accuracy": 0.6972597613930702, "num_tokens": 16998332.0, "step": 21126 }, { "epoch": 5.595338983050848, "grad_norm": 2.6942367553710938, "learning_rate": 7.202462923728815e-06, "loss": 1.1743, "mean_token_accuracy": 0.7221026718616486, "num_tokens": 16999569.0, "step": 21128 }, { "epoch": 5.595868644067797, "grad_norm": 1.810892105102539, "learning_rate": 7.2021980932203395e-06, "loss": 1.0272, "mean_token_accuracy": 0.7464051842689514, "num_tokens": 17001214.0, "step": 21130 }, { "epoch": 5.596398305084746, "grad_norm": 2.186239004135132, "learning_rate": 7.201933262711865e-06, "loss": 1.1699, "mean_token_accuracy": 0.7206147462129593, "num_tokens": 17002764.0, "step": 21132 }, { "epoch": 5.596927966101695, "grad_norm": 2.264744758605957, "learning_rate": 7.20166843220339e-06, "loss": 1.2864, "mean_token_accuracy": 0.7099311500787735, "num_tokens": 17004276.0, "step": 21134 }, { "epoch": 5.597457627118644, "grad_norm": 2.345308303833008, "learning_rate": 7.201403601694916e-06, "loss": 1.3664, "mean_token_accuracy": 0.6949618980288506, "num_tokens": 17005776.0, "step": 21136 }, { "epoch": 5.597987288135593, "grad_norm": 1.8009980916976929, "learning_rate": 7.201138771186441e-06, "loss": 1.0423, "mean_token_accuracy": 0.7392814382910728, "num_tokens": 17007676.0, "step": 21138 }, { "epoch": 5.598516949152542, "grad_norm": 1.8266550302505493, "learning_rate": 7.200873940677967e-06, "loss": 1.2968, "mean_token_accuracy": 0.6933730468153954, "num_tokens": 17009328.0, "step": 21140 }, { "epoch": 5.599046610169491, "grad_norm": 2.401031017303467, "learning_rate": 7.200609110169492e-06, "loss": 1.5622, "mean_token_accuracy": 0.6634601578116417, "num_tokens": 17011141.0, "step": 21142 }, { "epoch": 5.59957627118644, "grad_norm": 2.347142219543457, "learning_rate": 7.200344279661018e-06, "loss": 1.0637, "mean_token_accuracy": 0.7539128810167313, "num_tokens": 17012655.0, "step": 21144 }, { "epoch": 5.60010593220339, "grad_norm": 2.4856503009796143, "learning_rate": 7.200079449152542e-06, "loss": 1.3671, "mean_token_accuracy": 0.6973808109760284, "num_tokens": 17014086.0, "step": 21146 }, { "epoch": 5.600635593220339, "grad_norm": 2.8002026081085205, "learning_rate": 7.199814618644069e-06, "loss": 1.2668, "mean_token_accuracy": 0.7089220434427261, "num_tokens": 17015500.0, "step": 21148 }, { "epoch": 5.601165254237288, "grad_norm": 2.1482250690460205, "learning_rate": 7.199549788135594e-06, "loss": 1.3377, "mean_token_accuracy": 0.690556637942791, "num_tokens": 17017230.0, "step": 21150 }, { "epoch": 5.601694915254237, "grad_norm": 1.751328706741333, "learning_rate": 7.19928495762712e-06, "loss": 1.0255, "mean_token_accuracy": 0.7506457790732384, "num_tokens": 17018837.0, "step": 21152 }, { "epoch": 5.602224576271187, "grad_norm": 2.12039852142334, "learning_rate": 7.199020127118645e-06, "loss": 1.149, "mean_token_accuracy": 0.7261555716395378, "num_tokens": 17020451.0, "step": 21154 }, { "epoch": 5.602754237288136, "grad_norm": 2.129426956176758, "learning_rate": 7.19875529661017e-06, "loss": 1.3024, "mean_token_accuracy": 0.7076135873794556, "num_tokens": 17022077.0, "step": 21156 }, { "epoch": 5.603283898305085, "grad_norm": 2.240424633026123, "learning_rate": 7.198490466101695e-06, "loss": 1.1804, "mean_token_accuracy": 0.7207952439785004, "num_tokens": 17023848.0, "step": 21158 }, { "epoch": 5.603813559322034, "grad_norm": 2.202141761779785, "learning_rate": 7.198225635593221e-06, "loss": 1.1024, "mean_token_accuracy": 0.7241703867912292, "num_tokens": 17025428.0, "step": 21160 }, { "epoch": 5.604343220338983, "grad_norm": 2.3033194541931152, "learning_rate": 7.197960805084746e-06, "loss": 1.1905, "mean_token_accuracy": 0.7192320451140404, "num_tokens": 17026853.0, "step": 21162 }, { "epoch": 5.6048728813559325, "grad_norm": 2.003722667694092, "learning_rate": 7.197695974576272e-06, "loss": 1.1136, "mean_token_accuracy": 0.7406111732125282, "num_tokens": 17028502.0, "step": 21164 }, { "epoch": 5.6054025423728815, "grad_norm": 1.9096473455429077, "learning_rate": 7.197431144067797e-06, "loss": 1.545, "mean_token_accuracy": 0.6375784799456596, "num_tokens": 17030308.0, "step": 21166 }, { "epoch": 5.6059322033898304, "grad_norm": 2.8001980781555176, "learning_rate": 7.1971663135593225e-06, "loss": 1.2106, "mean_token_accuracy": 0.7314635068178177, "num_tokens": 17031731.0, "step": 21168 }, { "epoch": 5.606461864406779, "grad_norm": 2.0520284175872803, "learning_rate": 7.1969014830508474e-06, "loss": 0.8074, "mean_token_accuracy": 0.7937732040882111, "num_tokens": 17033269.0, "step": 21170 }, { "epoch": 5.606991525423728, "grad_norm": 2.3350439071655273, "learning_rate": 7.196636652542374e-06, "loss": 1.5418, "mean_token_accuracy": 0.6597325503826141, "num_tokens": 17034840.0, "step": 21172 }, { "epoch": 5.607521186440678, "grad_norm": 2.216546058654785, "learning_rate": 7.196371822033898e-06, "loss": 1.174, "mean_token_accuracy": 0.7074642032384872, "num_tokens": 17036708.0, "step": 21174 }, { "epoch": 5.608050847457627, "grad_norm": 2.4569075107574463, "learning_rate": 7.196106991525425e-06, "loss": 1.8077, "mean_token_accuracy": 0.6256536170840263, "num_tokens": 17038399.0, "step": 21176 }, { "epoch": 5.608580508474576, "grad_norm": 2.0994508266448975, "learning_rate": 7.19584216101695e-06, "loss": 1.4107, "mean_token_accuracy": 0.689593143761158, "num_tokens": 17040123.0, "step": 21178 }, { "epoch": 5.609110169491525, "grad_norm": 2.8614392280578613, "learning_rate": 7.1955773305084755e-06, "loss": 1.5606, "mean_token_accuracy": 0.6415089517831802, "num_tokens": 17041834.0, "step": 21180 }, { "epoch": 5.609639830508475, "grad_norm": 2.108158588409424, "learning_rate": 7.1953125e-06, "loss": 1.0072, "mean_token_accuracy": 0.7553595677018166, "num_tokens": 17043295.0, "step": 21182 }, { "epoch": 5.610169491525424, "grad_norm": 2.215543270111084, "learning_rate": 7.195047669491526e-06, "loss": 1.1485, "mean_token_accuracy": 0.6861468628048897, "num_tokens": 17044822.0, "step": 21184 }, { "epoch": 5.610699152542373, "grad_norm": 2.6653201580047607, "learning_rate": 7.194782838983051e-06, "loss": 1.1656, "mean_token_accuracy": 0.7494561076164246, "num_tokens": 17046122.0, "step": 21186 }, { "epoch": 5.611228813559322, "grad_norm": 1.9773433208465576, "learning_rate": 7.194518008474577e-06, "loss": 0.8855, "mean_token_accuracy": 0.7757035419344902, "num_tokens": 17047484.0, "step": 21188 }, { "epoch": 5.611758474576272, "grad_norm": 2.1094894409179688, "learning_rate": 7.194253177966102e-06, "loss": 1.3907, "mean_token_accuracy": 0.6685643121600151, "num_tokens": 17048970.0, "step": 21190 }, { "epoch": 5.612288135593221, "grad_norm": 1.8911141157150269, "learning_rate": 7.193988347457628e-06, "loss": 1.1166, "mean_token_accuracy": 0.7359630987048149, "num_tokens": 17050578.0, "step": 21192 }, { "epoch": 5.6128177966101696, "grad_norm": 2.0545761585235596, "learning_rate": 7.1937235169491525e-06, "loss": 1.5684, "mean_token_accuracy": 0.6548507288098335, "num_tokens": 17052083.0, "step": 21194 }, { "epoch": 5.6133474576271185, "grad_norm": 2.178860902786255, "learning_rate": 7.193458686440678e-06, "loss": 1.1224, "mean_token_accuracy": 0.7259548306465149, "num_tokens": 17053589.0, "step": 21196 }, { "epoch": 5.6138771186440675, "grad_norm": 2.408867835998535, "learning_rate": 7.193193855932203e-06, "loss": 1.887, "mean_token_accuracy": 0.6031320616602898, "num_tokens": 17055145.0, "step": 21198 }, { "epoch": 5.614406779661017, "grad_norm": 2.7860267162323, "learning_rate": 7.192929025423729e-06, "loss": 1.2509, "mean_token_accuracy": 0.7151239663362503, "num_tokens": 17056496.0, "step": 21200 }, { "epoch": 5.614936440677966, "grad_norm": 3.4438424110412598, "learning_rate": 7.192664194915256e-06, "loss": 1.0354, "mean_token_accuracy": 0.7428209558129311, "num_tokens": 17058030.0, "step": 21202 }, { "epoch": 5.615466101694915, "grad_norm": 2.672436475753784, "learning_rate": 7.1923993644067806e-06, "loss": 1.6617, "mean_token_accuracy": 0.6427481546998024, "num_tokens": 17059522.0, "step": 21204 }, { "epoch": 5.615995762711864, "grad_norm": 2.2125585079193115, "learning_rate": 7.192134533898306e-06, "loss": 1.1991, "mean_token_accuracy": 0.7096557915210724, "num_tokens": 17061068.0, "step": 21206 }, { "epoch": 5.616525423728813, "grad_norm": 2.5361404418945312, "learning_rate": 7.191869703389831e-06, "loss": 1.2303, "mean_token_accuracy": 0.7196404412388802, "num_tokens": 17062374.0, "step": 21208 }, { "epoch": 5.617055084745763, "grad_norm": 2.145406484603882, "learning_rate": 7.191604872881357e-06, "loss": 1.1543, "mean_token_accuracy": 0.7399949803948402, "num_tokens": 17064057.0, "step": 21210 }, { "epoch": 5.617584745762712, "grad_norm": 2.742955446243286, "learning_rate": 7.191340042372882e-06, "loss": 1.0854, "mean_token_accuracy": 0.7472145110368729, "num_tokens": 17065467.0, "step": 21212 }, { "epoch": 5.618114406779661, "grad_norm": 2.1215708255767822, "learning_rate": 7.191075211864408e-06, "loss": 1.378, "mean_token_accuracy": 0.6906843408942223, "num_tokens": 17066931.0, "step": 21214 }, { "epoch": 5.61864406779661, "grad_norm": 2.2235124111175537, "learning_rate": 7.190810381355933e-06, "loss": 1.2116, "mean_token_accuracy": 0.724060945212841, "num_tokens": 17068210.0, "step": 21216 }, { "epoch": 5.61917372881356, "grad_norm": 2.176575183868408, "learning_rate": 7.1905455508474584e-06, "loss": 1.1831, "mean_token_accuracy": 0.7390321791172028, "num_tokens": 17069675.0, "step": 21218 }, { "epoch": 5.619703389830509, "grad_norm": 1.584731936454773, "learning_rate": 7.190280720338983e-06, "loss": 1.0407, "mean_token_accuracy": 0.7493632659316063, "num_tokens": 17071336.0, "step": 21220 }, { "epoch": 5.620233050847458, "grad_norm": 2.7249844074249268, "learning_rate": 7.190015889830509e-06, "loss": 1.2065, "mean_token_accuracy": 0.7293318435549736, "num_tokens": 17072862.0, "step": 21222 }, { "epoch": 5.620762711864407, "grad_norm": 2.2028543949127197, "learning_rate": 7.189751059322034e-06, "loss": 1.0465, "mean_token_accuracy": 0.7489755377173424, "num_tokens": 17074305.0, "step": 21224 }, { "epoch": 5.621292372881356, "grad_norm": 1.227028727531433, "learning_rate": 7.189486228813561e-06, "loss": 1.5763, "mean_token_accuracy": 0.6737675312906504, "num_tokens": 17076714.0, "step": 21226 }, { "epoch": 5.621822033898305, "grad_norm": 2.5531275272369385, "learning_rate": 7.189221398305085e-06, "loss": 1.3345, "mean_token_accuracy": 0.6839172393083572, "num_tokens": 17078276.0, "step": 21228 }, { "epoch": 5.622351694915254, "grad_norm": 1.9294817447662354, "learning_rate": 7.188956567796611e-06, "loss": 0.8539, "mean_token_accuracy": 0.77800652384758, "num_tokens": 17079913.0, "step": 21230 }, { "epoch": 5.622881355932203, "grad_norm": 2.537360191345215, "learning_rate": 7.188691737288136e-06, "loss": 1.8116, "mean_token_accuracy": 0.5991297289729118, "num_tokens": 17081578.0, "step": 21232 }, { "epoch": 5.623411016949152, "grad_norm": 2.513397455215454, "learning_rate": 7.188426906779662e-06, "loss": 1.5348, "mean_token_accuracy": 0.6768400743603706, "num_tokens": 17083106.0, "step": 21234 }, { "epoch": 5.623940677966102, "grad_norm": 2.2468488216400146, "learning_rate": 7.188162076271187e-06, "loss": 1.315, "mean_token_accuracy": 0.7053296566009521, "num_tokens": 17084760.0, "step": 21236 }, { "epoch": 5.624470338983051, "grad_norm": 2.5385637283325195, "learning_rate": 7.187897245762713e-06, "loss": 1.2594, "mean_token_accuracy": 0.7221605256199837, "num_tokens": 17086114.0, "step": 21238 }, { "epoch": 5.625, "grad_norm": 2.38754940032959, "learning_rate": 7.187632415254238e-06, "loss": 1.4393, "mean_token_accuracy": 0.6750876754522324, "num_tokens": 17087610.0, "step": 21240 }, { "epoch": 5.625529661016949, "grad_norm": 2.2998905181884766, "learning_rate": 7.1873675847457635e-06, "loss": 1.2854, "mean_token_accuracy": 0.7017776221036911, "num_tokens": 17089071.0, "step": 21242 }, { "epoch": 5.626059322033898, "grad_norm": 2.2149064540863037, "learning_rate": 7.1871027542372885e-06, "loss": 1.4893, "mean_token_accuracy": 0.7000621668994427, "num_tokens": 17090686.0, "step": 21244 }, { "epoch": 5.626588983050848, "grad_norm": 2.100350856781006, "learning_rate": 7.186837923728814e-06, "loss": 1.0498, "mean_token_accuracy": 0.746194913983345, "num_tokens": 17092377.0, "step": 21246 }, { "epoch": 5.627118644067797, "grad_norm": 2.7721810340881348, "learning_rate": 7.186573093220339e-06, "loss": 0.9792, "mean_token_accuracy": 0.7640786916017532, "num_tokens": 17093843.0, "step": 21248 }, { "epoch": 5.627648305084746, "grad_norm": 2.554832696914673, "learning_rate": 7.186308262711865e-06, "loss": 1.5833, "step": 21250 }, { "epoch": 5.627648305084746, "eval_loss": 1.3182361125946045, "eval_mean_token_accuracy": 0.7013099916376077, "eval_num_tokens": 17096020.0, "eval_runtime": 48.336, "eval_samples_per_second": 6.372, "eval_steps_per_second": 6.372, "step": 21250 }, { "epoch": 5.628177966101695, "grad_norm": 1.5342381000518799, "learning_rate": 7.18604343220339e-06, "loss": 0.9594, "mean_token_accuracy": 0.7435851879417896, "num_tokens": 17098502.0, "step": 21252 }, { "epoch": 5.628707627118644, "grad_norm": 2.428359031677246, "learning_rate": 7.185778601694916e-06, "loss": 1.1058, "mean_token_accuracy": 0.7576902732253075, "num_tokens": 17100016.0, "step": 21254 }, { "epoch": 5.629237288135593, "grad_norm": 2.6514837741851807, "learning_rate": 7.1855137711864406e-06, "loss": 1.0288, "mean_token_accuracy": 0.7601510584354401, "num_tokens": 17101484.0, "step": 21256 }, { "epoch": 5.629766949152542, "grad_norm": 1.7339458465576172, "learning_rate": 7.185248940677967e-06, "loss": 1.2081, "mean_token_accuracy": 0.7206867709755898, "num_tokens": 17103409.0, "step": 21258 }, { "epoch": 5.630296610169491, "grad_norm": 1.849376916885376, "learning_rate": 7.184984110169492e-06, "loss": 0.7864, "mean_token_accuracy": 0.791553296148777, "num_tokens": 17104927.0, "step": 21260 }, { "epoch": 5.63082627118644, "grad_norm": 2.3809802532196045, "learning_rate": 7.184719279661018e-06, "loss": 1.3681, "mean_token_accuracy": 0.6769296079874039, "num_tokens": 17106370.0, "step": 21262 }, { "epoch": 5.63135593220339, "grad_norm": 2.156245231628418, "learning_rate": 7.184454449152543e-06, "loss": 1.4102, "mean_token_accuracy": 0.668467327952385, "num_tokens": 17107960.0, "step": 21264 }, { "epoch": 5.631885593220339, "grad_norm": 2.374896287918091, "learning_rate": 7.184189618644069e-06, "loss": 1.3382, "mean_token_accuracy": 0.6986421421170235, "num_tokens": 17109310.0, "step": 21266 }, { "epoch": 5.632415254237288, "grad_norm": 1.9119285345077515, "learning_rate": 7.1839247881355935e-06, "loss": 1.2098, "mean_token_accuracy": 0.71485435962677, "num_tokens": 17111070.0, "step": 21268 }, { "epoch": 5.632944915254237, "grad_norm": 2.5308616161346436, "learning_rate": 7.183659957627119e-06, "loss": 1.3582, "mean_token_accuracy": 0.7077386416494846, "num_tokens": 17112555.0, "step": 21270 }, { "epoch": 5.633474576271187, "grad_norm": 3.2490885257720947, "learning_rate": 7.183395127118644e-06, "loss": 1.7052, "mean_token_accuracy": 0.6338096596300602, "num_tokens": 17115038.0, "step": 21272 }, { "epoch": 5.634004237288136, "grad_norm": 2.1396710872650146, "learning_rate": 7.18313029661017e-06, "loss": 1.2588, "mean_token_accuracy": 0.6566445305943489, "num_tokens": 17117393.0, "step": 21274 }, { "epoch": 5.634533898305085, "grad_norm": 2.1177444458007812, "learning_rate": 7.182865466101695e-06, "loss": 1.2107, "mean_token_accuracy": 0.7452651560306549, "num_tokens": 17118914.0, "step": 21276 }, { "epoch": 5.635063559322034, "grad_norm": 2.3292722702026367, "learning_rate": 7.182600635593221e-06, "loss": 1.6261, "mean_token_accuracy": 0.6494682282209396, "num_tokens": 17120418.0, "step": 21278 }, { "epoch": 5.635593220338983, "grad_norm": 2.5781748294830322, "learning_rate": 7.182335805084746e-06, "loss": 1.2623, "mean_token_accuracy": 0.6924845576286316, "num_tokens": 17122034.0, "step": 21280 }, { "epoch": 5.6361228813559325, "grad_norm": 2.1267220973968506, "learning_rate": 7.1820709745762714e-06, "loss": 0.9141, "mean_token_accuracy": 0.7790780737996101, "num_tokens": 17123539.0, "step": 21282 }, { "epoch": 5.6366525423728815, "grad_norm": 2.7964048385620117, "learning_rate": 7.181806144067798e-06, "loss": 1.1766, "mean_token_accuracy": 0.7339888587594032, "num_tokens": 17125070.0, "step": 21284 }, { "epoch": 5.6371822033898304, "grad_norm": 2.6295552253723145, "learning_rate": 7.181541313559323e-06, "loss": 1.0217, "mean_token_accuracy": 0.7486415430903435, "num_tokens": 17126663.0, "step": 21286 }, { "epoch": 5.637711864406779, "grad_norm": 2.6422667503356934, "learning_rate": 7.181276483050849e-06, "loss": 1.4262, "mean_token_accuracy": 0.6735158935189247, "num_tokens": 17128093.0, "step": 21288 }, { "epoch": 5.638241525423728, "grad_norm": 2.327580213546753, "learning_rate": 7.181011652542374e-06, "loss": 1.1206, "mean_token_accuracy": 0.7527128383517265, "num_tokens": 17129396.0, "step": 21290 }, { "epoch": 5.638771186440678, "grad_norm": 2.058210849761963, "learning_rate": 7.1807468220338995e-06, "loss": 0.9545, "mean_token_accuracy": 0.7779151685535908, "num_tokens": 17131012.0, "step": 21292 }, { "epoch": 5.639300847457627, "grad_norm": 2.5319783687591553, "learning_rate": 7.180481991525424e-06, "loss": 1.2493, "mean_token_accuracy": 0.7041065916419029, "num_tokens": 17132677.0, "step": 21294 }, { "epoch": 5.639830508474576, "grad_norm": 2.1989023685455322, "learning_rate": 7.18021716101695e-06, "loss": 1.3011, "mean_token_accuracy": 0.7083693966269493, "num_tokens": 17134139.0, "step": 21296 }, { "epoch": 5.640360169491525, "grad_norm": 1.8485227823257446, "learning_rate": 7.179952330508475e-06, "loss": 1.0419, "mean_token_accuracy": 0.747011311352253, "num_tokens": 17135571.0, "step": 21298 }, { "epoch": 5.640889830508475, "grad_norm": 2.3874118328094482, "learning_rate": 7.179687500000001e-06, "loss": 1.4146, "mean_token_accuracy": 0.6979970037937164, "num_tokens": 17137058.0, "step": 21300 }, { "epoch": 5.641419491525424, "grad_norm": 1.9559364318847656, "learning_rate": 7.179422669491526e-06, "loss": 0.8773, "mean_token_accuracy": 0.7714500576257706, "num_tokens": 17138877.0, "step": 21302 }, { "epoch": 5.641949152542373, "grad_norm": 2.2875378131866455, "learning_rate": 7.179157838983052e-06, "loss": 0.9133, "mean_token_accuracy": 0.7535229846835136, "num_tokens": 17140341.0, "step": 21304 }, { "epoch": 5.642478813559322, "grad_norm": 2.191051959991455, "learning_rate": 7.1788930084745765e-06, "loss": 1.2897, "mean_token_accuracy": 0.7379393726587296, "num_tokens": 17141975.0, "step": 21306 }, { "epoch": 5.643008474576272, "grad_norm": 2.1036810874938965, "learning_rate": 7.178628177966102e-06, "loss": 1.017, "mean_token_accuracy": 0.7380882725119591, "num_tokens": 17143618.0, "step": 21308 }, { "epoch": 5.643538135593221, "grad_norm": 2.028980255126953, "learning_rate": 7.178363347457627e-06, "loss": 1.2593, "mean_token_accuracy": 0.6957582011818886, "num_tokens": 17145211.0, "step": 21310 }, { "epoch": 5.6440677966101696, "grad_norm": 1.9530121088027954, "learning_rate": 7.178098516949154e-06, "loss": 0.8899, "mean_token_accuracy": 0.7780424281954765, "num_tokens": 17146682.0, "step": 21312 }, { "epoch": 5.6445974576271185, "grad_norm": 2.1383540630340576, "learning_rate": 7.177833686440679e-06, "loss": 1.855, "mean_token_accuracy": 0.5809328481554985, "num_tokens": 17148705.0, "step": 21314 }, { "epoch": 5.6451271186440675, "grad_norm": 2.0630311965942383, "learning_rate": 7.1775688559322045e-06, "loss": 1.3098, "mean_token_accuracy": 0.6925158053636551, "num_tokens": 17150903.0, "step": 21316 }, { "epoch": 5.645656779661017, "grad_norm": 1.8789180517196655, "learning_rate": 7.1773040254237295e-06, "loss": 1.2129, "mean_token_accuracy": 0.713382177054882, "num_tokens": 17152535.0, "step": 21318 }, { "epoch": 5.646186440677966, "grad_norm": 1.9920648336410522, "learning_rate": 7.177039194915255e-06, "loss": 0.7938, "mean_token_accuracy": 0.8042655736207962, "num_tokens": 17154150.0, "step": 21320 }, { "epoch": 5.646716101694915, "grad_norm": 1.6894038915634155, "learning_rate": 7.17677436440678e-06, "loss": 0.8381, "mean_token_accuracy": 0.801435224711895, "num_tokens": 17155568.0, "step": 21322 }, { "epoch": 5.647245762711864, "grad_norm": 2.343050718307495, "learning_rate": 7.176509533898306e-06, "loss": 1.4867, "mean_token_accuracy": 0.6808380633592606, "num_tokens": 17157233.0, "step": 21324 }, { "epoch": 5.647775423728813, "grad_norm": 2.201301097869873, "learning_rate": 7.176244703389831e-06, "loss": 1.147, "mean_token_accuracy": 0.7192761078476906, "num_tokens": 17159276.0, "step": 21326 }, { "epoch": 5.648305084745763, "grad_norm": 2.213207721710205, "learning_rate": 7.175979872881357e-06, "loss": 1.1062, "mean_token_accuracy": 0.7332070991396904, "num_tokens": 17160762.0, "step": 21328 }, { "epoch": 5.648834745762712, "grad_norm": 2.178983449935913, "learning_rate": 7.175715042372882e-06, "loss": 1.1384, "mean_token_accuracy": 0.7256646826863289, "num_tokens": 17162523.0, "step": 21330 }, { "epoch": 5.649364406779661, "grad_norm": 1.8410900831222534, "learning_rate": 7.175450211864407e-06, "loss": 0.8482, "mean_token_accuracy": 0.7511474117636681, "num_tokens": 17163916.0, "step": 21332 }, { "epoch": 5.64989406779661, "grad_norm": 2.344695806503296, "learning_rate": 7.175185381355932e-06, "loss": 1.5804, "mean_token_accuracy": 0.6348962262272835, "num_tokens": 17165561.0, "step": 21334 }, { "epoch": 5.65042372881356, "grad_norm": 2.267348527908325, "learning_rate": 7.174920550847458e-06, "loss": 1.3773, "mean_token_accuracy": 0.7080853916704655, "num_tokens": 17167264.0, "step": 21336 }, { "epoch": 5.650953389830509, "grad_norm": 1.8855130672454834, "learning_rate": 7.174655720338983e-06, "loss": 1.0589, "mean_token_accuracy": 0.7494878023862839, "num_tokens": 17168960.0, "step": 21338 }, { "epoch": 5.651483050847458, "grad_norm": 2.346360683441162, "learning_rate": 7.17439088983051e-06, "loss": 1.2656, "mean_token_accuracy": 0.6779847033321857, "num_tokens": 17170740.0, "step": 21340 }, { "epoch": 5.652012711864407, "grad_norm": 2.5442471504211426, "learning_rate": 7.174126059322034e-06, "loss": 1.605, "mean_token_accuracy": 0.6612912490963936, "num_tokens": 17172185.0, "step": 21342 }, { "epoch": 5.652542372881356, "grad_norm": 2.8353567123413086, "learning_rate": 7.17386122881356e-06, "loss": 1.1441, "mean_token_accuracy": 0.7306646108627319, "num_tokens": 17173452.0, "step": 21344 }, { "epoch": 5.653072033898305, "grad_norm": 2.005809783935547, "learning_rate": 7.173596398305085e-06, "loss": 1.2186, "mean_token_accuracy": 0.7228565514087677, "num_tokens": 17174971.0, "step": 21346 }, { "epoch": 5.653601694915254, "grad_norm": 2.452857255935669, "learning_rate": 7.173331567796611e-06, "loss": 1.6314, "mean_token_accuracy": 0.6393439248204231, "num_tokens": 17176731.0, "step": 21348 }, { "epoch": 5.654131355932203, "grad_norm": 2.4299447536468506, "learning_rate": 7.173066737288136e-06, "loss": 1.2243, "mean_token_accuracy": 0.7251272797584534, "num_tokens": 17179071.0, "step": 21350 }, { "epoch": 5.654661016949152, "grad_norm": 2.1811442375183105, "learning_rate": 7.172801906779662e-06, "loss": 1.2634, "mean_token_accuracy": 0.7144851088523865, "num_tokens": 17180739.0, "step": 21352 }, { "epoch": 5.655190677966102, "grad_norm": 1.9741512537002563, "learning_rate": 7.172537076271187e-06, "loss": 1.1648, "mean_token_accuracy": 0.7304455563426018, "num_tokens": 17182496.0, "step": 21354 }, { "epoch": 5.655720338983051, "grad_norm": 2.553523302078247, "learning_rate": 7.1722722457627124e-06, "loss": 1.4213, "mean_token_accuracy": 0.7149728238582611, "num_tokens": 17184011.0, "step": 21356 }, { "epoch": 5.65625, "grad_norm": 2.7781803607940674, "learning_rate": 7.172007415254237e-06, "loss": 1.7863, "mean_token_accuracy": 0.6120096743106842, "num_tokens": 17185667.0, "step": 21358 }, { "epoch": 5.656779661016949, "grad_norm": 2.4985501766204834, "learning_rate": 7.171742584745763e-06, "loss": 0.9664, "mean_token_accuracy": 0.7695013880729675, "num_tokens": 17187294.0, "step": 21360 }, { "epoch": 5.657309322033898, "grad_norm": 3.087406873703003, "learning_rate": 7.171477754237288e-06, "loss": 1.4087, "mean_token_accuracy": 0.6848553493618965, "num_tokens": 17188596.0, "step": 21362 }, { "epoch": 5.657838983050848, "grad_norm": 2.833287239074707, "learning_rate": 7.171212923728814e-06, "loss": 1.3691, "mean_token_accuracy": 0.6921625435352325, "num_tokens": 17189908.0, "step": 21364 }, { "epoch": 5.658368644067797, "grad_norm": 1.6891947984695435, "learning_rate": 7.170948093220339e-06, "loss": 0.9437, "mean_token_accuracy": 0.7528441771864891, "num_tokens": 17191635.0, "step": 21366 }, { "epoch": 5.658898305084746, "grad_norm": 1.8858873844146729, "learning_rate": 7.170683262711865e-06, "loss": 0.9941, "mean_token_accuracy": 0.7390500754117966, "num_tokens": 17193196.0, "step": 21368 }, { "epoch": 5.659427966101695, "grad_norm": 2.1977012157440186, "learning_rate": 7.170418432203391e-06, "loss": 1.1143, "mean_token_accuracy": 0.7453158497810364, "num_tokens": 17194597.0, "step": 21370 }, { "epoch": 5.659957627118644, "grad_norm": 2.265909433364868, "learning_rate": 7.170153601694916e-06, "loss": 1.1988, "mean_token_accuracy": 0.736949659883976, "num_tokens": 17196169.0, "step": 21372 }, { "epoch": 5.660487288135593, "grad_norm": 2.190574884414673, "learning_rate": 7.169888771186442e-06, "loss": 1.1744, "mean_token_accuracy": 0.7339157834649086, "num_tokens": 17197659.0, "step": 21374 }, { "epoch": 5.661016949152542, "grad_norm": 2.3462297916412354, "learning_rate": 7.169623940677967e-06, "loss": 1.4752, "mean_token_accuracy": 0.679486483335495, "num_tokens": 17199387.0, "step": 21376 }, { "epoch": 5.661546610169491, "grad_norm": 2.3506057262420654, "learning_rate": 7.169359110169493e-06, "loss": 1.2574, "mean_token_accuracy": 0.696281872689724, "num_tokens": 17200934.0, "step": 21378 }, { "epoch": 5.66207627118644, "grad_norm": 2.325528144836426, "learning_rate": 7.1690942796610175e-06, "loss": 1.2191, "mean_token_accuracy": 0.707911878824234, "num_tokens": 17202376.0, "step": 21380 }, { "epoch": 5.66260593220339, "grad_norm": 1.9964606761932373, "learning_rate": 7.168829449152543e-06, "loss": 0.9657, "mean_token_accuracy": 0.7541927546262741, "num_tokens": 17203933.0, "step": 21382 }, { "epoch": 5.663135593220339, "grad_norm": 2.37849760055542, "learning_rate": 7.168564618644068e-06, "loss": 1.0862, "mean_token_accuracy": 0.74461929500103, "num_tokens": 17205573.0, "step": 21384 }, { "epoch": 5.663665254237288, "grad_norm": 2.7736852169036865, "learning_rate": 7.168299788135594e-06, "loss": 1.5815, "mean_token_accuracy": 0.6378707587718964, "num_tokens": 17207084.0, "step": 21386 }, { "epoch": 5.664194915254237, "grad_norm": 1.8157079219818115, "learning_rate": 7.168034957627119e-06, "loss": 1.0959, "mean_token_accuracy": 0.7514100559055805, "num_tokens": 17208820.0, "step": 21388 }, { "epoch": 5.664724576271187, "grad_norm": 2.324124574661255, "learning_rate": 7.167770127118645e-06, "loss": 1.2002, "mean_token_accuracy": 0.7450720891356468, "num_tokens": 17210628.0, "step": 21390 }, { "epoch": 5.665254237288136, "grad_norm": 2.2384822368621826, "learning_rate": 7.16750529661017e-06, "loss": 1.0035, "mean_token_accuracy": 0.7591937556862831, "num_tokens": 17212105.0, "step": 21392 }, { "epoch": 5.665783898305085, "grad_norm": 1.6859824657440186, "learning_rate": 7.167240466101696e-06, "loss": 0.9126, "mean_token_accuracy": 0.7578483894467354, "num_tokens": 17213736.0, "step": 21394 }, { "epoch": 5.666313559322034, "grad_norm": 1.9149080514907837, "learning_rate": 7.16697563559322e-06, "loss": 1.0035, "mean_token_accuracy": 0.7701929435133934, "num_tokens": 17215451.0, "step": 21396 }, { "epoch": 5.666843220338983, "grad_norm": 2.306023359298706, "learning_rate": 7.166710805084747e-06, "loss": 1.1142, "mean_token_accuracy": 0.7210138887166977, "num_tokens": 17217033.0, "step": 21398 }, { "epoch": 5.6673728813559325, "grad_norm": 2.097433567047119, "learning_rate": 7.166445974576272e-06, "loss": 1.2116, "mean_token_accuracy": 0.699919156730175, "num_tokens": 17218877.0, "step": 21400 }, { "epoch": 5.6679025423728815, "grad_norm": 2.110551118850708, "learning_rate": 7.166181144067798e-06, "loss": 1.3946, "mean_token_accuracy": 0.6996721625328064, "num_tokens": 17220768.0, "step": 21402 }, { "epoch": 5.6684322033898304, "grad_norm": 2.1735599040985107, "learning_rate": 7.165916313559323e-06, "loss": 1.6562, "mean_token_accuracy": 0.6397368833422661, "num_tokens": 17222614.0, "step": 21404 }, { "epoch": 5.668961864406779, "grad_norm": 1.9350148439407349, "learning_rate": 7.165651483050848e-06, "loss": 0.9626, "mean_token_accuracy": 0.7495126575231552, "num_tokens": 17224282.0, "step": 21406 }, { "epoch": 5.669491525423728, "grad_norm": 1.6917791366577148, "learning_rate": 7.165386652542373e-06, "loss": 1.0213, "mean_token_accuracy": 0.7562646791338921, "num_tokens": 17226599.0, "step": 21408 }, { "epoch": 5.670021186440678, "grad_norm": 2.7033724784851074, "learning_rate": 7.165121822033899e-06, "loss": 1.7144, "mean_token_accuracy": 0.6334620043635368, "num_tokens": 17228153.0, "step": 21410 }, { "epoch": 5.670550847457627, "grad_norm": 2.217988967895508, "learning_rate": 7.164856991525424e-06, "loss": 1.4452, "mean_token_accuracy": 0.6958786025643349, "num_tokens": 17229592.0, "step": 21412 }, { "epoch": 5.671080508474576, "grad_norm": 2.1990725994110107, "learning_rate": 7.16459216101695e-06, "loss": 1.1475, "mean_token_accuracy": 0.7298947870731354, "num_tokens": 17231287.0, "step": 21414 }, { "epoch": 5.671610169491525, "grad_norm": 2.438339948654175, "learning_rate": 7.164327330508475e-06, "loss": 1.2806, "mean_token_accuracy": 0.7017959579825401, "num_tokens": 17232890.0, "step": 21416 }, { "epoch": 5.672139830508475, "grad_norm": 2.351839303970337, "learning_rate": 7.1640625000000005e-06, "loss": 1.3206, "mean_token_accuracy": 0.6913580074906349, "num_tokens": 17234579.0, "step": 21418 }, { "epoch": 5.672669491525424, "grad_norm": 2.1441497802734375, "learning_rate": 7.1637976694915254e-06, "loss": 1.0306, "mean_token_accuracy": 0.7370183542370796, "num_tokens": 17235931.0, "step": 21420 }, { "epoch": 5.673199152542373, "grad_norm": 1.9218627214431763, "learning_rate": 7.163532838983052e-06, "loss": 1.429, "mean_token_accuracy": 0.6939284726977348, "num_tokens": 17237840.0, "step": 21422 }, { "epoch": 5.673728813559322, "grad_norm": 1.4364590644836426, "learning_rate": 7.163268008474576e-06, "loss": 0.9901, "mean_token_accuracy": 0.7591201886534691, "num_tokens": 17239730.0, "step": 21424 }, { "epoch": 5.674258474576272, "grad_norm": 2.402916669845581, "learning_rate": 7.163003177966103e-06, "loss": 1.5081, "mean_token_accuracy": 0.6500389203429222, "num_tokens": 17241848.0, "step": 21426 }, { "epoch": 5.674788135593221, "grad_norm": 2.03236722946167, "learning_rate": 7.162738347457628e-06, "loss": 1.1401, "mean_token_accuracy": 0.7109562382102013, "num_tokens": 17243540.0, "step": 21428 }, { "epoch": 5.6753177966101696, "grad_norm": 2.065091371536255, "learning_rate": 7.1624735169491535e-06, "loss": 0.9484, "mean_token_accuracy": 0.7775730416178703, "num_tokens": 17244887.0, "step": 21430 }, { "epoch": 5.6758474576271185, "grad_norm": 1.9499670267105103, "learning_rate": 7.162208686440678e-06, "loss": 1.3189, "mean_token_accuracy": 0.7301685884594917, "num_tokens": 17246577.0, "step": 21432 }, { "epoch": 5.6763771186440675, "grad_norm": 2.679772138595581, "learning_rate": 7.161943855932204e-06, "loss": 1.3874, "mean_token_accuracy": 0.7024144977331161, "num_tokens": 17248059.0, "step": 21434 }, { "epoch": 5.676906779661017, "grad_norm": 2.2014706134796143, "learning_rate": 7.161679025423729e-06, "loss": 1.0049, "mean_token_accuracy": 0.7398529574275017, "num_tokens": 17249847.0, "step": 21436 }, { "epoch": 5.677436440677966, "grad_norm": 1.6865648031234741, "learning_rate": 7.161414194915255e-06, "loss": 0.6216, "mean_token_accuracy": 0.8355044424533844, "num_tokens": 17251478.0, "step": 21438 }, { "epoch": 5.677966101694915, "grad_norm": 2.1061434745788574, "learning_rate": 7.16114936440678e-06, "loss": 1.2034, "mean_token_accuracy": 0.7164895460009575, "num_tokens": 17252988.0, "step": 21440 }, { "epoch": 5.678495762711864, "grad_norm": 2.295858860015869, "learning_rate": 7.1608845338983056e-06, "loss": 1.3132, "mean_token_accuracy": 0.7020884901285172, "num_tokens": 17254633.0, "step": 21442 }, { "epoch": 5.679025423728813, "grad_norm": 2.0389463901519775, "learning_rate": 7.1606197033898305e-06, "loss": 1.3262, "mean_token_accuracy": 0.6952814012765884, "num_tokens": 17256382.0, "step": 21444 }, { "epoch": 5.679555084745763, "grad_norm": 2.233079671859741, "learning_rate": 7.160354872881356e-06, "loss": 1.2493, "mean_token_accuracy": 0.7363134175539017, "num_tokens": 17257820.0, "step": 21446 }, { "epoch": 5.680084745762712, "grad_norm": 1.9151607751846313, "learning_rate": 7.160090042372881e-06, "loss": 1.1205, "mean_token_accuracy": 0.7208569943904877, "num_tokens": 17259495.0, "step": 21448 }, { "epoch": 5.680614406779661, "grad_norm": 2.2557504177093506, "learning_rate": 7.159825211864407e-06, "loss": 1.2036, "mean_token_accuracy": 0.7313650101423264, "num_tokens": 17261183.0, "step": 21450 }, { "epoch": 5.68114406779661, "grad_norm": 2.1773762702941895, "learning_rate": 7.159560381355934e-06, "loss": 0.8617, "mean_token_accuracy": 0.7688064426183701, "num_tokens": 17262576.0, "step": 21452 }, { "epoch": 5.68167372881356, "grad_norm": 1.9292802810668945, "learning_rate": 7.1592955508474585e-06, "loss": 1.3564, "mean_token_accuracy": 0.6852436140179634, "num_tokens": 17264404.0, "step": 21454 }, { "epoch": 5.682203389830509, "grad_norm": 2.6522328853607178, "learning_rate": 7.159030720338984e-06, "loss": 1.4143, "mean_token_accuracy": 0.6785897016525269, "num_tokens": 17266051.0, "step": 21456 }, { "epoch": 5.682733050847458, "grad_norm": 2.432969093322754, "learning_rate": 7.158765889830509e-06, "loss": 1.4238, "mean_token_accuracy": 0.6960474103689194, "num_tokens": 17267634.0, "step": 21458 }, { "epoch": 5.683262711864407, "grad_norm": 2.2191920280456543, "learning_rate": 7.158501059322035e-06, "loss": 1.2232, "mean_token_accuracy": 0.6995781734585762, "num_tokens": 17269270.0, "step": 21460 }, { "epoch": 5.683792372881356, "grad_norm": 2.5499322414398193, "learning_rate": 7.15823622881356e-06, "loss": 1.7928, "mean_token_accuracy": 0.6002606451511383, "num_tokens": 17271187.0, "step": 21462 }, { "epoch": 5.684322033898305, "grad_norm": 2.4670047760009766, "learning_rate": 7.157971398305086e-06, "loss": 1.6516, "mean_token_accuracy": 0.6356991901993752, "num_tokens": 17272803.0, "step": 21464 }, { "epoch": 5.684851694915254, "grad_norm": 1.99503493309021, "learning_rate": 7.157706567796611e-06, "loss": 1.2444, "mean_token_accuracy": 0.718444250524044, "num_tokens": 17274684.0, "step": 21466 }, { "epoch": 5.685381355932203, "grad_norm": 2.3951401710510254, "learning_rate": 7.1574417372881364e-06, "loss": 1.2268, "mean_token_accuracy": 0.7111623585224152, "num_tokens": 17276033.0, "step": 21468 }, { "epoch": 5.685911016949152, "grad_norm": 2.5109755992889404, "learning_rate": 7.157176906779661e-06, "loss": 1.4402, "mean_token_accuracy": 0.66283068805933, "num_tokens": 17277532.0, "step": 21470 }, { "epoch": 5.686440677966102, "grad_norm": 2.358773946762085, "learning_rate": 7.156912076271187e-06, "loss": 1.4962, "mean_token_accuracy": 0.6634761467576027, "num_tokens": 17279023.0, "step": 21472 }, { "epoch": 5.686970338983051, "grad_norm": 1.9991565942764282, "learning_rate": 7.156647245762712e-06, "loss": 1.0655, "mean_token_accuracy": 0.768582209944725, "num_tokens": 17280660.0, "step": 21474 }, { "epoch": 5.6875, "grad_norm": 2.466318130493164, "learning_rate": 7.156382415254239e-06, "loss": 1.4875, "mean_token_accuracy": 0.6863015964627266, "num_tokens": 17282135.0, "step": 21476 }, { "epoch": 5.688029661016949, "grad_norm": 2.096489667892456, "learning_rate": 7.156117584745763e-06, "loss": 0.9294, "mean_token_accuracy": 0.7830333709716797, "num_tokens": 17283730.0, "step": 21478 }, { "epoch": 5.688559322033898, "grad_norm": 2.586549758911133, "learning_rate": 7.155852754237289e-06, "loss": 1.3794, "mean_token_accuracy": 0.7037844136357307, "num_tokens": 17285163.0, "step": 21480 }, { "epoch": 5.689088983050848, "grad_norm": 1.6658763885498047, "learning_rate": 7.155587923728814e-06, "loss": 1.0265, "mean_token_accuracy": 0.7581212744116783, "num_tokens": 17287424.0, "step": 21482 }, { "epoch": 5.689618644067797, "grad_norm": 2.2091238498687744, "learning_rate": 7.15532309322034e-06, "loss": 1.8799, "mean_token_accuracy": 0.6153226047754288, "num_tokens": 17288986.0, "step": 21484 }, { "epoch": 5.690148305084746, "grad_norm": 2.2100212574005127, "learning_rate": 7.155058262711865e-06, "loss": 1.3067, "mean_token_accuracy": 0.708794966340065, "num_tokens": 17290822.0, "step": 21486 }, { "epoch": 5.690677966101695, "grad_norm": 2.2857537269592285, "learning_rate": 7.154793432203391e-06, "loss": 1.4143, "mean_token_accuracy": 0.6609729304909706, "num_tokens": 17292458.0, "step": 21488 }, { "epoch": 5.691207627118644, "grad_norm": 2.254796028137207, "learning_rate": 7.154528601694916e-06, "loss": 1.3187, "mean_token_accuracy": 0.694009006023407, "num_tokens": 17294005.0, "step": 21490 }, { "epoch": 5.691737288135593, "grad_norm": 2.2305667400360107, "learning_rate": 7.1542637711864415e-06, "loss": 0.9153, "mean_token_accuracy": 0.7774622663855553, "num_tokens": 17295439.0, "step": 21492 }, { "epoch": 5.692266949152542, "grad_norm": 2.2692525386810303, "learning_rate": 7.1539989406779664e-06, "loss": 0.9813, "mean_token_accuracy": 0.7541427239775658, "num_tokens": 17297014.0, "step": 21494 }, { "epoch": 5.692796610169491, "grad_norm": 2.0828166007995605, "learning_rate": 7.153734110169492e-06, "loss": 1.0881, "mean_token_accuracy": 0.750875748693943, "num_tokens": 17298529.0, "step": 21496 }, { "epoch": 5.69332627118644, "grad_norm": 2.4654324054718018, "learning_rate": 7.153469279661017e-06, "loss": 1.2647, "mean_token_accuracy": 0.7079680934548378, "num_tokens": 17300151.0, "step": 21498 }, { "epoch": 5.69385593220339, "grad_norm": 2.0502586364746094, "learning_rate": 7.153204449152543e-06, "loss": 1.1635, "step": 21500 }, { "epoch": 5.69385593220339, "eval_loss": 1.316727876663208, "eval_mean_token_accuracy": 0.7015481506075177, "eval_num_tokens": 17302155.0, "eval_runtime": 48.2988, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 21500 }, { "epoch": 5.694385593220339, "grad_norm": 2.186260223388672, "learning_rate": 7.152939618644068e-06, "loss": 1.8137, "mean_token_accuracy": 0.664281077682972, "num_tokens": 17303711.0, "step": 21502 }, { "epoch": 5.694915254237288, "grad_norm": 2.591564416885376, "learning_rate": 7.152674788135594e-06, "loss": 1.3747, "mean_token_accuracy": 0.6921857297420502, "num_tokens": 17305347.0, "step": 21504 }, { "epoch": 5.695444915254237, "grad_norm": 2.2341601848602295, "learning_rate": 7.1524099576271186e-06, "loss": 1.3362, "mean_token_accuracy": 0.7014848068356514, "num_tokens": 17306909.0, "step": 21506 }, { "epoch": 5.695974576271187, "grad_norm": 1.9831233024597168, "learning_rate": 7.152145127118645e-06, "loss": 1.2163, "mean_token_accuracy": 0.7080427035689354, "num_tokens": 17308462.0, "step": 21508 }, { "epoch": 5.696504237288136, "grad_norm": 2.188638210296631, "learning_rate": 7.15188029661017e-06, "loss": 1.4162, "mean_token_accuracy": 0.6753509864211082, "num_tokens": 17310098.0, "step": 21510 }, { "epoch": 5.697033898305085, "grad_norm": 1.8801027536392212, "learning_rate": 7.151615466101696e-06, "loss": 1.511, "mean_token_accuracy": 0.6685348898172379, "num_tokens": 17311720.0, "step": 21512 }, { "epoch": 5.697563559322034, "grad_norm": 1.694988489151001, "learning_rate": 7.151350635593221e-06, "loss": 0.922, "mean_token_accuracy": 0.763801321387291, "num_tokens": 17313721.0, "step": 21514 }, { "epoch": 5.698093220338983, "grad_norm": 1.6996442079544067, "learning_rate": 7.151085805084747e-06, "loss": 1.1206, "mean_token_accuracy": 0.7401600182056427, "num_tokens": 17315579.0, "step": 21516 }, { "epoch": 5.6986228813559325, "grad_norm": 2.2147436141967773, "learning_rate": 7.1508209745762715e-06, "loss": 1.6863, "mean_token_accuracy": 0.6457457020878792, "num_tokens": 17317110.0, "step": 21518 }, { "epoch": 5.6991525423728815, "grad_norm": 2.2313547134399414, "learning_rate": 7.150556144067797e-06, "loss": 1.2477, "mean_token_accuracy": 0.7205196246504784, "num_tokens": 17318528.0, "step": 21520 }, { "epoch": 5.6996822033898304, "grad_norm": 1.8716397285461426, "learning_rate": 7.150291313559322e-06, "loss": 1.2272, "mean_token_accuracy": 0.724844329059124, "num_tokens": 17319935.0, "step": 21522 }, { "epoch": 5.700211864406779, "grad_norm": 2.376027822494507, "learning_rate": 7.150026483050848e-06, "loss": 1.1306, "mean_token_accuracy": 0.7415611147880554, "num_tokens": 17321525.0, "step": 21524 }, { "epoch": 5.700741525423728, "grad_norm": 2.575955390930176, "learning_rate": 7.149761652542373e-06, "loss": 1.34, "mean_token_accuracy": 0.672413557767868, "num_tokens": 17323151.0, "step": 21526 }, { "epoch": 5.701271186440678, "grad_norm": 2.593562126159668, "learning_rate": 7.149496822033899e-06, "loss": 1.1917, "mean_token_accuracy": 0.7262038737535477, "num_tokens": 17324789.0, "step": 21528 }, { "epoch": 5.701800847457627, "grad_norm": 2.7230336666107178, "learning_rate": 7.149231991525424e-06, "loss": 1.4361, "mean_token_accuracy": 0.6780791655182838, "num_tokens": 17326071.0, "step": 21530 }, { "epoch": 5.702330508474576, "grad_norm": 1.9014972448349, "learning_rate": 7.148967161016949e-06, "loss": 1.153, "mean_token_accuracy": 0.7242208048701286, "num_tokens": 17327561.0, "step": 21532 }, { "epoch": 5.702860169491525, "grad_norm": 2.6491963863372803, "learning_rate": 7.148702330508474e-06, "loss": 1.4671, "mean_token_accuracy": 0.6894234642386436, "num_tokens": 17328729.0, "step": 21534 }, { "epoch": 5.703389830508475, "grad_norm": 2.2472729682922363, "learning_rate": 7.148437500000001e-06, "loss": 1.1722, "mean_token_accuracy": 0.7500912174582481, "num_tokens": 17330340.0, "step": 21536 }, { "epoch": 5.703919491525424, "grad_norm": 2.4817850589752197, "learning_rate": 7.148172669491527e-06, "loss": 1.1544, "mean_token_accuracy": 0.7080254331231117, "num_tokens": 17332164.0, "step": 21538 }, { "epoch": 5.704449152542373, "grad_norm": 1.8717809915542603, "learning_rate": 7.147907838983052e-06, "loss": 1.2436, "mean_token_accuracy": 0.7133201211690903, "num_tokens": 17333951.0, "step": 21540 }, { "epoch": 5.704978813559322, "grad_norm": 2.1577064990997314, "learning_rate": 7.1476430084745774e-06, "loss": 1.1329, "mean_token_accuracy": 0.7411683201789856, "num_tokens": 17335507.0, "step": 21542 }, { "epoch": 5.705508474576272, "grad_norm": 2.455961227416992, "learning_rate": 7.147378177966102e-06, "loss": 1.2043, "mean_token_accuracy": 0.7354536652565002, "num_tokens": 17336852.0, "step": 21544 }, { "epoch": 5.706038135593221, "grad_norm": 2.487156629562378, "learning_rate": 7.147113347457628e-06, "loss": 1.5359, "mean_token_accuracy": 0.6825492456555367, "num_tokens": 17338539.0, "step": 21546 }, { "epoch": 5.7065677966101696, "grad_norm": 1.8933826684951782, "learning_rate": 7.146848516949153e-06, "loss": 1.1667, "mean_token_accuracy": 0.739528626203537, "num_tokens": 17340144.0, "step": 21548 }, { "epoch": 5.7070974576271185, "grad_norm": 2.621535539627075, "learning_rate": 7.146583686440679e-06, "loss": 0.9613, "mean_token_accuracy": 0.7616705521941185, "num_tokens": 17341594.0, "step": 21550 }, { "epoch": 5.7076271186440675, "grad_norm": 2.0695197582244873, "learning_rate": 7.146318855932204e-06, "loss": 1.3818, "mean_token_accuracy": 0.6740992814302444, "num_tokens": 17343426.0, "step": 21552 }, { "epoch": 5.708156779661017, "grad_norm": 2.2488622665405273, "learning_rate": 7.1460540254237296e-06, "loss": 1.1406, "mean_token_accuracy": 0.709406666457653, "num_tokens": 17345021.0, "step": 21554 }, { "epoch": 5.708686440677966, "grad_norm": 2.1726248264312744, "learning_rate": 7.1457891949152545e-06, "loss": 1.249, "mean_token_accuracy": 0.7079755589365959, "num_tokens": 17346576.0, "step": 21556 }, { "epoch": 5.709216101694915, "grad_norm": 2.1751821041107178, "learning_rate": 7.14552436440678e-06, "loss": 0.8469, "mean_token_accuracy": 0.7842900678515434, "num_tokens": 17348209.0, "step": 21558 }, { "epoch": 5.709745762711864, "grad_norm": 2.778629779815674, "learning_rate": 7.145259533898305e-06, "loss": 1.5687, "mean_token_accuracy": 0.6077772751450539, "num_tokens": 17350387.0, "step": 21560 }, { "epoch": 5.710275423728813, "grad_norm": 1.949948787689209, "learning_rate": 7.144994703389832e-06, "loss": 1.1732, "mean_token_accuracy": 0.7168052569031715, "num_tokens": 17352051.0, "step": 21562 }, { "epoch": 5.710805084745763, "grad_norm": 2.249568223953247, "learning_rate": 7.144729872881357e-06, "loss": 1.2766, "mean_token_accuracy": 0.6996364966034889, "num_tokens": 17353675.0, "step": 21564 }, { "epoch": 5.711334745762712, "grad_norm": 1.8115705251693726, "learning_rate": 7.1444650423728825e-06, "loss": 1.1446, "mean_token_accuracy": 0.722997710108757, "num_tokens": 17355188.0, "step": 21566 }, { "epoch": 5.711864406779661, "grad_norm": 2.2958154678344727, "learning_rate": 7.1442002118644075e-06, "loss": 1.3898, "mean_token_accuracy": 0.6888115108013153, "num_tokens": 17356593.0, "step": 21568 }, { "epoch": 5.71239406779661, "grad_norm": 2.1948187351226807, "learning_rate": 7.143935381355933e-06, "loss": 1.2382, "mean_token_accuracy": 0.7285861000418663, "num_tokens": 17357954.0, "step": 21570 }, { "epoch": 5.71292372881356, "grad_norm": 2.0097174644470215, "learning_rate": 7.143670550847458e-06, "loss": 1.0101, "mean_token_accuracy": 0.7458231076598167, "num_tokens": 17359468.0, "step": 21572 }, { "epoch": 5.713453389830509, "grad_norm": 2.857882499694824, "learning_rate": 7.143405720338984e-06, "loss": 1.0332, "mean_token_accuracy": 0.7565946355462074, "num_tokens": 17360783.0, "step": 21574 }, { "epoch": 5.713983050847458, "grad_norm": 2.138610601425171, "learning_rate": 7.143140889830509e-06, "loss": 1.0361, "mean_token_accuracy": 0.7576628476381302, "num_tokens": 17362131.0, "step": 21576 }, { "epoch": 5.714512711864407, "grad_norm": 2.298386573791504, "learning_rate": 7.142876059322035e-06, "loss": 1.5076, "mean_token_accuracy": 0.6685025990009308, "num_tokens": 17363724.0, "step": 21578 }, { "epoch": 5.715042372881356, "grad_norm": 1.9113370180130005, "learning_rate": 7.1426112288135596e-06, "loss": 1.0602, "mean_token_accuracy": 0.7550070360302925, "num_tokens": 17365286.0, "step": 21580 }, { "epoch": 5.715572033898305, "grad_norm": 2.2610878944396973, "learning_rate": 7.142346398305085e-06, "loss": 0.8665, "mean_token_accuracy": 0.7643452733755112, "num_tokens": 17366796.0, "step": 21582 }, { "epoch": 5.716101694915254, "grad_norm": 2.173919439315796, "learning_rate": 7.14208156779661e-06, "loss": 0.8591, "mean_token_accuracy": 0.7737052738666534, "num_tokens": 17368193.0, "step": 21584 }, { "epoch": 5.716631355932203, "grad_norm": 2.3511650562286377, "learning_rate": 7.141816737288136e-06, "loss": 1.5614, "mean_token_accuracy": 0.684215135872364, "num_tokens": 17369802.0, "step": 21586 }, { "epoch": 5.717161016949152, "grad_norm": 2.336719512939453, "learning_rate": 7.141551906779661e-06, "loss": 1.7252, "mean_token_accuracy": 0.6207120642066002, "num_tokens": 17371426.0, "step": 21588 }, { "epoch": 5.717690677966102, "grad_norm": 1.858885407447815, "learning_rate": 7.141287076271188e-06, "loss": 1.1917, "mean_token_accuracy": 0.7276785671710968, "num_tokens": 17372899.0, "step": 21590 }, { "epoch": 5.718220338983051, "grad_norm": 2.1020495891571045, "learning_rate": 7.141022245762712e-06, "loss": 0.7662, "mean_token_accuracy": 0.7914711162447929, "num_tokens": 17374248.0, "step": 21592 }, { "epoch": 5.71875, "grad_norm": 2.232919931411743, "learning_rate": 7.140757415254238e-06, "loss": 1.3931, "mean_token_accuracy": 0.6837935745716095, "num_tokens": 17375937.0, "step": 21594 }, { "epoch": 5.719279661016949, "grad_norm": 2.0505192279815674, "learning_rate": 7.140492584745763e-06, "loss": 1.0441, "mean_token_accuracy": 0.7236400842666626, "num_tokens": 17377514.0, "step": 21596 }, { "epoch": 5.719809322033898, "grad_norm": 1.9186396598815918, "learning_rate": 7.140227754237289e-06, "loss": 0.9942, "mean_token_accuracy": 0.7701671943068504, "num_tokens": 17379297.0, "step": 21598 }, { "epoch": 5.720338983050848, "grad_norm": 1.563812494277954, "learning_rate": 7.139962923728814e-06, "loss": 0.9385, "mean_token_accuracy": 0.7626558393239975, "num_tokens": 17381023.0, "step": 21600 }, { "epoch": 5.720868644067797, "grad_norm": 2.572350263595581, "learning_rate": 7.13969809322034e-06, "loss": 1.3762, "mean_token_accuracy": 0.6821097955107689, "num_tokens": 17382447.0, "step": 21602 }, { "epoch": 5.721398305084746, "grad_norm": 2.6167526245117188, "learning_rate": 7.139433262711865e-06, "loss": 1.224, "mean_token_accuracy": 0.7223835289478302, "num_tokens": 17384065.0, "step": 21604 }, { "epoch": 5.721927966101695, "grad_norm": 2.26808762550354, "learning_rate": 7.1391684322033904e-06, "loss": 1.6555, "mean_token_accuracy": 0.6319751143455505, "num_tokens": 17385660.0, "step": 21606 }, { "epoch": 5.722457627118644, "grad_norm": 2.2911534309387207, "learning_rate": 7.138903601694915e-06, "loss": 1.1656, "mean_token_accuracy": 0.7365527674555779, "num_tokens": 17387177.0, "step": 21608 }, { "epoch": 5.722987288135593, "grad_norm": 1.9082313776016235, "learning_rate": 7.138638771186441e-06, "loss": 1.2332, "mean_token_accuracy": 0.7029203549027443, "num_tokens": 17389040.0, "step": 21610 }, { "epoch": 5.723516949152542, "grad_norm": 2.1115753650665283, "learning_rate": 7.138373940677966e-06, "loss": 1.1154, "mean_token_accuracy": 0.7175220176577568, "num_tokens": 17390677.0, "step": 21612 }, { "epoch": 5.724046610169491, "grad_norm": 2.114898920059204, "learning_rate": 7.138109110169492e-06, "loss": 1.177, "mean_token_accuracy": 0.7173966765403748, "num_tokens": 17392045.0, "step": 21614 }, { "epoch": 5.72457627118644, "grad_norm": 1.8358180522918701, "learning_rate": 7.137844279661017e-06, "loss": 1.0668, "mean_token_accuracy": 0.7537223994731903, "num_tokens": 17393480.0, "step": 21616 }, { "epoch": 5.72510593220339, "grad_norm": 2.280871629714966, "learning_rate": 7.137579449152543e-06, "loss": 1.1899, "mean_token_accuracy": 0.7194839790463448, "num_tokens": 17395009.0, "step": 21618 }, { "epoch": 5.725635593220339, "grad_norm": 2.2803497314453125, "learning_rate": 7.1373146186440675e-06, "loss": 1.3849, "mean_token_accuracy": 0.7011374607682228, "num_tokens": 17396741.0, "step": 21620 }, { "epoch": 5.726165254237288, "grad_norm": 2.4699974060058594, "learning_rate": 7.137049788135594e-06, "loss": 0.9723, "mean_token_accuracy": 0.7732235863804817, "num_tokens": 17398382.0, "step": 21622 }, { "epoch": 5.726694915254237, "grad_norm": 2.616084337234497, "learning_rate": 7.13678495762712e-06, "loss": 1.403, "mean_token_accuracy": 0.6934667825698853, "num_tokens": 17399834.0, "step": 21624 }, { "epoch": 5.727224576271187, "grad_norm": 2.8689098358154297, "learning_rate": 7.136520127118645e-06, "loss": 1.4486, "mean_token_accuracy": 0.6938436403870583, "num_tokens": 17401219.0, "step": 21626 }, { "epoch": 5.727754237288136, "grad_norm": 2.3470096588134766, "learning_rate": 7.1362552966101706e-06, "loss": 1.004, "mean_token_accuracy": 0.7552792206406593, "num_tokens": 17402900.0, "step": 21628 }, { "epoch": 5.728283898305085, "grad_norm": 1.9739189147949219, "learning_rate": 7.1359904661016955e-06, "loss": 1.2354, "mean_token_accuracy": 0.6990892440080643, "num_tokens": 17404509.0, "step": 21630 }, { "epoch": 5.728813559322034, "grad_norm": 1.6033070087432861, "learning_rate": 7.135725635593221e-06, "loss": 0.935, "mean_token_accuracy": 0.7763247489929199, "num_tokens": 17406089.0, "step": 21632 }, { "epoch": 5.729343220338983, "grad_norm": 2.313419818878174, "learning_rate": 7.135460805084746e-06, "loss": 1.1952, "mean_token_accuracy": 0.7423023208975792, "num_tokens": 17407595.0, "step": 21634 }, { "epoch": 5.7298728813559325, "grad_norm": 2.0855190753936768, "learning_rate": 7.135195974576272e-06, "loss": 0.7516, "mean_token_accuracy": 0.8077776730060577, "num_tokens": 17409392.0, "step": 21636 }, { "epoch": 5.7304025423728815, "grad_norm": 2.6285769939422607, "learning_rate": 7.134931144067797e-06, "loss": 1.4375, "mean_token_accuracy": 0.6718990355730057, "num_tokens": 17410755.0, "step": 21638 }, { "epoch": 5.7309322033898304, "grad_norm": 2.12860369682312, "learning_rate": 7.134666313559323e-06, "loss": 1.2955, "mean_token_accuracy": 0.7119512557983398, "num_tokens": 17412423.0, "step": 21640 }, { "epoch": 5.731461864406779, "grad_norm": 2.060988664627075, "learning_rate": 7.134401483050848e-06, "loss": 0.9671, "mean_token_accuracy": 0.7710795775055885, "num_tokens": 17413957.0, "step": 21642 }, { "epoch": 5.731991525423728, "grad_norm": 1.940285563468933, "learning_rate": 7.134136652542374e-06, "loss": 0.9775, "mean_token_accuracy": 0.758265309035778, "num_tokens": 17415586.0, "step": 21644 }, { "epoch": 5.732521186440678, "grad_norm": 2.006819486618042, "learning_rate": 7.133871822033898e-06, "loss": 1.2609, "mean_token_accuracy": 0.6924081966280937, "num_tokens": 17417805.0, "step": 21646 }, { "epoch": 5.733050847457627, "grad_norm": 1.980326533317566, "learning_rate": 7.133606991525425e-06, "loss": 1.2618, "mean_token_accuracy": 0.7103913128376007, "num_tokens": 17419305.0, "step": 21648 }, { "epoch": 5.733580508474576, "grad_norm": 1.9177451133728027, "learning_rate": 7.13334216101695e-06, "loss": 1.1164, "mean_token_accuracy": 0.7184702828526497, "num_tokens": 17421137.0, "step": 21650 }, { "epoch": 5.734110169491525, "grad_norm": 2.508650541305542, "learning_rate": 7.133077330508476e-06, "loss": 1.3457, "mean_token_accuracy": 0.6875207349658012, "num_tokens": 17422683.0, "step": 21652 }, { "epoch": 5.734639830508475, "grad_norm": 2.033560037612915, "learning_rate": 7.132812500000001e-06, "loss": 1.1831, "mean_token_accuracy": 0.7140503525733948, "num_tokens": 17423918.0, "step": 21654 }, { "epoch": 5.735169491525424, "grad_norm": 2.02603816986084, "learning_rate": 7.132547669491526e-06, "loss": 0.9509, "mean_token_accuracy": 0.761352077126503, "num_tokens": 17425713.0, "step": 21656 }, { "epoch": 5.735699152542373, "grad_norm": 2.6550347805023193, "learning_rate": 7.132282838983051e-06, "loss": 1.2215, "mean_token_accuracy": 0.706812359392643, "num_tokens": 17427200.0, "step": 21658 }, { "epoch": 5.736228813559322, "grad_norm": 2.248965263366699, "learning_rate": 7.132018008474577e-06, "loss": 1.314, "mean_token_accuracy": 0.6931798309087753, "num_tokens": 17428785.0, "step": 21660 }, { "epoch": 5.736758474576272, "grad_norm": 1.8825942277908325, "learning_rate": 7.131753177966102e-06, "loss": 0.8991, "mean_token_accuracy": 0.7828626409173012, "num_tokens": 17430458.0, "step": 21662 }, { "epoch": 5.737288135593221, "grad_norm": 2.308220148086548, "learning_rate": 7.131488347457628e-06, "loss": 1.5546, "mean_token_accuracy": 0.6477776244282722, "num_tokens": 17432008.0, "step": 21664 }, { "epoch": 5.7378177966101696, "grad_norm": 2.6553070545196533, "learning_rate": 7.131223516949153e-06, "loss": 1.1636, "mean_token_accuracy": 0.732041671872139, "num_tokens": 17433538.0, "step": 21666 }, { "epoch": 5.7383474576271185, "grad_norm": 1.7669566869735718, "learning_rate": 7.1309586864406785e-06, "loss": 0.7376, "mean_token_accuracy": 0.7983816266059875, "num_tokens": 17435039.0, "step": 21668 }, { "epoch": 5.7388771186440675, "grad_norm": 2.0582172870635986, "learning_rate": 7.130693855932203e-06, "loss": 1.1844, "mean_token_accuracy": 0.7247232422232628, "num_tokens": 17436580.0, "step": 21670 }, { "epoch": 5.739406779661017, "grad_norm": 2.554370641708374, "learning_rate": 7.13042902542373e-06, "loss": 1.4417, "mean_token_accuracy": 0.6677763611078262, "num_tokens": 17438007.0, "step": 21672 }, { "epoch": 5.739936440677966, "grad_norm": 2.0650904178619385, "learning_rate": 7.130164194915254e-06, "loss": 1.688, "mean_token_accuracy": 0.6271013170480728, "num_tokens": 17439925.0, "step": 21674 }, { "epoch": 5.740466101694915, "grad_norm": 1.7756729125976562, "learning_rate": 7.129899364406781e-06, "loss": 1.2689, "mean_token_accuracy": 0.7012828513979912, "num_tokens": 17441610.0, "step": 21676 }, { "epoch": 5.740995762711864, "grad_norm": 2.543031692504883, "learning_rate": 7.129634533898306e-06, "loss": 1.2946, "mean_token_accuracy": 0.7442019432783127, "num_tokens": 17443156.0, "step": 21678 }, { "epoch": 5.741525423728813, "grad_norm": 1.9595000743865967, "learning_rate": 7.1293697033898314e-06, "loss": 1.3133, "mean_token_accuracy": 0.7015460878610611, "num_tokens": 17444762.0, "step": 21680 }, { "epoch": 5.742055084745763, "grad_norm": 2.2502427101135254, "learning_rate": 7.129104872881356e-06, "loss": 1.0665, "mean_token_accuracy": 0.7404900789260864, "num_tokens": 17446462.0, "step": 21682 }, { "epoch": 5.742584745762712, "grad_norm": 1.9884326457977295, "learning_rate": 7.128840042372882e-06, "loss": 1.319, "mean_token_accuracy": 0.6988227888941765, "num_tokens": 17448449.0, "step": 21684 }, { "epoch": 5.743114406779661, "grad_norm": 2.584742546081543, "learning_rate": 7.128575211864407e-06, "loss": 1.2322, "mean_token_accuracy": 0.7004961147904396, "num_tokens": 17449989.0, "step": 21686 }, { "epoch": 5.74364406779661, "grad_norm": 2.3720974922180176, "learning_rate": 7.128310381355933e-06, "loss": 1.1907, "mean_token_accuracy": 0.7349681183695793, "num_tokens": 17451415.0, "step": 21688 }, { "epoch": 5.74417372881356, "grad_norm": 1.8661690950393677, "learning_rate": 7.128045550847458e-06, "loss": 1.2519, "mean_token_accuracy": 0.6893363520503044, "num_tokens": 17453365.0, "step": 21690 }, { "epoch": 5.744703389830509, "grad_norm": 2.0820298194885254, "learning_rate": 7.1277807203389836e-06, "loss": 1.0079, "mean_token_accuracy": 0.7383783534169197, "num_tokens": 17455354.0, "step": 21692 }, { "epoch": 5.745233050847458, "grad_norm": 2.280097246170044, "learning_rate": 7.1275158898305085e-06, "loss": 1.5176, "mean_token_accuracy": 0.659190721809864, "num_tokens": 17456981.0, "step": 21694 }, { "epoch": 5.745762711864407, "grad_norm": 2.4447200298309326, "learning_rate": 7.127251059322034e-06, "loss": 1.4309, "mean_token_accuracy": 0.6958178952336311, "num_tokens": 17458351.0, "step": 21696 }, { "epoch": 5.746292372881356, "grad_norm": 2.150064468383789, "learning_rate": 7.126986228813559e-06, "loss": 1.2999, "mean_token_accuracy": 0.7091266475617886, "num_tokens": 17459969.0, "step": 21698 }, { "epoch": 5.746822033898305, "grad_norm": 2.090357542037964, "learning_rate": 7.126721398305085e-06, "loss": 1.368, "mean_token_accuracy": 0.7189296633005142, "num_tokens": 17461602.0, "step": 21700 }, { "epoch": 5.747351694915254, "grad_norm": 2.001444101333618, "learning_rate": 7.12645656779661e-06, "loss": 1.1931, "mean_token_accuracy": 0.7273383364081383, "num_tokens": 17463093.0, "step": 21702 }, { "epoch": 5.747881355932203, "grad_norm": 3.1093766689300537, "learning_rate": 7.1261917372881365e-06, "loss": 1.3643, "mean_token_accuracy": 0.7083048075437546, "num_tokens": 17464459.0, "step": 21704 }, { "epoch": 5.748411016949152, "grad_norm": 1.8724485635757446, "learning_rate": 7.125926906779662e-06, "loss": 1.1858, "mean_token_accuracy": 0.7304099425673485, "num_tokens": 17466024.0, "step": 21706 }, { "epoch": 5.748940677966102, "grad_norm": 2.094461441040039, "learning_rate": 7.125662076271187e-06, "loss": 1.0335, "mean_token_accuracy": 0.7706894353032112, "num_tokens": 17467864.0, "step": 21708 }, { "epoch": 5.749470338983051, "grad_norm": 2.74472713470459, "learning_rate": 7.125397245762713e-06, "loss": 1.3561, "mean_token_accuracy": 0.68838319927454, "num_tokens": 17469222.0, "step": 21710 }, { "epoch": 5.75, "grad_norm": 1.9061806201934814, "learning_rate": 7.125132415254238e-06, "loss": 0.922, "mean_token_accuracy": 0.7651649340987206, "num_tokens": 17470758.0, "step": 21712 }, { "epoch": 5.750529661016949, "grad_norm": 2.3247883319854736, "learning_rate": 7.124867584745764e-06, "loss": 1.1624, "mean_token_accuracy": 0.730184018611908, "num_tokens": 17472169.0, "step": 21714 }, { "epoch": 5.751059322033898, "grad_norm": 2.6837191581726074, "learning_rate": 7.124602754237289e-06, "loss": 1.3079, "mean_token_accuracy": 0.7205311581492424, "num_tokens": 17473582.0, "step": 21716 }, { "epoch": 5.751588983050848, "grad_norm": 1.8775964975357056, "learning_rate": 7.124337923728814e-06, "loss": 1.017, "mean_token_accuracy": 0.7685640454292297, "num_tokens": 17475314.0, "step": 21718 }, { "epoch": 5.752118644067797, "grad_norm": 2.076909303665161, "learning_rate": 7.124073093220339e-06, "loss": 1.0711, "mean_token_accuracy": 0.772388719022274, "num_tokens": 17476887.0, "step": 21720 }, { "epoch": 5.752648305084746, "grad_norm": 2.0361640453338623, "learning_rate": 7.123808262711865e-06, "loss": 1.1392, "mean_token_accuracy": 0.7438500672578812, "num_tokens": 17478321.0, "step": 21722 }, { "epoch": 5.753177966101695, "grad_norm": 2.0385377407073975, "learning_rate": 7.12354343220339e-06, "loss": 1.2372, "mean_token_accuracy": 0.7404960319399834, "num_tokens": 17479821.0, "step": 21724 }, { "epoch": 5.753707627118644, "grad_norm": 1.791603922843933, "learning_rate": 7.123278601694917e-06, "loss": 1.0677, "mean_token_accuracy": 0.7570662871003151, "num_tokens": 17481446.0, "step": 21726 }, { "epoch": 5.754237288135593, "grad_norm": 1.9035829305648804, "learning_rate": 7.123013771186441e-06, "loss": 0.7829, "mean_token_accuracy": 0.8171754330396652, "num_tokens": 17483180.0, "step": 21728 }, { "epoch": 5.754766949152542, "grad_norm": 2.0060689449310303, "learning_rate": 7.122748940677967e-06, "loss": 0.869, "mean_token_accuracy": 0.7798519432544708, "num_tokens": 17484919.0, "step": 21730 }, { "epoch": 5.755296610169491, "grad_norm": 1.9550658464431763, "learning_rate": 7.122484110169492e-06, "loss": 0.7578, "mean_token_accuracy": 0.8035274147987366, "num_tokens": 17486760.0, "step": 21732 }, { "epoch": 5.75582627118644, "grad_norm": 2.6463820934295654, "learning_rate": 7.122219279661018e-06, "loss": 1.2664, "mean_token_accuracy": 0.6888398826122284, "num_tokens": 17488412.0, "step": 21734 }, { "epoch": 5.75635593220339, "grad_norm": 2.302712917327881, "learning_rate": 7.121954449152543e-06, "loss": 1.4287, "mean_token_accuracy": 0.6889233961701393, "num_tokens": 17490105.0, "step": 21736 }, { "epoch": 5.756885593220339, "grad_norm": 2.072153091430664, "learning_rate": 7.121689618644069e-06, "loss": 1.3034, "mean_token_accuracy": 0.6953986957669258, "num_tokens": 17491810.0, "step": 21738 }, { "epoch": 5.757415254237288, "grad_norm": 2.246605396270752, "learning_rate": 7.121424788135594e-06, "loss": 1.0734, "mean_token_accuracy": 0.7542488127946854, "num_tokens": 17493477.0, "step": 21740 }, { "epoch": 5.757944915254237, "grad_norm": 2.6361749172210693, "learning_rate": 7.1211599576271195e-06, "loss": 1.4898, "mean_token_accuracy": 0.6619028672575951, "num_tokens": 17495021.0, "step": 21742 }, { "epoch": 5.758474576271187, "grad_norm": 2.558062791824341, "learning_rate": 7.120895127118644e-06, "loss": 1.1961, "mean_token_accuracy": 0.7290122509002686, "num_tokens": 17496391.0, "step": 21744 }, { "epoch": 5.759004237288136, "grad_norm": 2.197307586669922, "learning_rate": 7.12063029661017e-06, "loss": 0.9151, "mean_token_accuracy": 0.7807005643844604, "num_tokens": 17497924.0, "step": 21746 }, { "epoch": 5.759533898305085, "grad_norm": 1.933310627937317, "learning_rate": 7.120365466101695e-06, "loss": 0.9294, "mean_token_accuracy": 0.7917132303118706, "num_tokens": 17499515.0, "step": 21748 }, { "epoch": 5.760063559322034, "grad_norm": 2.765212059020996, "learning_rate": 7.120100635593221e-06, "loss": 1.2196, "step": 21750 }, { "epoch": 5.760063559322034, "eval_loss": 1.3182035684585571, "eval_mean_token_accuracy": 0.7011401063048994, "eval_num_tokens": 17501281.0, "eval_runtime": 48.1025, "eval_samples_per_second": 6.403, "eval_steps_per_second": 6.403, "step": 21750 }, { "epoch": 5.760593220338983, "grad_norm": 2.3194384574890137, "learning_rate": 7.119835805084746e-06, "loss": 1.35, "mean_token_accuracy": 0.6957247853279114, "num_tokens": 17502812.0, "step": 21752 }, { "epoch": 5.7611228813559325, "grad_norm": 2.199997901916504, "learning_rate": 7.119570974576272e-06, "loss": 1.1088, "mean_token_accuracy": 0.7342582195997238, "num_tokens": 17504454.0, "step": 21754 }, { "epoch": 5.7616525423728815, "grad_norm": 2.6165354251861572, "learning_rate": 7.1193061440677965e-06, "loss": 1.3956, "mean_token_accuracy": 0.7207192704081535, "num_tokens": 17505974.0, "step": 21756 }, { "epoch": 5.7621822033898304, "grad_norm": 2.2673542499542236, "learning_rate": 7.119041313559323e-06, "loss": 1.5182, "mean_token_accuracy": 0.6575288698077202, "num_tokens": 17507446.0, "step": 21758 }, { "epoch": 5.762711864406779, "grad_norm": 2.2401669025421143, "learning_rate": 7.118776483050848e-06, "loss": 1.3135, "mean_token_accuracy": 0.709271989762783, "num_tokens": 17508956.0, "step": 21760 }, { "epoch": 5.763241525423728, "grad_norm": 1.8419350385665894, "learning_rate": 7.118511652542374e-06, "loss": 0.9635, "mean_token_accuracy": 0.7702571079134941, "num_tokens": 17510659.0, "step": 21762 }, { "epoch": 5.763771186440678, "grad_norm": 2.3187150955200195, "learning_rate": 7.118246822033899e-06, "loss": 1.3866, "mean_token_accuracy": 0.695888102054596, "num_tokens": 17512061.0, "step": 21764 }, { "epoch": 5.764300847457627, "grad_norm": 2.624553680419922, "learning_rate": 7.1179819915254246e-06, "loss": 1.4465, "mean_token_accuracy": 0.6923815086483955, "num_tokens": 17513688.0, "step": 21766 }, { "epoch": 5.764830508474576, "grad_norm": 2.012523889541626, "learning_rate": 7.1177171610169495e-06, "loss": 1.092, "mean_token_accuracy": 0.7587480917572975, "num_tokens": 17515325.0, "step": 21768 }, { "epoch": 5.765360169491525, "grad_norm": 2.007336378097534, "learning_rate": 7.117452330508475e-06, "loss": 1.274, "mean_token_accuracy": 0.7209176197648048, "num_tokens": 17517098.0, "step": 21770 }, { "epoch": 5.765889830508475, "grad_norm": 2.2974791526794434, "learning_rate": 7.1171875e-06, "loss": 1.5236, "mean_token_accuracy": 0.6544665023684502, "num_tokens": 17518653.0, "step": 21772 }, { "epoch": 5.766419491525424, "grad_norm": 2.63608717918396, "learning_rate": 7.116922669491526e-06, "loss": 1.2343, "mean_token_accuracy": 0.7159754410386086, "num_tokens": 17520121.0, "step": 21774 }, { "epoch": 5.766949152542373, "grad_norm": 1.9725786447525024, "learning_rate": 7.116657838983051e-06, "loss": 1.1293, "mean_token_accuracy": 0.7406694814562798, "num_tokens": 17521836.0, "step": 21776 }, { "epoch": 5.767478813559322, "grad_norm": 1.6406323909759521, "learning_rate": 7.116393008474577e-06, "loss": 0.8217, "mean_token_accuracy": 0.7869309931993484, "num_tokens": 17523464.0, "step": 21778 }, { "epoch": 5.768008474576272, "grad_norm": 2.0776901245117188, "learning_rate": 7.116128177966102e-06, "loss": 1.5507, "mean_token_accuracy": 0.6630209535360336, "num_tokens": 17525262.0, "step": 21780 }, { "epoch": 5.768538135593221, "grad_norm": 2.228503465652466, "learning_rate": 7.115863347457627e-06, "loss": 1.0718, "mean_token_accuracy": 0.7954991459846497, "num_tokens": 17526807.0, "step": 21782 }, { "epoch": 5.7690677966101696, "grad_norm": 2.7320690155029297, "learning_rate": 7.115598516949152e-06, "loss": 0.9175, "mean_token_accuracy": 0.7510083168745041, "num_tokens": 17528143.0, "step": 21784 }, { "epoch": 5.7695974576271185, "grad_norm": 1.8789870738983154, "learning_rate": 7.115333686440679e-06, "loss": 1.279, "mean_token_accuracy": 0.701214924454689, "num_tokens": 17529980.0, "step": 21786 }, { "epoch": 5.7701271186440675, "grad_norm": 2.327745199203491, "learning_rate": 7.115068855932203e-06, "loss": 1.1082, "mean_token_accuracy": 0.7361968830227852, "num_tokens": 17531564.0, "step": 21788 }, { "epoch": 5.770656779661017, "grad_norm": 2.304506301879883, "learning_rate": 7.11480402542373e-06, "loss": 0.9787, "mean_token_accuracy": 0.7629576772451401, "num_tokens": 17533237.0, "step": 21790 }, { "epoch": 5.771186440677966, "grad_norm": 2.3801558017730713, "learning_rate": 7.114539194915255e-06, "loss": 1.2867, "mean_token_accuracy": 0.7078452557325363, "num_tokens": 17535153.0, "step": 21792 }, { "epoch": 5.771716101694915, "grad_norm": 2.351346731185913, "learning_rate": 7.11427436440678e-06, "loss": 1.204, "mean_token_accuracy": 0.719475045800209, "num_tokens": 17536504.0, "step": 21794 }, { "epoch": 5.772245762711864, "grad_norm": 2.1355831623077393, "learning_rate": 7.114009533898306e-06, "loss": 1.1901, "mean_token_accuracy": 0.7308383584022522, "num_tokens": 17538179.0, "step": 21796 }, { "epoch": 5.772775423728813, "grad_norm": 2.084911584854126, "learning_rate": 7.113744703389831e-06, "loss": 1.0705, "mean_token_accuracy": 0.7305933497846127, "num_tokens": 17539648.0, "step": 21798 }, { "epoch": 5.773305084745763, "grad_norm": 2.196361541748047, "learning_rate": 7.113479872881357e-06, "loss": 1.6376, "mean_token_accuracy": 0.6484345048666, "num_tokens": 17541271.0, "step": 21800 }, { "epoch": 5.773834745762712, "grad_norm": 1.6221131086349487, "learning_rate": 7.113215042372882e-06, "loss": 0.8922, "mean_token_accuracy": 0.7660579830408096, "num_tokens": 17543247.0, "step": 21802 }, { "epoch": 5.774364406779661, "grad_norm": 2.214684009552002, "learning_rate": 7.1129502118644075e-06, "loss": 1.0817, "mean_token_accuracy": 0.7399437427520752, "num_tokens": 17544954.0, "step": 21804 }, { "epoch": 5.77489406779661, "grad_norm": 2.670043468475342, "learning_rate": 7.1126853813559325e-06, "loss": 1.7887, "mean_token_accuracy": 0.6206627488136292, "num_tokens": 17546441.0, "step": 21806 }, { "epoch": 5.77542372881356, "grad_norm": 1.3835870027542114, "learning_rate": 7.112420550847458e-06, "loss": 0.9551, "mean_token_accuracy": 0.751958541572094, "num_tokens": 17549152.0, "step": 21808 }, { "epoch": 5.775953389830509, "grad_norm": 2.5649545192718506, "learning_rate": 7.112155720338983e-06, "loss": 1.3591, "mean_token_accuracy": 0.7053180560469627, "num_tokens": 17550560.0, "step": 21810 }, { "epoch": 5.776483050847458, "grad_norm": 1.5942554473876953, "learning_rate": 7.11189088983051e-06, "loss": 1.4983, "mean_token_accuracy": 0.6386836618185043, "num_tokens": 17553005.0, "step": 21812 }, { "epoch": 5.777012711864407, "grad_norm": 1.9915136098861694, "learning_rate": 7.111626059322035e-06, "loss": 1.0255, "mean_token_accuracy": 0.7410621345043182, "num_tokens": 17554538.0, "step": 21814 }, { "epoch": 5.777542372881356, "grad_norm": 1.307695746421814, "learning_rate": 7.1113612288135605e-06, "loss": 0.9479, "mean_token_accuracy": 0.7268185764551163, "num_tokens": 17556938.0, "step": 21816 }, { "epoch": 5.778072033898305, "grad_norm": 1.789749026298523, "learning_rate": 7.1110963983050854e-06, "loss": 0.8162, "mean_token_accuracy": 0.7961252480745316, "num_tokens": 17558479.0, "step": 21818 }, { "epoch": 5.778601694915254, "grad_norm": 2.1856186389923096, "learning_rate": 7.110831567796611e-06, "loss": 1.7022, "mean_token_accuracy": 0.6320055201649666, "num_tokens": 17560175.0, "step": 21820 }, { "epoch": 5.779131355932203, "grad_norm": 2.0678892135620117, "learning_rate": 7.110566737288136e-06, "loss": 1.3773, "mean_token_accuracy": 0.6781269833445549, "num_tokens": 17561951.0, "step": 21822 }, { "epoch": 5.779661016949152, "grad_norm": 2.0958549976348877, "learning_rate": 7.110301906779662e-06, "loss": 0.8782, "mean_token_accuracy": 0.7786390334367752, "num_tokens": 17563433.0, "step": 21824 }, { "epoch": 5.780190677966102, "grad_norm": 2.220954656600952, "learning_rate": 7.110037076271187e-06, "loss": 1.2798, "mean_token_accuracy": 0.6904640719294548, "num_tokens": 17564964.0, "step": 21826 }, { "epoch": 5.780720338983051, "grad_norm": 2.564499855041504, "learning_rate": 7.109772245762713e-06, "loss": 1.0655, "mean_token_accuracy": 0.745012454688549, "num_tokens": 17566610.0, "step": 21828 }, { "epoch": 5.78125, "grad_norm": 1.9768805503845215, "learning_rate": 7.1095074152542375e-06, "loss": 1.5637, "mean_token_accuracy": 0.6572216376662254, "num_tokens": 17568415.0, "step": 21830 }, { "epoch": 5.781779661016949, "grad_norm": 1.9973304271697998, "learning_rate": 7.109242584745763e-06, "loss": 1.0675, "mean_token_accuracy": 0.761479377746582, "num_tokens": 17569730.0, "step": 21832 }, { "epoch": 5.782309322033898, "grad_norm": 1.7130396366119385, "learning_rate": 7.108977754237288e-06, "loss": 0.8147, "mean_token_accuracy": 0.8008832409977913, "num_tokens": 17571334.0, "step": 21834 }, { "epoch": 5.782838983050848, "grad_norm": 2.3253121376037598, "learning_rate": 7.108712923728814e-06, "loss": 1.0663, "mean_token_accuracy": 0.7542128413915634, "num_tokens": 17572659.0, "step": 21836 }, { "epoch": 5.783368644067797, "grad_norm": 2.327012300491333, "learning_rate": 7.108448093220339e-06, "loss": 1.4073, "mean_token_accuracy": 0.6928679496049881, "num_tokens": 17574298.0, "step": 21838 }, { "epoch": 5.783898305084746, "grad_norm": 2.2362160682678223, "learning_rate": 7.108183262711866e-06, "loss": 1.2697, "mean_token_accuracy": 0.7234502881765366, "num_tokens": 17575742.0, "step": 21840 }, { "epoch": 5.784427966101695, "grad_norm": 1.7544958591461182, "learning_rate": 7.10791843220339e-06, "loss": 1.0822, "mean_token_accuracy": 0.752443253993988, "num_tokens": 17577501.0, "step": 21842 }, { "epoch": 5.784957627118644, "grad_norm": 1.845201015472412, "learning_rate": 7.107653601694916e-06, "loss": 1.1617, "mean_token_accuracy": 0.7060152292251587, "num_tokens": 17579192.0, "step": 21844 }, { "epoch": 5.785487288135593, "grad_norm": 2.1191437244415283, "learning_rate": 7.107388771186441e-06, "loss": 1.1304, "mean_token_accuracy": 0.7235670313239098, "num_tokens": 17580946.0, "step": 21846 }, { "epoch": 5.786016949152542, "grad_norm": 2.2240397930145264, "learning_rate": 7.107123940677967e-06, "loss": 1.3223, "mean_token_accuracy": 0.7065022438764572, "num_tokens": 17582399.0, "step": 21848 }, { "epoch": 5.786546610169491, "grad_norm": 2.311854839324951, "learning_rate": 7.106859110169492e-06, "loss": 1.1611, "mean_token_accuracy": 0.7386291325092316, "num_tokens": 17583774.0, "step": 21850 }, { "epoch": 5.78707627118644, "grad_norm": 2.4285871982574463, "learning_rate": 7.106594279661018e-06, "loss": 1.6555, "mean_token_accuracy": 0.6589418277144432, "num_tokens": 17585212.0, "step": 21852 }, { "epoch": 5.78760593220339, "grad_norm": 2.053372859954834, "learning_rate": 7.106329449152543e-06, "loss": 1.4837, "mean_token_accuracy": 0.645978644490242, "num_tokens": 17587012.0, "step": 21854 }, { "epoch": 5.788135593220339, "grad_norm": 1.998624324798584, "learning_rate": 7.106064618644068e-06, "loss": 1.0375, "mean_token_accuracy": 0.757461704313755, "num_tokens": 17588856.0, "step": 21856 }, { "epoch": 5.788665254237288, "grad_norm": 2.3384926319122314, "learning_rate": 7.105799788135593e-06, "loss": 1.3023, "mean_token_accuracy": 0.7035735696554184, "num_tokens": 17590459.0, "step": 21858 }, { "epoch": 5.789194915254237, "grad_norm": 2.1049790382385254, "learning_rate": 7.105534957627119e-06, "loss": 1.0621, "mean_token_accuracy": 0.7696642577648163, "num_tokens": 17592077.0, "step": 21860 }, { "epoch": 5.789724576271187, "grad_norm": 1.9667378664016724, "learning_rate": 7.105270127118644e-06, "loss": 1.1146, "mean_token_accuracy": 0.730992928147316, "num_tokens": 17593812.0, "step": 21862 }, { "epoch": 5.790254237288136, "grad_norm": 2.4935452938079834, "learning_rate": 7.10500529661017e-06, "loss": 1.5563, "mean_token_accuracy": 0.6505797579884529, "num_tokens": 17595363.0, "step": 21864 }, { "epoch": 5.790783898305085, "grad_norm": 2.21614146232605, "learning_rate": 7.104740466101695e-06, "loss": 1.0942, "mean_token_accuracy": 0.7085038274526596, "num_tokens": 17597232.0, "step": 21866 }, { "epoch": 5.791313559322034, "grad_norm": 2.1907577514648438, "learning_rate": 7.104475635593221e-06, "loss": 1.1191, "mean_token_accuracy": 0.7446587830781937, "num_tokens": 17598947.0, "step": 21868 }, { "epoch": 5.791843220338983, "grad_norm": 2.6073720455169678, "learning_rate": 7.1042108050847454e-06, "loss": 1.5051, "mean_token_accuracy": 0.6588810533285141, "num_tokens": 17600516.0, "step": 21870 }, { "epoch": 5.7923728813559325, "grad_norm": 1.988020896911621, "learning_rate": 7.103945974576272e-06, "loss": 1.5174, "mean_token_accuracy": 0.6634980961680412, "num_tokens": 17602319.0, "step": 21872 }, { "epoch": 5.7929025423728815, "grad_norm": 2.0726683139801025, "learning_rate": 7.103681144067798e-06, "loss": 1.1053, "mean_token_accuracy": 0.7473604530096054, "num_tokens": 17603804.0, "step": 21874 }, { "epoch": 5.7934322033898304, "grad_norm": 2.4306623935699463, "learning_rate": 7.103416313559323e-06, "loss": 1.6589, "mean_token_accuracy": 0.639797180891037, "num_tokens": 17605350.0, "step": 21876 }, { "epoch": 5.793961864406779, "grad_norm": 3.7076210975646973, "learning_rate": 7.1031514830508486e-06, "loss": 1.3624, "mean_token_accuracy": 0.7039368748664856, "num_tokens": 17606705.0, "step": 21878 }, { "epoch": 5.794491525423728, "grad_norm": 2.0348761081695557, "learning_rate": 7.1028866525423735e-06, "loss": 1.2549, "mean_token_accuracy": 0.7176050916314125, "num_tokens": 17608425.0, "step": 21880 }, { "epoch": 5.795021186440678, "grad_norm": 2.100257158279419, "learning_rate": 7.102621822033899e-06, "loss": 1.1184, "mean_token_accuracy": 0.7172665670514107, "num_tokens": 17610009.0, "step": 21882 }, { "epoch": 5.795550847457627, "grad_norm": 1.798669457435608, "learning_rate": 7.102356991525424e-06, "loss": 1.4272, "mean_token_accuracy": 0.6800955832004547, "num_tokens": 17611614.0, "step": 21884 }, { "epoch": 5.796080508474576, "grad_norm": 2.504183292388916, "learning_rate": 7.10209216101695e-06, "loss": 1.6296, "mean_token_accuracy": 0.6502601206302643, "num_tokens": 17613412.0, "step": 21886 }, { "epoch": 5.796610169491525, "grad_norm": 2.1430675983428955, "learning_rate": 7.101827330508475e-06, "loss": 1.2183, "mean_token_accuracy": 0.7098163738846779, "num_tokens": 17615105.0, "step": 21888 }, { "epoch": 5.797139830508475, "grad_norm": 2.428755044937134, "learning_rate": 7.101562500000001e-06, "loss": 0.9541, "mean_token_accuracy": 0.7626528069376945, "num_tokens": 17616618.0, "step": 21890 }, { "epoch": 5.797669491525424, "grad_norm": 2.728559732437134, "learning_rate": 7.101297669491526e-06, "loss": 1.415, "mean_token_accuracy": 0.6817098781466484, "num_tokens": 17617985.0, "step": 21892 }, { "epoch": 5.798199152542373, "grad_norm": 2.137388229370117, "learning_rate": 7.101032838983052e-06, "loss": 1.2049, "mean_token_accuracy": 0.7310536280274391, "num_tokens": 17619458.0, "step": 21894 }, { "epoch": 5.798728813559322, "grad_norm": 2.1093504428863525, "learning_rate": 7.100768008474576e-06, "loss": 1.0074, "mean_token_accuracy": 0.7357829436659813, "num_tokens": 17621971.0, "step": 21896 }, { "epoch": 5.799258474576272, "grad_norm": 2.0075159072875977, "learning_rate": 7.100503177966103e-06, "loss": 0.9366, "mean_token_accuracy": 0.7594641745090485, "num_tokens": 17623442.0, "step": 21898 }, { "epoch": 5.799788135593221, "grad_norm": 2.2646305561065674, "learning_rate": 7.100238347457628e-06, "loss": 1.7155, "mean_token_accuracy": 0.6125407442450523, "num_tokens": 17624979.0, "step": 21900 }, { "epoch": 5.8003177966101696, "grad_norm": 2.049988031387329, "learning_rate": 7.099973516949154e-06, "loss": 1.1701, "mean_token_accuracy": 0.7163429632782936, "num_tokens": 17626588.0, "step": 21902 }, { "epoch": 5.8008474576271185, "grad_norm": 2.725263833999634, "learning_rate": 7.0997086864406786e-06, "loss": 1.107, "mean_token_accuracy": 0.7363421842455864, "num_tokens": 17627948.0, "step": 21904 }, { "epoch": 5.8013771186440675, "grad_norm": 2.4023892879486084, "learning_rate": 7.099443855932204e-06, "loss": 1.1281, "mean_token_accuracy": 0.7244240716099739, "num_tokens": 17629718.0, "step": 21906 }, { "epoch": 5.801906779661017, "grad_norm": 2.0638859272003174, "learning_rate": 7.099179025423729e-06, "loss": 0.9858, "mean_token_accuracy": 0.7607281655073166, "num_tokens": 17631078.0, "step": 21908 }, { "epoch": 5.802436440677966, "grad_norm": 2.314530372619629, "learning_rate": 7.098914194915255e-06, "loss": 0.9057, "mean_token_accuracy": 0.7853401079773903, "num_tokens": 17632374.0, "step": 21910 }, { "epoch": 5.802966101694915, "grad_norm": 2.5435843467712402, "learning_rate": 7.09864936440678e-06, "loss": 1.2639, "mean_token_accuracy": 0.7008875608444214, "num_tokens": 17633888.0, "step": 21912 }, { "epoch": 5.803495762711864, "grad_norm": 2.266732931137085, "learning_rate": 7.098384533898306e-06, "loss": 1.3657, "mean_token_accuracy": 0.7045619636774063, "num_tokens": 17635316.0, "step": 21914 }, { "epoch": 5.804025423728813, "grad_norm": 2.2678301334381104, "learning_rate": 7.098119703389831e-06, "loss": 1.3003, "mean_token_accuracy": 0.6855091974139214, "num_tokens": 17637173.0, "step": 21916 }, { "epoch": 5.804555084745763, "grad_norm": 2.5233817100524902, "learning_rate": 7.0978548728813565e-06, "loss": 1.1556, "mean_token_accuracy": 0.7068153843283653, "num_tokens": 17638560.0, "step": 21918 }, { "epoch": 5.805084745762712, "grad_norm": 2.3540518283843994, "learning_rate": 7.097590042372881e-06, "loss": 1.4288, "mean_token_accuracy": 0.672747977077961, "num_tokens": 17640333.0, "step": 21920 }, { "epoch": 5.805614406779661, "grad_norm": 2.184859037399292, "learning_rate": 7.097325211864408e-06, "loss": 1.2679, "mean_token_accuracy": 0.7151491418480873, "num_tokens": 17641890.0, "step": 21922 }, { "epoch": 5.80614406779661, "grad_norm": 2.7050211429595947, "learning_rate": 7.097060381355932e-06, "loss": 1.3356, "mean_token_accuracy": 0.670462541282177, "num_tokens": 17643321.0, "step": 21924 }, { "epoch": 5.80667372881356, "grad_norm": 2.2439565658569336, "learning_rate": 7.096795550847459e-06, "loss": 1.0962, "mean_token_accuracy": 0.7154127880930901, "num_tokens": 17645103.0, "step": 21926 }, { "epoch": 5.807203389830509, "grad_norm": 1.9586151838302612, "learning_rate": 7.096530720338984e-06, "loss": 0.8055, "mean_token_accuracy": 0.7938668206334114, "num_tokens": 17646663.0, "step": 21928 }, { "epoch": 5.807733050847458, "grad_norm": 2.176875114440918, "learning_rate": 7.096265889830509e-06, "loss": 1.4664, "mean_token_accuracy": 0.6789558231830597, "num_tokens": 17648212.0, "step": 21930 }, { "epoch": 5.808262711864407, "grad_norm": 2.3377223014831543, "learning_rate": 7.096001059322034e-06, "loss": 1.6577, "mean_token_accuracy": 0.6702521443367004, "num_tokens": 17649695.0, "step": 21932 }, { "epoch": 5.808792372881356, "grad_norm": 2.226069927215576, "learning_rate": 7.09573622881356e-06, "loss": 1.2969, "mean_token_accuracy": 0.7039893642067909, "num_tokens": 17651086.0, "step": 21934 }, { "epoch": 5.809322033898305, "grad_norm": 2.278759479522705, "learning_rate": 7.095471398305085e-06, "loss": 1.4576, "mean_token_accuracy": 0.7154822796583176, "num_tokens": 17652278.0, "step": 21936 }, { "epoch": 5.809851694915254, "grad_norm": 1.6981662511825562, "learning_rate": 7.095206567796611e-06, "loss": 1.119, "mean_token_accuracy": 0.7262857332825661, "num_tokens": 17653907.0, "step": 21938 }, { "epoch": 5.810381355932203, "grad_norm": 2.3560352325439453, "learning_rate": 7.094941737288136e-06, "loss": 1.2747, "mean_token_accuracy": 0.7116638123989105, "num_tokens": 17655319.0, "step": 21940 }, { "epoch": 5.810911016949152, "grad_norm": 2.084872245788574, "learning_rate": 7.0946769067796615e-06, "loss": 1.352, "mean_token_accuracy": 0.7035700008273125, "num_tokens": 17657008.0, "step": 21942 }, { "epoch": 5.811440677966102, "grad_norm": 1.9290425777435303, "learning_rate": 7.0944120762711865e-06, "loss": 0.9943, "mean_token_accuracy": 0.7575401291251183, "num_tokens": 17658423.0, "step": 21944 }, { "epoch": 5.811970338983051, "grad_norm": 2.4526665210723877, "learning_rate": 7.094147245762712e-06, "loss": 1.4157, "mean_token_accuracy": 0.6808667629957199, "num_tokens": 17660014.0, "step": 21946 }, { "epoch": 5.8125, "grad_norm": 2.2577550411224365, "learning_rate": 7.093882415254237e-06, "loss": 0.9303, "mean_token_accuracy": 0.7804099097847939, "num_tokens": 17661471.0, "step": 21948 }, { "epoch": 5.813029661016949, "grad_norm": 2.1764025688171387, "learning_rate": 7.093617584745763e-06, "loss": 0.8428, "mean_token_accuracy": 0.8122303783893585, "num_tokens": 17663034.0, "step": 21950 }, { "epoch": 5.813559322033898, "grad_norm": 2.1582305431365967, "learning_rate": 7.093352754237288e-06, "loss": 1.0666, "mean_token_accuracy": 0.7507272362709045, "num_tokens": 17664570.0, "step": 21952 }, { "epoch": 5.814088983050848, "grad_norm": 2.126404047012329, "learning_rate": 7.0930879237288145e-06, "loss": 1.3455, "mean_token_accuracy": 0.6666376069188118, "num_tokens": 17666440.0, "step": 21954 }, { "epoch": 5.814618644067797, "grad_norm": 2.0483412742614746, "learning_rate": 7.0928230932203394e-06, "loss": 1.3875, "mean_token_accuracy": 0.6797159239649773, "num_tokens": 17668096.0, "step": 21956 }, { "epoch": 5.815148305084746, "grad_norm": 1.9680312871932983, "learning_rate": 7.092558262711865e-06, "loss": 1.4419, "mean_token_accuracy": 0.668420322239399, "num_tokens": 17669617.0, "step": 21958 }, { "epoch": 5.815677966101695, "grad_norm": 2.2549452781677246, "learning_rate": 7.092293432203391e-06, "loss": 1.1216, "mean_token_accuracy": 0.718218095600605, "num_tokens": 17671153.0, "step": 21960 }, { "epoch": 5.816207627118644, "grad_norm": 1.9445326328277588, "learning_rate": 7.092028601694916e-06, "loss": 1.1338, "mean_token_accuracy": 0.744344387203455, "num_tokens": 17672773.0, "step": 21962 }, { "epoch": 5.816737288135593, "grad_norm": 2.9118340015411377, "learning_rate": 7.091763771186442e-06, "loss": 1.0588, "mean_token_accuracy": 0.7544890344142914, "num_tokens": 17674252.0, "step": 21964 }, { "epoch": 5.817266949152542, "grad_norm": 1.745159387588501, "learning_rate": 7.091498940677967e-06, "loss": 0.8514, "mean_token_accuracy": 0.801487386226654, "num_tokens": 17675785.0, "step": 21966 }, { "epoch": 5.817796610169491, "grad_norm": 2.241421699523926, "learning_rate": 7.091234110169492e-06, "loss": 1.4696, "mean_token_accuracy": 0.6703076437115669, "num_tokens": 17677433.0, "step": 21968 }, { "epoch": 5.81832627118644, "grad_norm": 2.049582004547119, "learning_rate": 7.090969279661017e-06, "loss": 1.1335, "mean_token_accuracy": 0.7249434888362885, "num_tokens": 17679271.0, "step": 21970 }, { "epoch": 5.81885593220339, "grad_norm": 2.499628782272339, "learning_rate": 7.090704449152543e-06, "loss": 1.4598, "mean_token_accuracy": 0.6695363894104958, "num_tokens": 17680778.0, "step": 21972 }, { "epoch": 5.819385593220339, "grad_norm": 1.988544225692749, "learning_rate": 7.090439618644068e-06, "loss": 1.1692, "mean_token_accuracy": 0.7430477663874626, "num_tokens": 17682496.0, "step": 21974 }, { "epoch": 5.819915254237288, "grad_norm": 2.612914800643921, "learning_rate": 7.090174788135595e-06, "loss": 1.3498, "mean_token_accuracy": 0.6755268722772598, "num_tokens": 17683948.0, "step": 21976 }, { "epoch": 5.820444915254237, "grad_norm": 2.0769639015197754, "learning_rate": 7.089909957627119e-06, "loss": 1.1309, "mean_token_accuracy": 0.7294770702719688, "num_tokens": 17685469.0, "step": 21978 }, { "epoch": 5.820974576271187, "grad_norm": 2.645580291748047, "learning_rate": 7.089645127118645e-06, "loss": 1.5598, "mean_token_accuracy": 0.6290650144219398, "num_tokens": 17687087.0, "step": 21980 }, { "epoch": 5.821504237288136, "grad_norm": 2.3025004863739014, "learning_rate": 7.08938029661017e-06, "loss": 1.5579, "mean_token_accuracy": 0.64004135876894, "num_tokens": 17688795.0, "step": 21982 }, { "epoch": 5.822033898305085, "grad_norm": 2.761444568634033, "learning_rate": 7.089115466101696e-06, "loss": 1.5701, "mean_token_accuracy": 0.6461578421294689, "num_tokens": 17690855.0, "step": 21984 }, { "epoch": 5.822563559322034, "grad_norm": 2.2511019706726074, "learning_rate": 7.088850635593221e-06, "loss": 0.9165, "mean_token_accuracy": 0.7646623998880386, "num_tokens": 17692361.0, "step": 21986 }, { "epoch": 5.823093220338983, "grad_norm": 2.202867269515991, "learning_rate": 7.088585805084747e-06, "loss": 1.4344, "mean_token_accuracy": 0.6776824221014977, "num_tokens": 17694089.0, "step": 21988 }, { "epoch": 5.8236228813559325, "grad_norm": 2.24648118019104, "learning_rate": 7.088320974576272e-06, "loss": 1.1701, "mean_token_accuracy": 0.729619063436985, "num_tokens": 17695721.0, "step": 21990 }, { "epoch": 5.8241525423728815, "grad_norm": 2.2641823291778564, "learning_rate": 7.0880561440677975e-06, "loss": 1.1358, "mean_token_accuracy": 0.7211338877677917, "num_tokens": 17697413.0, "step": 21992 }, { "epoch": 5.8246822033898304, "grad_norm": 2.602764129638672, "learning_rate": 7.087791313559322e-06, "loss": 1.4049, "mean_token_accuracy": 0.686165101826191, "num_tokens": 17698917.0, "step": 21994 }, { "epoch": 5.825211864406779, "grad_norm": 1.9512882232666016, "learning_rate": 7.087526483050848e-06, "loss": 1.1397, "mean_token_accuracy": 0.7519897073507309, "num_tokens": 17700760.0, "step": 21996 }, { "epoch": 5.825741525423728, "grad_norm": 2.272179365158081, "learning_rate": 7.087261652542373e-06, "loss": 0.9684, "mean_token_accuracy": 0.7660148590803146, "num_tokens": 17702244.0, "step": 21998 }, { "epoch": 5.826271186440678, "grad_norm": 2.255469560623169, "learning_rate": 7.086996822033899e-06, "loss": 1.3322, "step": 22000 }, { "epoch": 5.826271186440678, "eval_loss": 1.315534234046936, "eval_mean_token_accuracy": 0.7003876446903526, "eval_num_tokens": 17703647.0, "eval_runtime": 48.2809, "eval_samples_per_second": 6.379, "eval_steps_per_second": 6.379, "step": 22000 }, { "epoch": 5.826800847457627, "grad_norm": 2.3336548805236816, "learning_rate": 7.086731991525424e-06, "loss": 1.0848, "mean_token_accuracy": 0.7206320501863956, "num_tokens": 17705209.0, "step": 22002 }, { "epoch": 5.827330508474576, "grad_norm": 2.155437707901001, "learning_rate": 7.08646716101695e-06, "loss": 1.2195, "mean_token_accuracy": 0.7185021489858627, "num_tokens": 17706834.0, "step": 22004 }, { "epoch": 5.827860169491525, "grad_norm": 1.84172785282135, "learning_rate": 7.0862023305084745e-06, "loss": 0.7419, "mean_token_accuracy": 0.8143380433320999, "num_tokens": 17708098.0, "step": 22006 }, { "epoch": 5.828389830508475, "grad_norm": 2.351830244064331, "learning_rate": 7.085937500000001e-06, "loss": 1.1303, "mean_token_accuracy": 0.7318505868315697, "num_tokens": 17709852.0, "step": 22008 }, { "epoch": 5.828919491525424, "grad_norm": 2.0319936275482178, "learning_rate": 7.085672669491526e-06, "loss": 1.101, "mean_token_accuracy": 0.7342542558908463, "num_tokens": 17711298.0, "step": 22010 }, { "epoch": 5.829449152542373, "grad_norm": 1.9410409927368164, "learning_rate": 7.085407838983052e-06, "loss": 0.8531, "mean_token_accuracy": 0.7867177650332451, "num_tokens": 17712853.0, "step": 22012 }, { "epoch": 5.829978813559322, "grad_norm": 2.5751047134399414, "learning_rate": 7.085143008474577e-06, "loss": 1.112, "mean_token_accuracy": 0.7545710280537605, "num_tokens": 17714114.0, "step": 22014 }, { "epoch": 5.830508474576272, "grad_norm": 1.9303710460662842, "learning_rate": 7.0848781779661025e-06, "loss": 1.1811, "mean_token_accuracy": 0.7001549750566483, "num_tokens": 17715547.0, "step": 22016 }, { "epoch": 5.831038135593221, "grad_norm": 2.3337390422821045, "learning_rate": 7.0846133474576275e-06, "loss": 1.5691, "mean_token_accuracy": 0.6621077209711075, "num_tokens": 17717292.0, "step": 22018 }, { "epoch": 5.8315677966101696, "grad_norm": 2.3451461791992188, "learning_rate": 7.084348516949153e-06, "loss": 1.0935, "mean_token_accuracy": 0.7302260994911194, "num_tokens": 17718899.0, "step": 22020 }, { "epoch": 5.8320974576271185, "grad_norm": 2.028207302093506, "learning_rate": 7.084083686440678e-06, "loss": 1.4198, "mean_token_accuracy": 0.6746340245008469, "num_tokens": 17720474.0, "step": 22022 }, { "epoch": 5.8326271186440675, "grad_norm": 2.577141046524048, "learning_rate": 7.083818855932204e-06, "loss": 1.027, "mean_token_accuracy": 0.7643333673477173, "num_tokens": 17722070.0, "step": 22024 }, { "epoch": 5.833156779661017, "grad_norm": 2.189757823944092, "learning_rate": 7.083554025423729e-06, "loss": 1.066, "mean_token_accuracy": 0.7438317686319351, "num_tokens": 17723490.0, "step": 22026 }, { "epoch": 5.833686440677966, "grad_norm": 2.44612455368042, "learning_rate": 7.083289194915255e-06, "loss": 1.2484, "mean_token_accuracy": 0.7036119252443314, "num_tokens": 17725183.0, "step": 22028 }, { "epoch": 5.834216101694915, "grad_norm": 2.2553164958953857, "learning_rate": 7.08302436440678e-06, "loss": 1.5257, "mean_token_accuracy": 0.6464320793747902, "num_tokens": 17726749.0, "step": 22030 }, { "epoch": 5.834745762711864, "grad_norm": 1.8721767663955688, "learning_rate": 7.082759533898305e-06, "loss": 0.9847, "mean_token_accuracy": 0.7610897943377495, "num_tokens": 17728340.0, "step": 22032 }, { "epoch": 5.835275423728813, "grad_norm": 2.696054697036743, "learning_rate": 7.08249470338983e-06, "loss": 1.5643, "mean_token_accuracy": 0.6399085745215416, "num_tokens": 17729909.0, "step": 22034 }, { "epoch": 5.835805084745763, "grad_norm": 2.610239267349243, "learning_rate": 7.082229872881357e-06, "loss": 1.3115, "mean_token_accuracy": 0.7205772697925568, "num_tokens": 17731216.0, "step": 22036 }, { "epoch": 5.836334745762712, "grad_norm": 2.2585558891296387, "learning_rate": 7.081965042372881e-06, "loss": 1.1942, "mean_token_accuracy": 0.7390605434775352, "num_tokens": 17732609.0, "step": 22038 }, { "epoch": 5.836864406779661, "grad_norm": 1.5893851518630981, "learning_rate": 7.081700211864408e-06, "loss": 1.117, "mean_token_accuracy": 0.7187114953994751, "num_tokens": 17734557.0, "step": 22040 }, { "epoch": 5.83739406779661, "grad_norm": 2.9673635959625244, "learning_rate": 7.081435381355933e-06, "loss": 1.8109, "mean_token_accuracy": 0.6099014207720757, "num_tokens": 17736149.0, "step": 22042 }, { "epoch": 5.83792372881356, "grad_norm": 1.7689446210861206, "learning_rate": 7.081170550847458e-06, "loss": 0.9412, "mean_token_accuracy": 0.7569501250982285, "num_tokens": 17737791.0, "step": 22044 }, { "epoch": 5.838453389830509, "grad_norm": 2.062290668487549, "learning_rate": 7.080905720338984e-06, "loss": 0.98, "mean_token_accuracy": 0.7739869803190231, "num_tokens": 17739360.0, "step": 22046 }, { "epoch": 5.838983050847458, "grad_norm": 2.3198506832122803, "learning_rate": 7.080640889830509e-06, "loss": 1.036, "mean_token_accuracy": 0.7778816893696785, "num_tokens": 17741069.0, "step": 22048 }, { "epoch": 5.839512711864407, "grad_norm": 2.2130889892578125, "learning_rate": 7.080376059322035e-06, "loss": 1.272, "mean_token_accuracy": 0.7005451917648315, "num_tokens": 17742620.0, "step": 22050 }, { "epoch": 5.840042372881356, "grad_norm": 2.42281436920166, "learning_rate": 7.08011122881356e-06, "loss": 1.3424, "mean_token_accuracy": 0.6875714957714081, "num_tokens": 17744219.0, "step": 22052 }, { "epoch": 5.840572033898305, "grad_norm": 2.3053762912750244, "learning_rate": 7.0798463983050855e-06, "loss": 1.2665, "mean_token_accuracy": 0.7228970900177956, "num_tokens": 17745499.0, "step": 22054 }, { "epoch": 5.841101694915254, "grad_norm": 2.4088847637176514, "learning_rate": 7.0795815677966104e-06, "loss": 1.2975, "mean_token_accuracy": 0.710302397608757, "num_tokens": 17747259.0, "step": 22056 }, { "epoch": 5.841631355932203, "grad_norm": 2.5644476413726807, "learning_rate": 7.079316737288136e-06, "loss": 1.4137, "mean_token_accuracy": 0.7141854241490364, "num_tokens": 17748692.0, "step": 22058 }, { "epoch": 5.842161016949152, "grad_norm": 1.918753981590271, "learning_rate": 7.079051906779661e-06, "loss": 1.0876, "mean_token_accuracy": 0.7316122725605965, "num_tokens": 17750504.0, "step": 22060 }, { "epoch": 5.842690677966102, "grad_norm": 2.0080859661102295, "learning_rate": 7.078787076271188e-06, "loss": 1.3974, "mean_token_accuracy": 0.7166708037257195, "num_tokens": 17752129.0, "step": 22062 }, { "epoch": 5.843220338983051, "grad_norm": 2.120300531387329, "learning_rate": 7.078522245762713e-06, "loss": 0.7736, "mean_token_accuracy": 0.8170609697699547, "num_tokens": 17753604.0, "step": 22064 }, { "epoch": 5.84375, "grad_norm": 2.0453197956085205, "learning_rate": 7.0782574152542385e-06, "loss": 0.6678, "mean_token_accuracy": 0.8065493255853653, "num_tokens": 17755199.0, "step": 22066 }, { "epoch": 5.844279661016949, "grad_norm": 2.567178964614868, "learning_rate": 7.077992584745763e-06, "loss": 1.2577, "mean_token_accuracy": 0.7111853882670403, "num_tokens": 17756599.0, "step": 22068 }, { "epoch": 5.844809322033898, "grad_norm": 2.2738966941833496, "learning_rate": 7.077727754237289e-06, "loss": 1.0469, "mean_token_accuracy": 0.7682838663458824, "num_tokens": 17758039.0, "step": 22070 }, { "epoch": 5.845338983050848, "grad_norm": 2.3679049015045166, "learning_rate": 7.077462923728814e-06, "loss": 1.2953, "mean_token_accuracy": 0.7211277633905411, "num_tokens": 17759468.0, "step": 22072 }, { "epoch": 5.845868644067797, "grad_norm": 2.6026461124420166, "learning_rate": 7.07719809322034e-06, "loss": 1.1327, "mean_token_accuracy": 0.730167455971241, "num_tokens": 17760950.0, "step": 22074 }, { "epoch": 5.846398305084746, "grad_norm": 2.8578860759735107, "learning_rate": 7.076933262711865e-06, "loss": 1.397, "mean_token_accuracy": 0.6995842605829239, "num_tokens": 17762501.0, "step": 22076 }, { "epoch": 5.846927966101695, "grad_norm": 2.2156286239624023, "learning_rate": 7.076668432203391e-06, "loss": 1.3196, "mean_token_accuracy": 0.7116098925471306, "num_tokens": 17764053.0, "step": 22078 }, { "epoch": 5.847457627118644, "grad_norm": 2.225804328918457, "learning_rate": 7.0764036016949155e-06, "loss": 1.0223, "mean_token_accuracy": 0.7518125921487808, "num_tokens": 17765754.0, "step": 22080 }, { "epoch": 5.847987288135593, "grad_norm": 2.2595579624176025, "learning_rate": 7.076138771186441e-06, "loss": 1.2507, "mean_token_accuracy": 0.7027270123362541, "num_tokens": 17767094.0, "step": 22082 }, { "epoch": 5.848516949152542, "grad_norm": 2.426105260848999, "learning_rate": 7.075873940677966e-06, "loss": 1.2189, "mean_token_accuracy": 0.7012043446302414, "num_tokens": 17768676.0, "step": 22084 }, { "epoch": 5.849046610169491, "grad_norm": 2.3391714096069336, "learning_rate": 7.075609110169492e-06, "loss": 1.2057, "mean_token_accuracy": 0.7330500110983849, "num_tokens": 17770075.0, "step": 22086 }, { "epoch": 5.84957627118644, "grad_norm": 2.038166046142578, "learning_rate": 7.075344279661017e-06, "loss": 1.2011, "mean_token_accuracy": 0.7261856570839882, "num_tokens": 17771723.0, "step": 22088 }, { "epoch": 5.85010593220339, "grad_norm": 2.1579582691192627, "learning_rate": 7.0750794491525436e-06, "loss": 1.0819, "mean_token_accuracy": 0.7409072741866112, "num_tokens": 17773323.0, "step": 22090 }, { "epoch": 5.850635593220339, "grad_norm": 2.177241086959839, "learning_rate": 7.074814618644068e-06, "loss": 1.2768, "mean_token_accuracy": 0.6970169395208359, "num_tokens": 17774882.0, "step": 22092 }, { "epoch": 5.851165254237288, "grad_norm": 2.2459187507629395, "learning_rate": 7.074549788135594e-06, "loss": 1.3862, "mean_token_accuracy": 0.680990532040596, "num_tokens": 17776721.0, "step": 22094 }, { "epoch": 5.851694915254237, "grad_norm": 2.4023327827453613, "learning_rate": 7.074284957627119e-06, "loss": 1.1551, "mean_token_accuracy": 0.7328347265720367, "num_tokens": 17778284.0, "step": 22096 }, { "epoch": 5.852224576271187, "grad_norm": 2.4792068004608154, "learning_rate": 7.074020127118645e-06, "loss": 1.2919, "mean_token_accuracy": 0.6959890201687813, "num_tokens": 17779846.0, "step": 22098 }, { "epoch": 5.852754237288136, "grad_norm": 2.25628399848938, "learning_rate": 7.07375529661017e-06, "loss": 1.4641, "mean_token_accuracy": 0.6799298748373985, "num_tokens": 17781481.0, "step": 22100 }, { "epoch": 5.853283898305085, "grad_norm": 2.570159435272217, "learning_rate": 7.073490466101696e-06, "loss": 1.0512, "mean_token_accuracy": 0.7618798241019249, "num_tokens": 17782865.0, "step": 22102 }, { "epoch": 5.853813559322034, "grad_norm": 2.2552404403686523, "learning_rate": 7.073225635593221e-06, "loss": 0.7393, "mean_token_accuracy": 0.7959287017583847, "num_tokens": 17784461.0, "step": 22104 }, { "epoch": 5.854343220338983, "grad_norm": 2.3840725421905518, "learning_rate": 7.072960805084746e-06, "loss": 1.4644, "mean_token_accuracy": 0.6709438934922218, "num_tokens": 17786456.0, "step": 22106 }, { "epoch": 5.8548728813559325, "grad_norm": 2.128906488418579, "learning_rate": 7.072695974576271e-06, "loss": 1.0137, "mean_token_accuracy": 0.7526726573705673, "num_tokens": 17788231.0, "step": 22108 }, { "epoch": 5.8554025423728815, "grad_norm": 1.9037147760391235, "learning_rate": 7.072431144067797e-06, "loss": 1.1422, "mean_token_accuracy": 0.7283741980791092, "num_tokens": 17789811.0, "step": 22110 }, { "epoch": 5.8559322033898304, "grad_norm": 2.6546545028686523, "learning_rate": 7.072166313559322e-06, "loss": 1.4877, "mean_token_accuracy": 0.6794127970933914, "num_tokens": 17791183.0, "step": 22112 }, { "epoch": 5.856461864406779, "grad_norm": 2.432802200317383, "learning_rate": 7.071901483050848e-06, "loss": 1.0222, "mean_token_accuracy": 0.7445899024605751, "num_tokens": 17792746.0, "step": 22114 }, { "epoch": 5.856991525423728, "grad_norm": 2.3863141536712646, "learning_rate": 7.071636652542373e-06, "loss": 1.4546, "mean_token_accuracy": 0.6761074438691139, "num_tokens": 17794802.0, "step": 22116 }, { "epoch": 5.857521186440678, "grad_norm": 2.619874954223633, "learning_rate": 7.071371822033899e-06, "loss": 1.4268, "mean_token_accuracy": 0.6996815800666809, "num_tokens": 17796329.0, "step": 22118 }, { "epoch": 5.858050847457627, "grad_norm": 2.1254093647003174, "learning_rate": 7.0711069915254234e-06, "loss": 1.0804, "mean_token_accuracy": 0.7255774438381195, "num_tokens": 17797965.0, "step": 22120 }, { "epoch": 5.858580508474576, "grad_norm": 2.326456069946289, "learning_rate": 7.07084216101695e-06, "loss": 1.4591, "mean_token_accuracy": 0.6735976189374924, "num_tokens": 17799787.0, "step": 22122 }, { "epoch": 5.859110169491525, "grad_norm": 2.1090126037597656, "learning_rate": 7.070577330508475e-06, "loss": 1.391, "mean_token_accuracy": 0.6936564892530441, "num_tokens": 17801476.0, "step": 22124 }, { "epoch": 5.859639830508475, "grad_norm": 2.2546896934509277, "learning_rate": 7.070312500000001e-06, "loss": 1.4085, "mean_token_accuracy": 0.6898952126502991, "num_tokens": 17803133.0, "step": 22126 }, { "epoch": 5.860169491525424, "grad_norm": 2.170325517654419, "learning_rate": 7.0700476694915265e-06, "loss": 1.141, "mean_token_accuracy": 0.7387468814849854, "num_tokens": 17804573.0, "step": 22128 }, { "epoch": 5.860699152542373, "grad_norm": 2.6584339141845703, "learning_rate": 7.0697828389830515e-06, "loss": 1.2301, "mean_token_accuracy": 0.733315534889698, "num_tokens": 17806009.0, "step": 22130 }, { "epoch": 5.861228813559322, "grad_norm": 2.5594356060028076, "learning_rate": 7.069518008474577e-06, "loss": 1.3524, "mean_token_accuracy": 0.6888580024242401, "num_tokens": 17807434.0, "step": 22132 }, { "epoch": 5.861758474576272, "grad_norm": 1.921492338180542, "learning_rate": 7.069253177966102e-06, "loss": 1.1877, "mean_token_accuracy": 0.7078636735677719, "num_tokens": 17808953.0, "step": 22134 }, { "epoch": 5.862288135593221, "grad_norm": 2.05332350730896, "learning_rate": 7.068988347457628e-06, "loss": 1.2151, "mean_token_accuracy": 0.698043018579483, "num_tokens": 17810835.0, "step": 22136 }, { "epoch": 5.8628177966101696, "grad_norm": 2.1770896911621094, "learning_rate": 7.068723516949153e-06, "loss": 1.1438, "mean_token_accuracy": 0.7327687069773674, "num_tokens": 17812361.0, "step": 22138 }, { "epoch": 5.8633474576271185, "grad_norm": 2.2013909816741943, "learning_rate": 7.068458686440679e-06, "loss": 1.4194, "mean_token_accuracy": 0.6619323119521141, "num_tokens": 17814072.0, "step": 22140 }, { "epoch": 5.8638771186440675, "grad_norm": 2.0384435653686523, "learning_rate": 7.068193855932204e-06, "loss": 1.2392, "mean_token_accuracy": 0.7084866017103195, "num_tokens": 17815827.0, "step": 22142 }, { "epoch": 5.864406779661017, "grad_norm": 2.393454074859619, "learning_rate": 7.06792902542373e-06, "loss": 1.5993, "mean_token_accuracy": 0.6382822394371033, "num_tokens": 17817451.0, "step": 22144 }, { "epoch": 5.864936440677966, "grad_norm": 2.3514606952667236, "learning_rate": 7.067664194915254e-06, "loss": 1.3902, "mean_token_accuracy": 0.6755320131778717, "num_tokens": 17819034.0, "step": 22146 }, { "epoch": 5.865466101694915, "grad_norm": 2.225330114364624, "learning_rate": 7.067399364406781e-06, "loss": 1.2852, "mean_token_accuracy": 0.7454491332173347, "num_tokens": 17820727.0, "step": 22148 }, { "epoch": 5.865995762711864, "grad_norm": 2.3303823471069336, "learning_rate": 7.067134533898306e-06, "loss": 1.0356, "mean_token_accuracy": 0.7570390105247498, "num_tokens": 17822118.0, "step": 22150 }, { "epoch": 5.866525423728813, "grad_norm": 2.40297532081604, "learning_rate": 7.066869703389832e-06, "loss": 1.4238, "mean_token_accuracy": 0.698474645614624, "num_tokens": 17823832.0, "step": 22152 }, { "epoch": 5.867055084745763, "grad_norm": 1.8626701831817627, "learning_rate": 7.0666048728813565e-06, "loss": 0.918, "mean_token_accuracy": 0.7683501243591309, "num_tokens": 17825570.0, "step": 22154 }, { "epoch": 5.867584745762712, "grad_norm": 1.7869960069656372, "learning_rate": 7.066340042372882e-06, "loss": 0.882, "mean_token_accuracy": 0.7694002091884613, "num_tokens": 17827298.0, "step": 22156 }, { "epoch": 5.868114406779661, "grad_norm": 2.149214744567871, "learning_rate": 7.066075211864407e-06, "loss": 1.1775, "mean_token_accuracy": 0.748342476785183, "num_tokens": 17828734.0, "step": 22158 }, { "epoch": 5.86864406779661, "grad_norm": 2.221125602722168, "learning_rate": 7.065810381355933e-06, "loss": 1.2129, "mean_token_accuracy": 0.6800490990281105, "num_tokens": 17830845.0, "step": 22160 }, { "epoch": 5.86917372881356, "grad_norm": 2.6944899559020996, "learning_rate": 7.065545550847458e-06, "loss": 1.2419, "mean_token_accuracy": 0.7201416939496994, "num_tokens": 17832429.0, "step": 22162 }, { "epoch": 5.869703389830509, "grad_norm": 2.2006497383117676, "learning_rate": 7.065280720338984e-06, "loss": 1.1986, "mean_token_accuracy": 0.7078993692994118, "num_tokens": 17833937.0, "step": 22164 }, { "epoch": 5.870233050847458, "grad_norm": 2.443459987640381, "learning_rate": 7.065015889830509e-06, "loss": 0.928, "mean_token_accuracy": 0.7924240380525589, "num_tokens": 17835580.0, "step": 22166 }, { "epoch": 5.870762711864407, "grad_norm": 2.0726919174194336, "learning_rate": 7.0647510593220344e-06, "loss": 1.5253, "mean_token_accuracy": 0.6651548743247986, "num_tokens": 17837277.0, "step": 22168 }, { "epoch": 5.871292372881356, "grad_norm": 2.262101888656616, "learning_rate": 7.064486228813559e-06, "loss": 1.2013, "mean_token_accuracy": 0.732339408248663, "num_tokens": 17838937.0, "step": 22170 }, { "epoch": 5.871822033898305, "grad_norm": 2.2019832134246826, "learning_rate": 7.064221398305086e-06, "loss": 0.9936, "mean_token_accuracy": 0.7827957794070244, "num_tokens": 17840269.0, "step": 22172 }, { "epoch": 5.872351694915254, "grad_norm": 1.9638887643814087, "learning_rate": 7.06395656779661e-06, "loss": 1.2391, "mean_token_accuracy": 0.7145448997616768, "num_tokens": 17842247.0, "step": 22174 }, { "epoch": 5.872881355932203, "grad_norm": 2.5302698612213135, "learning_rate": 7.063691737288137e-06, "loss": 1.7557, "mean_token_accuracy": 0.6200529076159, "num_tokens": 17844204.0, "step": 22176 }, { "epoch": 5.873411016949152, "grad_norm": 2.006826162338257, "learning_rate": 7.063426906779662e-06, "loss": 1.4828, "mean_token_accuracy": 0.6804976761341095, "num_tokens": 17845902.0, "step": 22178 }, { "epoch": 5.873940677966102, "grad_norm": 2.553225040435791, "learning_rate": 7.063162076271187e-06, "loss": 0.9416, "mean_token_accuracy": 0.7692337781190872, "num_tokens": 17847274.0, "step": 22180 }, { "epoch": 5.874470338983051, "grad_norm": 2.2525618076324463, "learning_rate": 7.062897245762712e-06, "loss": 1.0367, "mean_token_accuracy": 0.7570541948080063, "num_tokens": 17849000.0, "step": 22182 }, { "epoch": 5.875, "grad_norm": 2.093852996826172, "learning_rate": 7.062632415254238e-06, "loss": 1.2328, "mean_token_accuracy": 0.7385076954960823, "num_tokens": 17850782.0, "step": 22184 }, { "epoch": 5.875529661016949, "grad_norm": 2.1932497024536133, "learning_rate": 7.062367584745763e-06, "loss": 0.9908, "mean_token_accuracy": 0.7475766316056252, "num_tokens": 17852952.0, "step": 22186 }, { "epoch": 5.876059322033898, "grad_norm": 2.043710470199585, "learning_rate": 7.062102754237289e-06, "loss": 1.4924, "mean_token_accuracy": 0.6709941402077675, "num_tokens": 17854725.0, "step": 22188 }, { "epoch": 5.876588983050848, "grad_norm": 2.2542736530303955, "learning_rate": 7.061837923728814e-06, "loss": 1.3237, "mean_token_accuracy": 0.7106064707040787, "num_tokens": 17856268.0, "step": 22190 }, { "epoch": 5.877118644067797, "grad_norm": 1.532956838607788, "learning_rate": 7.0615730932203395e-06, "loss": 1.1457, "mean_token_accuracy": 0.7265589646995068, "num_tokens": 17857933.0, "step": 22192 }, { "epoch": 5.877648305084746, "grad_norm": 2.028508424758911, "learning_rate": 7.0613082627118644e-06, "loss": 0.8149, "mean_token_accuracy": 0.789013035595417, "num_tokens": 17859497.0, "step": 22194 }, { "epoch": 5.878177966101695, "grad_norm": 2.022573947906494, "learning_rate": 7.06104343220339e-06, "loss": 1.3171, "mean_token_accuracy": 0.7201027423143387, "num_tokens": 17861009.0, "step": 22196 }, { "epoch": 5.878707627118644, "grad_norm": 2.089946746826172, "learning_rate": 7.060778601694915e-06, "loss": 1.3967, "mean_token_accuracy": 0.6717336475849152, "num_tokens": 17862558.0, "step": 22198 }, { "epoch": 5.879237288135593, "grad_norm": 2.1461586952209473, "learning_rate": 7.060513771186441e-06, "loss": 1.0395, "mean_token_accuracy": 0.746045783162117, "num_tokens": 17864214.0, "step": 22200 }, { "epoch": 5.879766949152542, "grad_norm": 2.1648452281951904, "learning_rate": 7.060248940677966e-06, "loss": 0.9635, "mean_token_accuracy": 0.76339041441679, "num_tokens": 17865788.0, "step": 22202 }, { "epoch": 5.880296610169491, "grad_norm": 2.181222438812256, "learning_rate": 7.0599841101694925e-06, "loss": 1.5531, "mean_token_accuracy": 0.6461907029151917, "num_tokens": 17867510.0, "step": 22204 }, { "epoch": 5.88082627118644, "grad_norm": 1.9240418672561646, "learning_rate": 7.059719279661017e-06, "loss": 1.3673, "mean_token_accuracy": 0.6834847927093506, "num_tokens": 17869189.0, "step": 22206 }, { "epoch": 5.88135593220339, "grad_norm": 2.622159719467163, "learning_rate": 7.059454449152543e-06, "loss": 1.1542, "mean_token_accuracy": 0.7174499034881592, "num_tokens": 17870704.0, "step": 22208 }, { "epoch": 5.881885593220339, "grad_norm": 2.903110980987549, "learning_rate": 7.059189618644068e-06, "loss": 1.0039, "mean_token_accuracy": 0.7525418475270271, "num_tokens": 17872047.0, "step": 22210 }, { "epoch": 5.882415254237288, "grad_norm": 2.2716808319091797, "learning_rate": 7.058924788135594e-06, "loss": 1.1347, "mean_token_accuracy": 0.7366316020488739, "num_tokens": 17873575.0, "step": 22212 }, { "epoch": 5.882944915254237, "grad_norm": 2.361623764038086, "learning_rate": 7.05865995762712e-06, "loss": 1.4282, "mean_token_accuracy": 0.6466397419571877, "num_tokens": 17875225.0, "step": 22214 }, { "epoch": 5.883474576271187, "grad_norm": 2.499182939529419, "learning_rate": 7.058395127118645e-06, "loss": 1.5338, "mean_token_accuracy": 0.6662345044314861, "num_tokens": 17876857.0, "step": 22216 }, { "epoch": 5.884004237288136, "grad_norm": 2.1036036014556885, "learning_rate": 7.05813029661017e-06, "loss": 0.9847, "mean_token_accuracy": 0.7544131875038147, "num_tokens": 17878576.0, "step": 22218 }, { "epoch": 5.884533898305085, "grad_norm": 1.8638741970062256, "learning_rate": 7.057865466101695e-06, "loss": 0.7627, "mean_token_accuracy": 0.8003125861287117, "num_tokens": 17880155.0, "step": 22220 }, { "epoch": 5.885063559322034, "grad_norm": 2.394850969314575, "learning_rate": 7.057600635593221e-06, "loss": 1.4651, "mean_token_accuracy": 0.6636266484856606, "num_tokens": 17881527.0, "step": 22222 }, { "epoch": 5.885593220338983, "grad_norm": 2.062692642211914, "learning_rate": 7.057335805084746e-06, "loss": 1.2619, "mean_token_accuracy": 0.7230672985315323, "num_tokens": 17882978.0, "step": 22224 }, { "epoch": 5.8861228813559325, "grad_norm": 2.24210786819458, "learning_rate": 7.057070974576273e-06, "loss": 0.9384, "mean_token_accuracy": 0.7727383449673653, "num_tokens": 17884492.0, "step": 22226 }, { "epoch": 5.8866525423728815, "grad_norm": 1.8553893566131592, "learning_rate": 7.056806144067797e-06, "loss": 0.9502, "mean_token_accuracy": 0.7582394778728485, "num_tokens": 17886264.0, "step": 22228 }, { "epoch": 5.8871822033898304, "grad_norm": 2.691842555999756, "learning_rate": 7.056541313559323e-06, "loss": 1.663, "mean_token_accuracy": 0.6660982891917229, "num_tokens": 17887550.0, "step": 22230 }, { "epoch": 5.887711864406779, "grad_norm": 2.191615343093872, "learning_rate": 7.056276483050848e-06, "loss": 1.5182, "mean_token_accuracy": 0.7042286619544029, "num_tokens": 17889250.0, "step": 22232 }, { "epoch": 5.888241525423728, "grad_norm": 2.1923904418945312, "learning_rate": 7.056011652542374e-06, "loss": 1.081, "mean_token_accuracy": 0.7424071654677391, "num_tokens": 17890723.0, "step": 22234 }, { "epoch": 5.888771186440678, "grad_norm": 2.773240089416504, "learning_rate": 7.055746822033899e-06, "loss": 1.1, "mean_token_accuracy": 0.7140084952116013, "num_tokens": 17892061.0, "step": 22236 }, { "epoch": 5.889300847457627, "grad_norm": 1.8811900615692139, "learning_rate": 7.055481991525425e-06, "loss": 1.1726, "mean_token_accuracy": 0.7355335205793381, "num_tokens": 17893522.0, "step": 22238 }, { "epoch": 5.889830508474576, "grad_norm": 2.042180061340332, "learning_rate": 7.05521716101695e-06, "loss": 1.1562, "mean_token_accuracy": 0.7429531142115593, "num_tokens": 17895181.0, "step": 22240 }, { "epoch": 5.890360169491525, "grad_norm": 1.9252359867095947, "learning_rate": 7.0549523305084754e-06, "loss": 1.3513, "mean_token_accuracy": 0.697827622294426, "num_tokens": 17896838.0, "step": 22242 }, { "epoch": 5.890889830508475, "grad_norm": 1.9914159774780273, "learning_rate": 7.0546875e-06, "loss": 1.3335, "mean_token_accuracy": 0.6610076949000359, "num_tokens": 17898755.0, "step": 22244 }, { "epoch": 5.891419491525424, "grad_norm": 2.1792056560516357, "learning_rate": 7.054422669491526e-06, "loss": 1.1589, "mean_token_accuracy": 0.7332406044006348, "num_tokens": 17900368.0, "step": 22246 }, { "epoch": 5.891949152542373, "grad_norm": 1.8232163190841675, "learning_rate": 7.054157838983051e-06, "loss": 0.8819, "mean_token_accuracy": 0.7902576327323914, "num_tokens": 17901741.0, "step": 22248 }, { "epoch": 5.892478813559322, "grad_norm": 1.9382163286209106, "learning_rate": 7.053893008474577e-06, "loss": 1.2145, "step": 22250 }, { "epoch": 5.892478813559322, "eval_loss": 1.3170392513275146, "eval_mean_token_accuracy": 0.700708855572459, "eval_num_tokens": 17903455.0, "eval_runtime": 48.2655, "eval_samples_per_second": 6.381, "eval_steps_per_second": 6.381, "step": 22250 }, { "epoch": 5.893008474576272, "grad_norm": 2.25653338432312, "learning_rate": 7.053628177966102e-06, "loss": 1.4141, "mean_token_accuracy": 0.6936457306146622, "num_tokens": 17904980.0, "step": 22252 }, { "epoch": 5.893538135593221, "grad_norm": 2.0858583450317383, "learning_rate": 7.0533633474576276e-06, "loss": 1.1863, "mean_token_accuracy": 0.714855968952179, "num_tokens": 17906429.0, "step": 22254 }, { "epoch": 5.8940677966101696, "grad_norm": 2.6866512298583984, "learning_rate": 7.0530985169491525e-06, "loss": 1.3052, "mean_token_accuracy": 0.6999612078070641, "num_tokens": 17908033.0, "step": 22256 }, { "epoch": 5.8945974576271185, "grad_norm": 2.5326695442199707, "learning_rate": 7.052833686440679e-06, "loss": 1.3072, "mean_token_accuracy": 0.6871835738420486, "num_tokens": 17909386.0, "step": 22258 }, { "epoch": 5.8951271186440675, "grad_norm": 2.1584088802337646, "learning_rate": 7.052568855932204e-06, "loss": 1.3468, "mean_token_accuracy": 0.6826872825622559, "num_tokens": 17911039.0, "step": 22260 }, { "epoch": 5.895656779661017, "grad_norm": 2.6873302459716797, "learning_rate": 7.05230402542373e-06, "loss": 1.2844, "mean_token_accuracy": 0.718020111322403, "num_tokens": 17912412.0, "step": 22262 }, { "epoch": 5.896186440677966, "grad_norm": 2.3645646572113037, "learning_rate": 7.052039194915255e-06, "loss": 1.3602, "mean_token_accuracy": 0.6864510774612427, "num_tokens": 17914066.0, "step": 22264 }, { "epoch": 5.896716101694915, "grad_norm": 2.2850501537323, "learning_rate": 7.0517743644067805e-06, "loss": 1.3405, "mean_token_accuracy": 0.7068330571055412, "num_tokens": 17915769.0, "step": 22266 }, { "epoch": 5.897245762711864, "grad_norm": 1.9112416505813599, "learning_rate": 7.0515095338983055e-06, "loss": 1.1135, "mean_token_accuracy": 0.7428391724824905, "num_tokens": 17917084.0, "step": 22268 }, { "epoch": 5.897775423728813, "grad_norm": 2.227266550064087, "learning_rate": 7.051244703389831e-06, "loss": 1.1156, "mean_token_accuracy": 0.7361812591552734, "num_tokens": 17918844.0, "step": 22270 }, { "epoch": 5.898305084745763, "grad_norm": 2.5060205459594727, "learning_rate": 7.050979872881356e-06, "loss": 1.3782, "mean_token_accuracy": 0.6654091402888298, "num_tokens": 17920593.0, "step": 22272 }, { "epoch": 5.898834745762712, "grad_norm": 2.08170223236084, "learning_rate": 7.050715042372882e-06, "loss": 1.2138, "mean_token_accuracy": 0.7260959222912788, "num_tokens": 17922657.0, "step": 22274 }, { "epoch": 5.899364406779661, "grad_norm": 2.141826629638672, "learning_rate": 7.050450211864407e-06, "loss": 1.061, "mean_token_accuracy": 0.7398600205779076, "num_tokens": 17924296.0, "step": 22276 }, { "epoch": 5.89989406779661, "grad_norm": 1.965759038925171, "learning_rate": 7.050185381355933e-06, "loss": 1.2088, "mean_token_accuracy": 0.7098044604063034, "num_tokens": 17925990.0, "step": 22278 }, { "epoch": 5.90042372881356, "grad_norm": 1.9947545528411865, "learning_rate": 7.0499205508474576e-06, "loss": 0.6804, "mean_token_accuracy": 0.8125514760613441, "num_tokens": 17927577.0, "step": 22280 }, { "epoch": 5.900953389830509, "grad_norm": 2.4999213218688965, "learning_rate": 7.049655720338983e-06, "loss": 1.7622, "mean_token_accuracy": 0.630631186068058, "num_tokens": 17929105.0, "step": 22282 }, { "epoch": 5.901483050847458, "grad_norm": 2.2975072860717773, "learning_rate": 7.049390889830508e-06, "loss": 1.0092, "mean_token_accuracy": 0.7517347857356071, "num_tokens": 17930496.0, "step": 22284 }, { "epoch": 5.902012711864407, "grad_norm": 1.9567058086395264, "learning_rate": 7.049126059322035e-06, "loss": 1.5117, "mean_token_accuracy": 0.6993195861577988, "num_tokens": 17932117.0, "step": 22286 }, { "epoch": 5.902542372881356, "grad_norm": 2.917729377746582, "learning_rate": 7.048861228813559e-06, "loss": 1.3816, "mean_token_accuracy": 0.6883241385221481, "num_tokens": 17933533.0, "step": 22288 }, { "epoch": 5.903072033898305, "grad_norm": 2.0153069496154785, "learning_rate": 7.048596398305086e-06, "loss": 1.0357, "mean_token_accuracy": 0.7300884798169136, "num_tokens": 17935340.0, "step": 22290 }, { "epoch": 5.903601694915254, "grad_norm": 1.9306488037109375, "learning_rate": 7.0483315677966105e-06, "loss": 0.9219, "mean_token_accuracy": 0.776928573846817, "num_tokens": 17936851.0, "step": 22292 }, { "epoch": 5.904131355932203, "grad_norm": 2.2376441955566406, "learning_rate": 7.048066737288136e-06, "loss": 1.132, "mean_token_accuracy": 0.7572451308369637, "num_tokens": 17938369.0, "step": 22294 }, { "epoch": 5.904661016949152, "grad_norm": 2.7085864543914795, "learning_rate": 7.047801906779662e-06, "loss": 1.5507, "mean_token_accuracy": 0.6663794592022896, "num_tokens": 17939832.0, "step": 22296 }, { "epoch": 5.905190677966102, "grad_norm": 2.232271194458008, "learning_rate": 7.047537076271187e-06, "loss": 1.3683, "mean_token_accuracy": 0.6911517009139061, "num_tokens": 17941623.0, "step": 22298 }, { "epoch": 5.905720338983051, "grad_norm": 2.402452230453491, "learning_rate": 7.047272245762713e-06, "loss": 1.0716, "mean_token_accuracy": 0.7266802042722702, "num_tokens": 17943108.0, "step": 22300 }, { "epoch": 5.90625, "grad_norm": 2.1807897090911865, "learning_rate": 7.047007415254238e-06, "loss": 1.5986, "mean_token_accuracy": 0.6788247935473919, "num_tokens": 17944755.0, "step": 22302 }, { "epoch": 5.906779661016949, "grad_norm": 2.1463608741760254, "learning_rate": 7.0467425847457635e-06, "loss": 1.1581, "mean_token_accuracy": 0.7435509636998177, "num_tokens": 17946316.0, "step": 22304 }, { "epoch": 5.907309322033898, "grad_norm": 2.085817813873291, "learning_rate": 7.0464777542372884e-06, "loss": 1.1682, "mean_token_accuracy": 0.7127711921930313, "num_tokens": 17948192.0, "step": 22306 }, { "epoch": 5.907838983050848, "grad_norm": 2.3092904090881348, "learning_rate": 7.046212923728814e-06, "loss": 1.4679, "mean_token_accuracy": 0.6844931617379189, "num_tokens": 17949945.0, "step": 22308 }, { "epoch": 5.908368644067797, "grad_norm": 2.4524929523468018, "learning_rate": 7.045948093220339e-06, "loss": 1.4027, "mean_token_accuracy": 0.6580898314714432, "num_tokens": 17951424.0, "step": 22310 }, { "epoch": 5.908898305084746, "grad_norm": 2.49117374420166, "learning_rate": 7.045683262711866e-06, "loss": 1.0362, "mean_token_accuracy": 0.7520125508308411, "num_tokens": 17952861.0, "step": 22312 }, { "epoch": 5.909427966101695, "grad_norm": 1.9782177209854126, "learning_rate": 7.045418432203391e-06, "loss": 1.2264, "mean_token_accuracy": 0.6963965371251106, "num_tokens": 17954583.0, "step": 22314 }, { "epoch": 5.909957627118644, "grad_norm": 2.3992326259613037, "learning_rate": 7.0451536016949165e-06, "loss": 1.3062, "mean_token_accuracy": 0.6839871518313885, "num_tokens": 17956222.0, "step": 22316 }, { "epoch": 5.910487288135593, "grad_norm": 2.292829751968384, "learning_rate": 7.044888771186441e-06, "loss": 1.4611, "mean_token_accuracy": 0.6576110050082207, "num_tokens": 17957730.0, "step": 22318 }, { "epoch": 5.911016949152542, "grad_norm": 2.1988065242767334, "learning_rate": 7.044623940677967e-06, "loss": 1.4954, "mean_token_accuracy": 0.676985390484333, "num_tokens": 17959268.0, "step": 22320 }, { "epoch": 5.911546610169491, "grad_norm": 2.106851577758789, "learning_rate": 7.044359110169492e-06, "loss": 1.279, "mean_token_accuracy": 0.7064932771027088, "num_tokens": 17961262.0, "step": 22322 }, { "epoch": 5.91207627118644, "grad_norm": 1.6921212673187256, "learning_rate": 7.044094279661018e-06, "loss": 1.2279, "mean_token_accuracy": 0.6935748979449272, "num_tokens": 17963079.0, "step": 22324 }, { "epoch": 5.91260593220339, "grad_norm": 2.575059652328491, "learning_rate": 7.043829449152543e-06, "loss": 1.2926, "mean_token_accuracy": 0.7076314985752106, "num_tokens": 17964480.0, "step": 22326 }, { "epoch": 5.913135593220339, "grad_norm": 3.0961711406707764, "learning_rate": 7.043564618644069e-06, "loss": 1.1745, "mean_token_accuracy": 0.7155196070671082, "num_tokens": 17965708.0, "step": 22328 }, { "epoch": 5.913665254237288, "grad_norm": 2.334202289581299, "learning_rate": 7.0432997881355935e-06, "loss": 1.1351, "mean_token_accuracy": 0.732217438519001, "num_tokens": 17967430.0, "step": 22330 }, { "epoch": 5.914194915254237, "grad_norm": 2.073434829711914, "learning_rate": 7.043034957627119e-06, "loss": 1.2287, "mean_token_accuracy": 0.7217330783605576, "num_tokens": 17969112.0, "step": 22332 }, { "epoch": 5.914724576271187, "grad_norm": 1.3512290716171265, "learning_rate": 7.042770127118644e-06, "loss": 0.9875, "mean_token_accuracy": 0.747345507144928, "num_tokens": 17971156.0, "step": 22334 }, { "epoch": 5.915254237288136, "grad_norm": 1.979762077331543, "learning_rate": 7.04250529661017e-06, "loss": 1.0696, "mean_token_accuracy": 0.7089185416698456, "num_tokens": 17973000.0, "step": 22336 }, { "epoch": 5.915783898305085, "grad_norm": 2.140040874481201, "learning_rate": 7.042240466101695e-06, "loss": 1.6142, "mean_token_accuracy": 0.6192158088088036, "num_tokens": 17974780.0, "step": 22338 }, { "epoch": 5.916313559322034, "grad_norm": 2.371976137161255, "learning_rate": 7.0419756355932215e-06, "loss": 1.2701, "mean_token_accuracy": 0.7216259688138962, "num_tokens": 17976506.0, "step": 22340 }, { "epoch": 5.916843220338983, "grad_norm": 2.097484827041626, "learning_rate": 7.041710805084746e-06, "loss": 0.8638, "mean_token_accuracy": 0.7909599021077156, "num_tokens": 17977950.0, "step": 22342 }, { "epoch": 5.9173728813559325, "grad_norm": 2.8906569480895996, "learning_rate": 7.041445974576272e-06, "loss": 1.5045, "mean_token_accuracy": 0.6793161854147911, "num_tokens": 17979650.0, "step": 22344 }, { "epoch": 5.9179025423728815, "grad_norm": 1.641069769859314, "learning_rate": 7.041181144067797e-06, "loss": 0.9107, "mean_token_accuracy": 0.7518904432654381, "num_tokens": 17981566.0, "step": 22346 }, { "epoch": 5.9184322033898304, "grad_norm": 2.292468547821045, "learning_rate": 7.040916313559323e-06, "loss": 1.2283, "mean_token_accuracy": 0.7123228088021278, "num_tokens": 17982967.0, "step": 22348 }, { "epoch": 5.918961864406779, "grad_norm": 2.1603734493255615, "learning_rate": 7.040651483050848e-06, "loss": 0.948, "mean_token_accuracy": 0.7755014896392822, "num_tokens": 17984418.0, "step": 22350 }, { "epoch": 5.919491525423728, "grad_norm": 2.242072820663452, "learning_rate": 7.040386652542374e-06, "loss": 1.4821, "mean_token_accuracy": 0.6569823771715164, "num_tokens": 17986056.0, "step": 22352 }, { "epoch": 5.920021186440678, "grad_norm": 2.0426504611968994, "learning_rate": 7.040121822033899e-06, "loss": 0.8901, "mean_token_accuracy": 0.7639856040477753, "num_tokens": 17987949.0, "step": 22354 }, { "epoch": 5.920550847457627, "grad_norm": 2.2951815128326416, "learning_rate": 7.039856991525424e-06, "loss": 1.2975, "mean_token_accuracy": 0.6930654048919678, "num_tokens": 17989182.0, "step": 22356 }, { "epoch": 5.921080508474576, "grad_norm": 2.489290237426758, "learning_rate": 7.039592161016949e-06, "loss": 0.8735, "mean_token_accuracy": 0.7689389660954475, "num_tokens": 17990715.0, "step": 22358 }, { "epoch": 5.921610169491525, "grad_norm": 2.1953744888305664, "learning_rate": 7.039327330508475e-06, "loss": 1.464, "mean_token_accuracy": 0.6713967472314835, "num_tokens": 17992327.0, "step": 22360 }, { "epoch": 5.922139830508475, "grad_norm": 2.3432042598724365, "learning_rate": 7.0390625e-06, "loss": 0.8605, "mean_token_accuracy": 0.8195626437664032, "num_tokens": 17993856.0, "step": 22362 }, { "epoch": 5.922669491525424, "grad_norm": 2.2929534912109375, "learning_rate": 7.038797669491526e-06, "loss": 1.4499, "mean_token_accuracy": 0.6732607781887054, "num_tokens": 17995401.0, "step": 22364 }, { "epoch": 5.923199152542373, "grad_norm": 2.3628807067871094, "learning_rate": 7.038532838983051e-06, "loss": 1.5631, "mean_token_accuracy": 0.6377434358000755, "num_tokens": 17996974.0, "step": 22366 }, { "epoch": 5.923728813559322, "grad_norm": 2.511493444442749, "learning_rate": 7.038268008474577e-06, "loss": 1.2196, "mean_token_accuracy": 0.7094070389866829, "num_tokens": 17998619.0, "step": 22368 }, { "epoch": 5.924258474576272, "grad_norm": 2.3808963298797607, "learning_rate": 7.038003177966101e-06, "loss": 1.5433, "mean_token_accuracy": 0.6524731516838074, "num_tokens": 18000290.0, "step": 22370 }, { "epoch": 5.924788135593221, "grad_norm": 2.321580410003662, "learning_rate": 7.037738347457628e-06, "loss": 1.032, "mean_token_accuracy": 0.7602934688329697, "num_tokens": 18001822.0, "step": 22372 }, { "epoch": 5.9253177966101696, "grad_norm": 2.8417816162109375, "learning_rate": 7.037473516949153e-06, "loss": 1.5108, "mean_token_accuracy": 0.6670041978359222, "num_tokens": 18003242.0, "step": 22374 }, { "epoch": 5.9258474576271185, "grad_norm": 2.327719211578369, "learning_rate": 7.037208686440679e-06, "loss": 1.234, "mean_token_accuracy": 0.7191957831382751, "num_tokens": 18005102.0, "step": 22376 }, { "epoch": 5.9263771186440675, "grad_norm": 2.139273166656494, "learning_rate": 7.036943855932204e-06, "loss": 1.3758, "mean_token_accuracy": 0.6923977881669998, "num_tokens": 18006831.0, "step": 22378 }, { "epoch": 5.926906779661017, "grad_norm": 2.6875383853912354, "learning_rate": 7.0366790254237294e-06, "loss": 1.2721, "mean_token_accuracy": 0.7034520506858826, "num_tokens": 18008360.0, "step": 22380 }, { "epoch": 5.927436440677966, "grad_norm": 1.8716381788253784, "learning_rate": 7.036414194915255e-06, "loss": 0.9777, "mean_token_accuracy": 0.7434090599417686, "num_tokens": 18010015.0, "step": 22382 }, { "epoch": 5.927966101694915, "grad_norm": 2.1432907581329346, "learning_rate": 7.03614936440678e-06, "loss": 0.817, "mean_token_accuracy": 0.7772337570786476, "num_tokens": 18011479.0, "step": 22384 }, { "epoch": 5.928495762711864, "grad_norm": 2.332258462905884, "learning_rate": 7.035884533898306e-06, "loss": 1.3371, "mean_token_accuracy": 0.6981300562620163, "num_tokens": 18012897.0, "step": 22386 }, { "epoch": 5.929025423728813, "grad_norm": 2.2118659019470215, "learning_rate": 7.035619703389831e-06, "loss": 1.1691, "mean_token_accuracy": 0.7248129472136497, "num_tokens": 18014739.0, "step": 22388 }, { "epoch": 5.929555084745763, "grad_norm": 2.167422294616699, "learning_rate": 7.035354872881357e-06, "loss": 1.3135, "mean_token_accuracy": 0.6949672773480415, "num_tokens": 18016170.0, "step": 22390 }, { "epoch": 5.930084745762712, "grad_norm": 2.0116891860961914, "learning_rate": 7.0350900423728816e-06, "loss": 0.9039, "mean_token_accuracy": 0.78514663875103, "num_tokens": 18017427.0, "step": 22392 }, { "epoch": 5.930614406779661, "grad_norm": 3.0799524784088135, "learning_rate": 7.034825211864408e-06, "loss": 1.2304, "mean_token_accuracy": 0.7249722555279732, "num_tokens": 18018718.0, "step": 22394 }, { "epoch": 5.93114406779661, "grad_norm": 2.1831765174865723, "learning_rate": 7.034560381355932e-06, "loss": 1.4428, "mean_token_accuracy": 0.654421329498291, "num_tokens": 18020512.0, "step": 22396 }, { "epoch": 5.93167372881356, "grad_norm": 2.05410099029541, "learning_rate": 7.034295550847459e-06, "loss": 1.3523, "mean_token_accuracy": 0.6947287693619728, "num_tokens": 18021982.0, "step": 22398 }, { "epoch": 5.932203389830509, "grad_norm": 2.504507303237915, "learning_rate": 7.034030720338984e-06, "loss": 1.3887, "mean_token_accuracy": 0.6700993105769157, "num_tokens": 18023755.0, "step": 22400 }, { "epoch": 5.932733050847458, "grad_norm": 2.043891668319702, "learning_rate": 7.03376588983051e-06, "loss": 1.5564, "mean_token_accuracy": 0.6728711798787117, "num_tokens": 18025325.0, "step": 22402 }, { "epoch": 5.933262711864407, "grad_norm": 2.2355854511260986, "learning_rate": 7.0335010593220345e-06, "loss": 1.1925, "mean_token_accuracy": 0.7235406711697578, "num_tokens": 18026823.0, "step": 22404 }, { "epoch": 5.933792372881356, "grad_norm": 2.09663724899292, "learning_rate": 7.03323622881356e-06, "loss": 1.1647, "mean_token_accuracy": 0.7443815469741821, "num_tokens": 18028366.0, "step": 22406 }, { "epoch": 5.934322033898305, "grad_norm": 2.1589157581329346, "learning_rate": 7.032971398305085e-06, "loss": 1.2419, "mean_token_accuracy": 0.7047494947910309, "num_tokens": 18030042.0, "step": 22408 }, { "epoch": 5.934851694915254, "grad_norm": 2.201357841491699, "learning_rate": 7.032706567796611e-06, "loss": 1.2275, "mean_token_accuracy": 0.7397638261318207, "num_tokens": 18031670.0, "step": 22410 }, { "epoch": 5.935381355932203, "grad_norm": 2.4574835300445557, "learning_rate": 7.032441737288136e-06, "loss": 1.3017, "mean_token_accuracy": 0.6881203427910805, "num_tokens": 18033347.0, "step": 22412 }, { "epoch": 5.935911016949152, "grad_norm": 2.8872222900390625, "learning_rate": 7.032176906779662e-06, "loss": 1.4803, "mean_token_accuracy": 0.6647529676556587, "num_tokens": 18034936.0, "step": 22414 }, { "epoch": 5.936440677966102, "grad_norm": 2.3295512199401855, "learning_rate": 7.031912076271187e-06, "loss": 1.6826, "mean_token_accuracy": 0.6440714448690414, "num_tokens": 18036640.0, "step": 22416 }, { "epoch": 5.936970338983051, "grad_norm": 2.0051333904266357, "learning_rate": 7.031647245762712e-06, "loss": 1.1527, "mean_token_accuracy": 0.741279736161232, "num_tokens": 18038228.0, "step": 22418 }, { "epoch": 5.9375, "grad_norm": 2.371659278869629, "learning_rate": 7.031382415254237e-06, "loss": 1.2367, "mean_token_accuracy": 0.7336441874504089, "num_tokens": 18039780.0, "step": 22420 }, { "epoch": 5.938029661016949, "grad_norm": 2.3818094730377197, "learning_rate": 7.031117584745764e-06, "loss": 1.416, "mean_token_accuracy": 0.6896956413984299, "num_tokens": 18041376.0, "step": 22422 }, { "epoch": 5.938559322033898, "grad_norm": 2.3727593421936035, "learning_rate": 7.030852754237288e-06, "loss": 1.5887, "mean_token_accuracy": 0.6378061771392822, "num_tokens": 18043025.0, "step": 22424 }, { "epoch": 5.939088983050848, "grad_norm": 2.280409336090088, "learning_rate": 7.030587923728815e-06, "loss": 1.3277, "mean_token_accuracy": 0.7182638347148895, "num_tokens": 18044544.0, "step": 22426 }, { "epoch": 5.939618644067797, "grad_norm": 1.9042690992355347, "learning_rate": 7.03032309322034e-06, "loss": 1.0654, "mean_token_accuracy": 0.7492337971925735, "num_tokens": 18046070.0, "step": 22428 }, { "epoch": 5.940148305084746, "grad_norm": 2.49453067779541, "learning_rate": 7.030058262711865e-06, "loss": 1.2082, "mean_token_accuracy": 0.7027588486671448, "num_tokens": 18047585.0, "step": 22430 }, { "epoch": 5.940677966101695, "grad_norm": 2.9948337078094482, "learning_rate": 7.02979343220339e-06, "loss": 1.0243, "mean_token_accuracy": 0.7674041986465454, "num_tokens": 18048816.0, "step": 22432 }, { "epoch": 5.941207627118644, "grad_norm": 2.5124661922454834, "learning_rate": 7.029528601694916e-06, "loss": 1.7368, "mean_token_accuracy": 0.651233609765768, "num_tokens": 18050575.0, "step": 22434 }, { "epoch": 5.941737288135593, "grad_norm": 2.227140188217163, "learning_rate": 7.029263771186441e-06, "loss": 0.9176, "mean_token_accuracy": 0.7886157780885696, "num_tokens": 18052079.0, "step": 22436 }, { "epoch": 5.942266949152542, "grad_norm": 2.9257633686065674, "learning_rate": 7.028998940677967e-06, "loss": 1.4101, "mean_token_accuracy": 0.672801248729229, "num_tokens": 18053549.0, "step": 22438 }, { "epoch": 5.942796610169491, "grad_norm": 2.246835470199585, "learning_rate": 7.028734110169492e-06, "loss": 1.3842, "mean_token_accuracy": 0.6911598145961761, "num_tokens": 18055118.0, "step": 22440 }, { "epoch": 5.94332627118644, "grad_norm": 1.9156606197357178, "learning_rate": 7.0284692796610175e-06, "loss": 0.9039, "mean_token_accuracy": 0.8033881708979607, "num_tokens": 18056857.0, "step": 22442 }, { "epoch": 5.94385593220339, "grad_norm": 2.5554964542388916, "learning_rate": 7.028204449152542e-06, "loss": 1.409, "mean_token_accuracy": 0.6913561299443245, "num_tokens": 18058314.0, "step": 22444 }, { "epoch": 5.944385593220339, "grad_norm": 1.9248815774917603, "learning_rate": 7.027939618644068e-06, "loss": 0.9799, "mean_token_accuracy": 0.7706512436270714, "num_tokens": 18059854.0, "step": 22446 }, { "epoch": 5.944915254237288, "grad_norm": 2.626389265060425, "learning_rate": 7.027674788135593e-06, "loss": 1.349, "mean_token_accuracy": 0.689907431602478, "num_tokens": 18061362.0, "step": 22448 }, { "epoch": 5.945444915254237, "grad_norm": 2.5409913063049316, "learning_rate": 7.027409957627119e-06, "loss": 1.234, "mean_token_accuracy": 0.7125760316848755, "num_tokens": 18062841.0, "step": 22450 }, { "epoch": 5.945974576271187, "grad_norm": 1.8576598167419434, "learning_rate": 7.027145127118644e-06, "loss": 1.2148, "mean_token_accuracy": 0.7178707309067249, "num_tokens": 18064728.0, "step": 22452 }, { "epoch": 5.946504237288136, "grad_norm": 2.087453603744507, "learning_rate": 7.0268802966101705e-06, "loss": 1.4813, "mean_token_accuracy": 0.6867922395467758, "num_tokens": 18066415.0, "step": 22454 }, { "epoch": 5.947033898305085, "grad_norm": 2.558626174926758, "learning_rate": 7.026615466101695e-06, "loss": 1.5265, "mean_token_accuracy": 0.6574218422174454, "num_tokens": 18067978.0, "step": 22456 }, { "epoch": 5.947563559322034, "grad_norm": 2.2516634464263916, "learning_rate": 7.026350635593221e-06, "loss": 1.4145, "mean_token_accuracy": 0.6717278808355331, "num_tokens": 18069635.0, "step": 22458 }, { "epoch": 5.948093220338983, "grad_norm": 2.3780899047851562, "learning_rate": 7.026085805084746e-06, "loss": 1.5482, "mean_token_accuracy": 0.6858250871300697, "num_tokens": 18071174.0, "step": 22460 }, { "epoch": 5.9486228813559325, "grad_norm": 2.0441367626190186, "learning_rate": 7.025820974576272e-06, "loss": 0.6639, "mean_token_accuracy": 0.8220932260155678, "num_tokens": 18072629.0, "step": 22462 }, { "epoch": 5.9491525423728815, "grad_norm": 1.9932866096496582, "learning_rate": 7.025556144067798e-06, "loss": 1.205, "mean_token_accuracy": 0.7214059308171272, "num_tokens": 18074367.0, "step": 22464 }, { "epoch": 5.9496822033898304, "grad_norm": 1.8695217370986938, "learning_rate": 7.0252913135593226e-06, "loss": 0.9356, "mean_token_accuracy": 0.7837163805961609, "num_tokens": 18075909.0, "step": 22466 }, { "epoch": 5.950211864406779, "grad_norm": 2.0778186321258545, "learning_rate": 7.025026483050848e-06, "loss": 1.0035, "mean_token_accuracy": 0.7702220007777214, "num_tokens": 18077388.0, "step": 22468 }, { "epoch": 5.950741525423728, "grad_norm": 1.8748453855514526, "learning_rate": 7.024761652542373e-06, "loss": 1.2272, "mean_token_accuracy": 0.7185208573937416, "num_tokens": 18079133.0, "step": 22470 }, { "epoch": 5.951271186440678, "grad_norm": 1.7961580753326416, "learning_rate": 7.024496822033899e-06, "loss": 0.9949, "mean_token_accuracy": 0.757217213511467, "num_tokens": 18080574.0, "step": 22472 }, { "epoch": 5.951800847457627, "grad_norm": 2.115353584289551, "learning_rate": 7.024231991525424e-06, "loss": 1.1486, "mean_token_accuracy": 0.7393705248832703, "num_tokens": 18082198.0, "step": 22474 }, { "epoch": 5.952330508474576, "grad_norm": 1.7183001041412354, "learning_rate": 7.023967161016951e-06, "loss": 0.7246, "mean_token_accuracy": 0.811180517077446, "num_tokens": 18083962.0, "step": 22476 }, { "epoch": 5.952860169491525, "grad_norm": 2.4384822845458984, "learning_rate": 7.023702330508475e-06, "loss": 1.2076, "mean_token_accuracy": 0.7154529839754105, "num_tokens": 18085511.0, "step": 22478 }, { "epoch": 5.953389830508475, "grad_norm": 2.1398820877075195, "learning_rate": 7.023437500000001e-06, "loss": 1.3123, "mean_token_accuracy": 0.7036463841795921, "num_tokens": 18087201.0, "step": 22480 }, { "epoch": 5.953919491525424, "grad_norm": 2.3875515460968018, "learning_rate": 7.023172669491526e-06, "loss": 1.2779, "mean_token_accuracy": 0.6807067915797234, "num_tokens": 18089706.0, "step": 22482 }, { "epoch": 5.954449152542373, "grad_norm": 1.9855904579162598, "learning_rate": 7.022907838983052e-06, "loss": 1.2957, "mean_token_accuracy": 0.7010327130556107, "num_tokens": 18091093.0, "step": 22484 }, { "epoch": 5.954978813559322, "grad_norm": 2.075253486633301, "learning_rate": 7.022643008474577e-06, "loss": 1.1276, "mean_token_accuracy": 0.7309043705463409, "num_tokens": 18092711.0, "step": 22486 }, { "epoch": 5.955508474576272, "grad_norm": 2.0666658878326416, "learning_rate": 7.022378177966103e-06, "loss": 1.0522, "mean_token_accuracy": 0.7235192358493805, "num_tokens": 18094423.0, "step": 22488 }, { "epoch": 5.956038135593221, "grad_norm": 2.1020584106445312, "learning_rate": 7.022113347457628e-06, "loss": 1.1017, "mean_token_accuracy": 0.7440807893872261, "num_tokens": 18095779.0, "step": 22490 }, { "epoch": 5.9565677966101696, "grad_norm": 2.652954339981079, "learning_rate": 7.0218485169491534e-06, "loss": 1.1277, "mean_token_accuracy": 0.7231661528348923, "num_tokens": 18097333.0, "step": 22492 }, { "epoch": 5.9570974576271185, "grad_norm": 3.3150253295898438, "learning_rate": 7.021583686440678e-06, "loss": 1.0724, "mean_token_accuracy": 0.7478784397244453, "num_tokens": 18098810.0, "step": 22494 }, { "epoch": 5.9576271186440675, "grad_norm": 1.445358395576477, "learning_rate": 7.021318855932204e-06, "loss": 1.0265, "mean_token_accuracy": 0.7556019425392151, "num_tokens": 18101077.0, "step": 22496 }, { "epoch": 5.958156779661017, "grad_norm": 2.242792844772339, "learning_rate": 7.021054025423729e-06, "loss": 1.0838, "mean_token_accuracy": 0.7556872814893723, "num_tokens": 18102552.0, "step": 22498 }, { "epoch": 5.958686440677966, "grad_norm": 2.588386058807373, "learning_rate": 7.020789194915255e-06, "loss": 1.5394, "step": 22500 }, { "epoch": 5.958686440677966, "eval_loss": 1.3180700540542603, "eval_mean_token_accuracy": 0.7007987554390709, "eval_num_tokens": 18104101.0, "eval_runtime": 48.2867, "eval_samples_per_second": 6.379, "eval_steps_per_second": 6.379, "step": 22500 }, { "epoch": 5.959216101694915, "grad_norm": 1.6880937814712524, "learning_rate": 7.02052436440678e-06, "loss": 1.053, "mean_token_accuracy": 0.6991773061454296, "num_tokens": 18105842.0, "step": 22502 }, { "epoch": 5.959745762711864, "grad_norm": 1.9440444707870483, "learning_rate": 7.0202595338983055e-06, "loss": 1.1544, "mean_token_accuracy": 0.7289810851216316, "num_tokens": 18107802.0, "step": 22504 }, { "epoch": 5.960275423728813, "grad_norm": 2.2604362964630127, "learning_rate": 7.0199947033898305e-06, "loss": 1.1599, "mean_token_accuracy": 0.7284806296229362, "num_tokens": 18109337.0, "step": 22506 }, { "epoch": 5.960805084745763, "grad_norm": 2.28273344039917, "learning_rate": 7.019729872881357e-06, "loss": 1.8371, "mean_token_accuracy": 0.6189523339271545, "num_tokens": 18111243.0, "step": 22508 }, { "epoch": 5.961334745762712, "grad_norm": 2.1529133319854736, "learning_rate": 7.019465042372882e-06, "loss": 0.9023, "mean_token_accuracy": 0.7707796320319176, "num_tokens": 18112732.0, "step": 22510 }, { "epoch": 5.961864406779661, "grad_norm": 1.876513957977295, "learning_rate": 7.019200211864408e-06, "loss": 1.454, "mean_token_accuracy": 0.6673625409603119, "num_tokens": 18114468.0, "step": 22512 }, { "epoch": 5.96239406779661, "grad_norm": 1.7680336236953735, "learning_rate": 7.018935381355933e-06, "loss": 0.6871, "mean_token_accuracy": 0.8150873184204102, "num_tokens": 18116305.0, "step": 22514 }, { "epoch": 5.96292372881356, "grad_norm": 2.2424185276031494, "learning_rate": 7.0186705508474585e-06, "loss": 0.9998, "mean_token_accuracy": 0.776114210486412, "num_tokens": 18117958.0, "step": 22516 }, { "epoch": 5.963453389830509, "grad_norm": 2.0774545669555664, "learning_rate": 7.0184057203389834e-06, "loss": 1.1853, "mean_token_accuracy": 0.7014221251010895, "num_tokens": 18119624.0, "step": 22518 }, { "epoch": 5.963983050847458, "grad_norm": 2.358215808868408, "learning_rate": 7.018140889830509e-06, "loss": 0.9169, "mean_token_accuracy": 0.7773303166031837, "num_tokens": 18121068.0, "step": 22520 }, { "epoch": 5.964512711864407, "grad_norm": 2.432187795639038, "learning_rate": 7.017876059322034e-06, "loss": 1.0834, "mean_token_accuracy": 0.7469252124428749, "num_tokens": 18122452.0, "step": 22522 }, { "epoch": 5.965042372881356, "grad_norm": 2.2954277992248535, "learning_rate": 7.01761122881356e-06, "loss": 1.2689, "mean_token_accuracy": 0.7010012157261372, "num_tokens": 18124048.0, "step": 22524 }, { "epoch": 5.965572033898305, "grad_norm": 1.4366703033447266, "learning_rate": 7.017346398305085e-06, "loss": 0.7651, "mean_token_accuracy": 0.7955350428819656, "num_tokens": 18126230.0, "step": 22526 }, { "epoch": 5.966101694915254, "grad_norm": 2.1993043422698975, "learning_rate": 7.017081567796611e-06, "loss": 1.2394, "mean_token_accuracy": 0.7176395803689957, "num_tokens": 18127871.0, "step": 22528 }, { "epoch": 5.966631355932203, "grad_norm": 2.3565773963928223, "learning_rate": 7.0168167372881356e-06, "loss": 1.4257, "mean_token_accuracy": 0.6906553357839584, "num_tokens": 18129379.0, "step": 22530 }, { "epoch": 5.967161016949152, "grad_norm": 2.204989433288574, "learning_rate": 7.016551906779661e-06, "loss": 0.8936, "mean_token_accuracy": 0.7960852757096291, "num_tokens": 18130908.0, "step": 22532 }, { "epoch": 5.967690677966102, "grad_norm": 2.0901072025299072, "learning_rate": 7.016287076271186e-06, "loss": 1.137, "mean_token_accuracy": 0.711301825940609, "num_tokens": 18132541.0, "step": 22534 }, { "epoch": 5.968220338983051, "grad_norm": 2.358306884765625, "learning_rate": 7.016022245762713e-06, "loss": 0.9078, "mean_token_accuracy": 0.7774452865123749, "num_tokens": 18134309.0, "step": 22536 }, { "epoch": 5.96875, "grad_norm": 2.3932454586029053, "learning_rate": 7.015757415254237e-06, "loss": 1.1688, "mean_token_accuracy": 0.7272191941738129, "num_tokens": 18135828.0, "step": 22538 }, { "epoch": 5.969279661016949, "grad_norm": 2.122649908065796, "learning_rate": 7.015492584745764e-06, "loss": 1.2693, "mean_token_accuracy": 0.719365194439888, "num_tokens": 18137180.0, "step": 22540 }, { "epoch": 5.969809322033898, "grad_norm": 1.8932642936706543, "learning_rate": 7.0152277542372885e-06, "loss": 1.0319, "mean_token_accuracy": 0.7600767090916634, "num_tokens": 18138869.0, "step": 22542 }, { "epoch": 5.970338983050848, "grad_norm": 2.1748623847961426, "learning_rate": 7.014962923728814e-06, "loss": 1.2841, "mean_token_accuracy": 0.7274311929941177, "num_tokens": 18140385.0, "step": 22544 }, { "epoch": 5.970868644067797, "grad_norm": 2.5762264728546143, "learning_rate": 7.014698093220339e-06, "loss": 1.5519, "mean_token_accuracy": 0.6900979466736317, "num_tokens": 18141997.0, "step": 22546 }, { "epoch": 5.971398305084746, "grad_norm": 1.7930362224578857, "learning_rate": 7.014433262711865e-06, "loss": 0.8727, "mean_token_accuracy": 0.7852754071354866, "num_tokens": 18143446.0, "step": 22548 }, { "epoch": 5.971927966101695, "grad_norm": 2.635610580444336, "learning_rate": 7.014168432203391e-06, "loss": 1.5626, "mean_token_accuracy": 0.6369065791368484, "num_tokens": 18145193.0, "step": 22550 }, { "epoch": 5.972457627118644, "grad_norm": 2.3862695693969727, "learning_rate": 7.013903601694916e-06, "loss": 0.955, "mean_token_accuracy": 0.7813796848058701, "num_tokens": 18146884.0, "step": 22552 }, { "epoch": 5.972987288135593, "grad_norm": 2.1966896057128906, "learning_rate": 7.0136387711864415e-06, "loss": 1.0761, "mean_token_accuracy": 0.7302554696798325, "num_tokens": 18148540.0, "step": 22554 }, { "epoch": 5.973516949152542, "grad_norm": 2.5234084129333496, "learning_rate": 7.013373940677966e-06, "loss": 1.3595, "mean_token_accuracy": 0.7142247930169106, "num_tokens": 18149877.0, "step": 22556 }, { "epoch": 5.974046610169491, "grad_norm": 1.885406494140625, "learning_rate": 7.013109110169492e-06, "loss": 0.605, "mean_token_accuracy": 0.8439085856080055, "num_tokens": 18151253.0, "step": 22558 }, { "epoch": 5.97457627118644, "grad_norm": 2.3317861557006836, "learning_rate": 7.012844279661017e-06, "loss": 1.1843, "mean_token_accuracy": 0.7295903638005257, "num_tokens": 18153033.0, "step": 22560 }, { "epoch": 5.97510593220339, "grad_norm": 2.671567678451538, "learning_rate": 7.012579449152544e-06, "loss": 0.9967, "mean_token_accuracy": 0.7528446912765503, "num_tokens": 18154600.0, "step": 22562 }, { "epoch": 5.975635593220339, "grad_norm": 2.281841278076172, "learning_rate": 7.012314618644069e-06, "loss": 1.3587, "mean_token_accuracy": 0.694606326520443, "num_tokens": 18156179.0, "step": 22564 }, { "epoch": 5.976165254237288, "grad_norm": 1.7874010801315308, "learning_rate": 7.0120497881355944e-06, "loss": 0.7122, "mean_token_accuracy": 0.8075847774744034, "num_tokens": 18157719.0, "step": 22566 }, { "epoch": 5.976694915254237, "grad_norm": 2.0247890949249268, "learning_rate": 7.011784957627119e-06, "loss": 1.1686, "mean_token_accuracy": 0.7376639917492867, "num_tokens": 18159440.0, "step": 22568 }, { "epoch": 5.977224576271187, "grad_norm": 2.7542660236358643, "learning_rate": 7.011520127118645e-06, "loss": 1.4536, "mean_token_accuracy": 0.6752387136220932, "num_tokens": 18160834.0, "step": 22570 }, { "epoch": 5.977754237288136, "grad_norm": 1.89290452003479, "learning_rate": 7.01125529661017e-06, "loss": 0.866, "mean_token_accuracy": 0.7730820551514626, "num_tokens": 18162420.0, "step": 22572 }, { "epoch": 5.978283898305085, "grad_norm": 2.113407611846924, "learning_rate": 7.010990466101696e-06, "loss": 1.2617, "mean_token_accuracy": 0.689593005925417, "num_tokens": 18163813.0, "step": 22574 }, { "epoch": 5.978813559322034, "grad_norm": 1.8781377077102661, "learning_rate": 7.010725635593221e-06, "loss": 1.1426, "mean_token_accuracy": 0.7422667518258095, "num_tokens": 18165472.0, "step": 22576 }, { "epoch": 5.979343220338983, "grad_norm": 2.0801620483398438, "learning_rate": 7.0104608050847466e-06, "loss": 1.279, "mean_token_accuracy": 0.6975927278399467, "num_tokens": 18167093.0, "step": 22578 }, { "epoch": 5.9798728813559325, "grad_norm": 2.4469692707061768, "learning_rate": 7.0101959745762715e-06, "loss": 1.2565, "mean_token_accuracy": 0.7132413387298584, "num_tokens": 18168602.0, "step": 22580 }, { "epoch": 5.9804025423728815, "grad_norm": 2.3317322731018066, "learning_rate": 7.009931144067797e-06, "loss": 1.4193, "mean_token_accuracy": 0.715310163795948, "num_tokens": 18170257.0, "step": 22582 }, { "epoch": 5.9809322033898304, "grad_norm": 2.1603963375091553, "learning_rate": 7.009666313559322e-06, "loss": 1.5961, "mean_token_accuracy": 0.6551237627863884, "num_tokens": 18171893.0, "step": 22584 }, { "epoch": 5.981461864406779, "grad_norm": 2.692711591720581, "learning_rate": 7.009401483050848e-06, "loss": 1.5313, "mean_token_accuracy": 0.6689271703362465, "num_tokens": 18173397.0, "step": 22586 }, { "epoch": 5.981991525423728, "grad_norm": 2.379786968231201, "learning_rate": 7.009136652542373e-06, "loss": 1.2311, "mean_token_accuracy": 0.7058451473712921, "num_tokens": 18175025.0, "step": 22588 }, { "epoch": 5.982521186440678, "grad_norm": 1.6935123205184937, "learning_rate": 7.0088718220338995e-06, "loss": 1.1235, "mean_token_accuracy": 0.721828430891037, "num_tokens": 18176858.0, "step": 22590 }, { "epoch": 5.983050847457627, "grad_norm": 2.152801513671875, "learning_rate": 7.008606991525424e-06, "loss": 1.0796, "mean_token_accuracy": 0.7291380614042282, "num_tokens": 18178394.0, "step": 22592 }, { "epoch": 5.983580508474576, "grad_norm": 2.440720319747925, "learning_rate": 7.00834216101695e-06, "loss": 1.1188, "mean_token_accuracy": 0.7428726106882095, "num_tokens": 18180115.0, "step": 22594 }, { "epoch": 5.984110169491525, "grad_norm": 2.1242077350616455, "learning_rate": 7.008077330508475e-06, "loss": 1.2108, "mean_token_accuracy": 0.7129691392183304, "num_tokens": 18181503.0, "step": 22596 }, { "epoch": 5.984639830508475, "grad_norm": 2.37969970703125, "learning_rate": 7.007812500000001e-06, "loss": 1.5437, "mean_token_accuracy": 0.6459061615169048, "num_tokens": 18182925.0, "step": 22598 }, { "epoch": 5.985169491525424, "grad_norm": 2.5697946548461914, "learning_rate": 7.007547669491526e-06, "loss": 1.412, "mean_token_accuracy": 0.7200446724891663, "num_tokens": 18184413.0, "step": 22600 }, { "epoch": 5.985699152542373, "grad_norm": 2.173365354537964, "learning_rate": 7.007282838983052e-06, "loss": 1.3041, "mean_token_accuracy": 0.6883357912302017, "num_tokens": 18186420.0, "step": 22602 }, { "epoch": 5.986228813559322, "grad_norm": 2.117647409439087, "learning_rate": 7.0070180084745766e-06, "loss": 0.8368, "mean_token_accuracy": 0.7774275243282318, "num_tokens": 18188078.0, "step": 22604 }, { "epoch": 5.986758474576272, "grad_norm": 2.424691915512085, "learning_rate": 7.006753177966102e-06, "loss": 1.414, "mean_token_accuracy": 0.6898007281124592, "num_tokens": 18189643.0, "step": 22606 }, { "epoch": 5.987288135593221, "grad_norm": 2.187986373901367, "learning_rate": 7.006488347457627e-06, "loss": 1.1939, "mean_token_accuracy": 0.7110524252057076, "num_tokens": 18191380.0, "step": 22608 }, { "epoch": 5.9878177966101696, "grad_norm": 2.3782641887664795, "learning_rate": 7.006223516949153e-06, "loss": 1.2283, "mean_token_accuracy": 0.7390419915318489, "num_tokens": 18192839.0, "step": 22610 }, { "epoch": 5.9883474576271185, "grad_norm": 2.057180643081665, "learning_rate": 7.005958686440678e-06, "loss": 1.0651, "mean_token_accuracy": 0.7191437035799026, "num_tokens": 18194469.0, "step": 22612 }, { "epoch": 5.9888771186440675, "grad_norm": 2.032348871231079, "learning_rate": 7.005693855932204e-06, "loss": 0.9509, "mean_token_accuracy": 0.76231749355793, "num_tokens": 18195989.0, "step": 22614 }, { "epoch": 5.989406779661017, "grad_norm": 2.5503880977630615, "learning_rate": 7.005429025423729e-06, "loss": 1.3907, "mean_token_accuracy": 0.6965856477618217, "num_tokens": 18197744.0, "step": 22616 }, { "epoch": 5.989936440677966, "grad_norm": 2.4008498191833496, "learning_rate": 7.005164194915255e-06, "loss": 1.4904, "mean_token_accuracy": 0.6513817682862282, "num_tokens": 18199452.0, "step": 22618 }, { "epoch": 5.990466101694915, "grad_norm": 2.4767816066741943, "learning_rate": 7.004899364406779e-06, "loss": 1.3166, "mean_token_accuracy": 0.7019655182957649, "num_tokens": 18200894.0, "step": 22620 }, { "epoch": 5.990995762711864, "grad_norm": 2.460340738296509, "learning_rate": 7.004634533898306e-06, "loss": 1.0076, "mean_token_accuracy": 0.7652626037597656, "num_tokens": 18202387.0, "step": 22622 }, { "epoch": 5.991525423728813, "grad_norm": 1.9899178743362427, "learning_rate": 7.004369703389831e-06, "loss": 0.6864, "mean_token_accuracy": 0.8102174401283264, "num_tokens": 18204081.0, "step": 22624 }, { "epoch": 5.992055084745763, "grad_norm": 2.1462199687957764, "learning_rate": 7.004104872881357e-06, "loss": 1.1572, "mean_token_accuracy": 0.7398098036646843, "num_tokens": 18205502.0, "step": 22626 }, { "epoch": 5.992584745762712, "grad_norm": 1.845000982284546, "learning_rate": 7.003840042372882e-06, "loss": 0.7829, "mean_token_accuracy": 0.7997261807322502, "num_tokens": 18207105.0, "step": 22628 }, { "epoch": 5.993114406779661, "grad_norm": 2.3263754844665527, "learning_rate": 7.003575211864407e-06, "loss": 1.2319, "mean_token_accuracy": 0.7173425555229187, "num_tokens": 18208685.0, "step": 22630 }, { "epoch": 5.99364406779661, "grad_norm": 2.621673822402954, "learning_rate": 7.003310381355933e-06, "loss": 1.2992, "mean_token_accuracy": 0.6951447278261185, "num_tokens": 18210187.0, "step": 22632 }, { "epoch": 5.99417372881356, "grad_norm": 1.923411250114441, "learning_rate": 7.003045550847458e-06, "loss": 1.5356, "mean_token_accuracy": 0.6761213392019272, "num_tokens": 18212012.0, "step": 22634 }, { "epoch": 5.994703389830509, "grad_norm": 2.0150039196014404, "learning_rate": 7.002780720338984e-06, "loss": 1.4354, "mean_token_accuracy": 0.6940585672855377, "num_tokens": 18213542.0, "step": 22636 }, { "epoch": 5.995233050847458, "grad_norm": 2.435159206390381, "learning_rate": 7.002515889830509e-06, "loss": 1.6746, "mean_token_accuracy": 0.6265233755111694, "num_tokens": 18215353.0, "step": 22638 }, { "epoch": 5.995762711864407, "grad_norm": 2.256355047225952, "learning_rate": 7.002251059322035e-06, "loss": 1.0477, "mean_token_accuracy": 0.7445112317800522, "num_tokens": 18216628.0, "step": 22640 }, { "epoch": 5.996292372881356, "grad_norm": 2.2172505855560303, "learning_rate": 7.0019862288135595e-06, "loss": 1.2441, "mean_token_accuracy": 0.7316421568393707, "num_tokens": 18218218.0, "step": 22642 }, { "epoch": 5.996822033898305, "grad_norm": 1.5688223838806152, "learning_rate": 7.001721398305086e-06, "loss": 1.2275, "mean_token_accuracy": 0.7246994078159332, "num_tokens": 18219879.0, "step": 22644 }, { "epoch": 5.997351694915254, "grad_norm": 2.39277982711792, "learning_rate": 7.00145656779661e-06, "loss": 1.206, "mean_token_accuracy": 0.7137137800455093, "num_tokens": 18221380.0, "step": 22646 }, { "epoch": 5.997881355932203, "grad_norm": 2.1924591064453125, "learning_rate": 7.001191737288137e-06, "loss": 1.1469, "mean_token_accuracy": 0.7117047980427742, "num_tokens": 18222780.0, "step": 22648 }, { "epoch": 5.998411016949152, "grad_norm": 2.979942560195923, "learning_rate": 7.000926906779662e-06, "loss": 1.6958, "mean_token_accuracy": 0.6324730850756168, "num_tokens": 18224213.0, "step": 22650 }, { "epoch": 5.998940677966102, "grad_norm": 2.109334707260132, "learning_rate": 7.0006620762711876e-06, "loss": 1.3781, "mean_token_accuracy": 0.6662634983658791, "num_tokens": 18226073.0, "step": 22652 }, { "epoch": 5.999470338983051, "grad_norm": 2.5166537761688232, "learning_rate": 7.0003972457627125e-06, "loss": 1.805, "mean_token_accuracy": 0.6030933745205402, "num_tokens": 18227814.0, "step": 22654 }, { "epoch": 6.0, "grad_norm": 1.7796761989593506, "learning_rate": 7.000132415254238e-06, "loss": 0.8719, "mean_token_accuracy": 0.7719196006655693, "num_tokens": 18229608.0, "step": 22656 }, { "epoch": 6.000529661016949, "grad_norm": 1.9943832159042358, "learning_rate": 6.999867584745763e-06, "loss": 1.1464, "mean_token_accuracy": 0.7329474464058876, "num_tokens": 18231007.0, "step": 22658 }, { "epoch": 6.001059322033898, "grad_norm": 2.0087852478027344, "learning_rate": 6.999602754237289e-06, "loss": 1.0406, "mean_token_accuracy": 0.744946613907814, "num_tokens": 18232549.0, "step": 22660 }, { "epoch": 6.001588983050848, "grad_norm": 2.369274616241455, "learning_rate": 6.999337923728814e-06, "loss": 1.306, "mean_token_accuracy": 0.7031492963433266, "num_tokens": 18234160.0, "step": 22662 }, { "epoch": 6.002118644067797, "grad_norm": 2.146658182144165, "learning_rate": 6.99907309322034e-06, "loss": 1.3508, "mean_token_accuracy": 0.7164764851331711, "num_tokens": 18235830.0, "step": 22664 }, { "epoch": 6.002648305084746, "grad_norm": 1.733798623085022, "learning_rate": 6.998808262711865e-06, "loss": 1.1141, "mean_token_accuracy": 0.7447488233447075, "num_tokens": 18237469.0, "step": 22666 }, { "epoch": 6.003177966101695, "grad_norm": 1.7040932178497314, "learning_rate": 6.99854343220339e-06, "loss": 1.1355, "mean_token_accuracy": 0.723371833562851, "num_tokens": 18239723.0, "step": 22668 }, { "epoch": 6.0037076271186445, "grad_norm": 2.1916046142578125, "learning_rate": 6.998278601694915e-06, "loss": 1.274, "mean_token_accuracy": 0.69710274040699, "num_tokens": 18241351.0, "step": 22670 }, { "epoch": 6.004237288135593, "grad_norm": 2.0840508937835693, "learning_rate": 6.998013771186442e-06, "loss": 1.2093, "mean_token_accuracy": 0.7070393040776253, "num_tokens": 18243201.0, "step": 22672 }, { "epoch": 6.004766949152542, "grad_norm": 2.0436949729919434, "learning_rate": 6.997748940677966e-06, "loss": 0.9596, "mean_token_accuracy": 0.7579454854130745, "num_tokens": 18244654.0, "step": 22674 }, { "epoch": 6.005296610169491, "grad_norm": 1.8243762254714966, "learning_rate": 6.997484110169493e-06, "loss": 1.1873, "mean_token_accuracy": 0.7050524353981018, "num_tokens": 18246496.0, "step": 22676 }, { "epoch": 6.00582627118644, "grad_norm": 2.397231340408325, "learning_rate": 6.997219279661018e-06, "loss": 1.5734, "mean_token_accuracy": 0.665876641869545, "num_tokens": 18248006.0, "step": 22678 }, { "epoch": 6.00635593220339, "grad_norm": 2.0263092517852783, "learning_rate": 6.996954449152543e-06, "loss": 1.2695, "mean_token_accuracy": 0.7144990377128124, "num_tokens": 18249979.0, "step": 22680 }, { "epoch": 6.006885593220339, "grad_norm": 2.0098319053649902, "learning_rate": 6.996689618644068e-06, "loss": 1.2212, "mean_token_accuracy": 0.7172125577926636, "num_tokens": 18251929.0, "step": 22682 }, { "epoch": 6.007415254237288, "grad_norm": 2.3181283473968506, "learning_rate": 6.996424788135594e-06, "loss": 1.0421, "mean_token_accuracy": 0.7508560344576836, "num_tokens": 18253642.0, "step": 22684 }, { "epoch": 6.007944915254237, "grad_norm": 2.0539493560791016, "learning_rate": 6.996159957627119e-06, "loss": 1.2291, "mean_token_accuracy": 0.7339063882827759, "num_tokens": 18254913.0, "step": 22686 }, { "epoch": 6.008474576271187, "grad_norm": 2.078113555908203, "learning_rate": 6.995895127118645e-06, "loss": 1.0985, "mean_token_accuracy": 0.7526018023490906, "num_tokens": 18256469.0, "step": 22688 }, { "epoch": 6.009004237288136, "grad_norm": 2.28818941116333, "learning_rate": 6.99563029661017e-06, "loss": 1.1796, "mean_token_accuracy": 0.7095489650964737, "num_tokens": 18258036.0, "step": 22690 }, { "epoch": 6.009533898305085, "grad_norm": 1.8740397691726685, "learning_rate": 6.9953654661016955e-06, "loss": 1.0637, "mean_token_accuracy": 0.7763780355453491, "num_tokens": 18259752.0, "step": 22692 }, { "epoch": 6.010063559322034, "grad_norm": 2.2415971755981445, "learning_rate": 6.99510063559322e-06, "loss": 1.4371, "mean_token_accuracy": 0.6756591238081455, "num_tokens": 18261502.0, "step": 22694 }, { "epoch": 6.010593220338983, "grad_norm": 2.2832629680633545, "learning_rate": 6.994835805084746e-06, "loss": 1.239, "mean_token_accuracy": 0.7237313911318779, "num_tokens": 18263119.0, "step": 22696 }, { "epoch": 6.0111228813559325, "grad_norm": 2.0522959232330322, "learning_rate": 6.994570974576271e-06, "loss": 1.2081, "mean_token_accuracy": 0.7283176481723785, "num_tokens": 18264824.0, "step": 22698 }, { "epoch": 6.0116525423728815, "grad_norm": 2.1911942958831787, "learning_rate": 6.994306144067797e-06, "loss": 1.2257, "mean_token_accuracy": 0.695076011121273, "num_tokens": 18266487.0, "step": 22700 }, { "epoch": 6.0121822033898304, "grad_norm": 1.8426592350006104, "learning_rate": 6.994041313559322e-06, "loss": 0.7689, "mean_token_accuracy": 0.8050297871232033, "num_tokens": 18268062.0, "step": 22702 }, { "epoch": 6.012711864406779, "grad_norm": 2.217257022857666, "learning_rate": 6.9937764830508484e-06, "loss": 1.1741, "mean_token_accuracy": 0.7124960944056511, "num_tokens": 18269937.0, "step": 22704 }, { "epoch": 6.013241525423729, "grad_norm": 2.251772165298462, "learning_rate": 6.993511652542373e-06, "loss": 1.5277, "mean_token_accuracy": 0.6588137000799179, "num_tokens": 18271745.0, "step": 22706 }, { "epoch": 6.013771186440678, "grad_norm": 2.643547296524048, "learning_rate": 6.993246822033899e-06, "loss": 1.8292, "mean_token_accuracy": 0.6284283548593521, "num_tokens": 18273180.0, "step": 22708 }, { "epoch": 6.014300847457627, "grad_norm": 2.119264841079712, "learning_rate": 6.992981991525424e-06, "loss": 0.99, "mean_token_accuracy": 0.7623980492353439, "num_tokens": 18274803.0, "step": 22710 }, { "epoch": 6.014830508474576, "grad_norm": 2.057939052581787, "learning_rate": 6.99271716101695e-06, "loss": 0.7236, "mean_token_accuracy": 0.7974887266755104, "num_tokens": 18276294.0, "step": 22712 }, { "epoch": 6.015360169491525, "grad_norm": 2.7859818935394287, "learning_rate": 6.992452330508475e-06, "loss": 1.3434, "mean_token_accuracy": 0.703583836555481, "num_tokens": 18277661.0, "step": 22714 }, { "epoch": 6.015889830508475, "grad_norm": 2.1956121921539307, "learning_rate": 6.9921875000000006e-06, "loss": 0.865, "mean_token_accuracy": 0.7763100862503052, "num_tokens": 18279104.0, "step": 22716 }, { "epoch": 6.016419491525424, "grad_norm": 2.1036016941070557, "learning_rate": 6.991922669491526e-06, "loss": 1.0996, "mean_token_accuracy": 0.7425906807184219, "num_tokens": 18280985.0, "step": 22718 }, { "epoch": 6.016949152542373, "grad_norm": 2.451605796813965, "learning_rate": 6.991657838983051e-06, "loss": 1.1487, "mean_token_accuracy": 0.753762312233448, "num_tokens": 18282566.0, "step": 22720 }, { "epoch": 6.017478813559322, "grad_norm": 2.710700511932373, "learning_rate": 6.991393008474577e-06, "loss": 1.3588, "mean_token_accuracy": 0.7242838516831398, "num_tokens": 18283983.0, "step": 22722 }, { "epoch": 6.018008474576271, "grad_norm": 2.4554543495178223, "learning_rate": 6.991128177966102e-06, "loss": 1.5612, "mean_token_accuracy": 0.6402595117688179, "num_tokens": 18285811.0, "step": 22724 }, { "epoch": 6.018538135593221, "grad_norm": 2.3347103595733643, "learning_rate": 6.990863347457629e-06, "loss": 1.3159, "mean_token_accuracy": 0.7105535566806793, "num_tokens": 18287667.0, "step": 22726 }, { "epoch": 6.0190677966101696, "grad_norm": 1.6082227230072021, "learning_rate": 6.990598516949153e-06, "loss": 0.8203, "mean_token_accuracy": 0.7478377521038055, "num_tokens": 18289978.0, "step": 22728 }, { "epoch": 6.0195974576271185, "grad_norm": 2.2274396419525146, "learning_rate": 6.990333686440679e-06, "loss": 1.2919, "mean_token_accuracy": 0.677167572081089, "num_tokens": 18291609.0, "step": 22730 }, { "epoch": 6.0201271186440675, "grad_norm": 1.9060184955596924, "learning_rate": 6.990068855932204e-06, "loss": 0.9529, "mean_token_accuracy": 0.7708428651094437, "num_tokens": 18293100.0, "step": 22732 }, { "epoch": 6.020656779661017, "grad_norm": 2.4964587688446045, "learning_rate": 6.98980402542373e-06, "loss": 0.9345, "mean_token_accuracy": 0.7561940103769302, "num_tokens": 18294427.0, "step": 22734 }, { "epoch": 6.021186440677966, "grad_norm": 2.32253098487854, "learning_rate": 6.989539194915255e-06, "loss": 1.3071, "mean_token_accuracy": 0.7003054171800613, "num_tokens": 18295978.0, "step": 22736 }, { "epoch": 6.021716101694915, "grad_norm": 2.400268793106079, "learning_rate": 6.989274364406781e-06, "loss": 1.3046, "mean_token_accuracy": 0.7117680460214615, "num_tokens": 18297445.0, "step": 22738 }, { "epoch": 6.022245762711864, "grad_norm": 2.338491916656494, "learning_rate": 6.989009533898306e-06, "loss": 1.1499, "mean_token_accuracy": 0.7351538985967636, "num_tokens": 18299010.0, "step": 22740 }, { "epoch": 6.022775423728813, "grad_norm": 2.636220932006836, "learning_rate": 6.988744703389831e-06, "loss": 1.4685, "mean_token_accuracy": 0.6678381636738777, "num_tokens": 18300494.0, "step": 22742 }, { "epoch": 6.023305084745763, "grad_norm": 2.271832227706909, "learning_rate": 6.988479872881356e-06, "loss": 1.3851, "mean_token_accuracy": 0.7116179168224335, "num_tokens": 18302092.0, "step": 22744 }, { "epoch": 6.023834745762712, "grad_norm": 2.2468090057373047, "learning_rate": 6.988215042372882e-06, "loss": 1.4161, "mean_token_accuracy": 0.6887936815619469, "num_tokens": 18303655.0, "step": 22746 }, { "epoch": 6.024364406779661, "grad_norm": 2.4091367721557617, "learning_rate": 6.987950211864407e-06, "loss": 1.168, "mean_token_accuracy": 0.7102344185113907, "num_tokens": 18305192.0, "step": 22748 }, { "epoch": 6.02489406779661, "grad_norm": 2.103024959564209, "learning_rate": 6.987685381355933e-06, "loss": 1.2635, "step": 22750 }, { "epoch": 6.02489406779661, "eval_loss": 1.3258458375930786, "eval_mean_token_accuracy": 0.7006417581593836, "eval_num_tokens": 18306855.0, "eval_runtime": 48.2286, "eval_samples_per_second": 6.386, "eval_steps_per_second": 6.386, "step": 22750 }, { "epoch": 6.02542372881356, "grad_norm": 2.5276851654052734, "learning_rate": 6.987420550847458e-06, "loss": 1.0365, "mean_token_accuracy": 0.7432645708322525, "num_tokens": 18308492.0, "step": 22752 }, { "epoch": 6.025953389830509, "grad_norm": 1.9954227209091187, "learning_rate": 6.9871557203389835e-06, "loss": 0.9423, "mean_token_accuracy": 0.7681030482053757, "num_tokens": 18310005.0, "step": 22754 }, { "epoch": 6.026483050847458, "grad_norm": 2.364145517349243, "learning_rate": 6.9868908898305085e-06, "loss": 1.1669, "mean_token_accuracy": 0.7457796558737755, "num_tokens": 18311620.0, "step": 22756 }, { "epoch": 6.027012711864407, "grad_norm": 2.5987675189971924, "learning_rate": 6.986626059322035e-06, "loss": 1.0188, "mean_token_accuracy": 0.7236737236380577, "num_tokens": 18313373.0, "step": 22758 }, { "epoch": 6.0275423728813555, "grad_norm": 2.305488348007202, "learning_rate": 6.98636122881356e-06, "loss": 1.1731, "mean_token_accuracy": 0.7281124591827393, "num_tokens": 18314909.0, "step": 22760 }, { "epoch": 6.028072033898305, "grad_norm": 2.9283955097198486, "learning_rate": 6.986096398305086e-06, "loss": 0.9373, "mean_token_accuracy": 0.771167129278183, "num_tokens": 18316376.0, "step": 22762 }, { "epoch": 6.028601694915254, "grad_norm": 2.2729320526123047, "learning_rate": 6.985831567796611e-06, "loss": 1.292, "mean_token_accuracy": 0.7162536084651947, "num_tokens": 18317996.0, "step": 22764 }, { "epoch": 6.029131355932203, "grad_norm": 1.7896947860717773, "learning_rate": 6.9855667372881365e-06, "loss": 1.0345, "mean_token_accuracy": 0.7664223462343216, "num_tokens": 18319501.0, "step": 22766 }, { "epoch": 6.029661016949152, "grad_norm": 2.543795585632324, "learning_rate": 6.985301906779661e-06, "loss": 1.2299, "mean_token_accuracy": 0.715446226298809, "num_tokens": 18321079.0, "step": 22768 }, { "epoch": 6.030190677966102, "grad_norm": 2.5967605113983154, "learning_rate": 6.985037076271187e-06, "loss": 1.582, "mean_token_accuracy": 0.6528720185160637, "num_tokens": 18322859.0, "step": 22770 }, { "epoch": 6.030720338983051, "grad_norm": 2.086276054382324, "learning_rate": 6.984772245762712e-06, "loss": 1.1782, "mean_token_accuracy": 0.7469120919704437, "num_tokens": 18324396.0, "step": 22772 }, { "epoch": 6.03125, "grad_norm": 2.64228892326355, "learning_rate": 6.984507415254238e-06, "loss": 1.3842, "mean_token_accuracy": 0.6875657141208649, "num_tokens": 18326036.0, "step": 22774 }, { "epoch": 6.031779661016949, "grad_norm": 1.9560275077819824, "learning_rate": 6.984242584745763e-06, "loss": 0.9784, "mean_token_accuracy": 0.7851512134075165, "num_tokens": 18327657.0, "step": 22776 }, { "epoch": 6.032309322033898, "grad_norm": 1.9951374530792236, "learning_rate": 6.983977754237289e-06, "loss": 1.642, "mean_token_accuracy": 0.622443899512291, "num_tokens": 18329869.0, "step": 22778 }, { "epoch": 6.032838983050848, "grad_norm": 1.8567440509796143, "learning_rate": 6.9837129237288135e-06, "loss": 1.2166, "mean_token_accuracy": 0.7186118736863136, "num_tokens": 18331811.0, "step": 22780 }, { "epoch": 6.033368644067797, "grad_norm": 1.9161722660064697, "learning_rate": 6.983448093220339e-06, "loss": 0.9962, "mean_token_accuracy": 0.782887801527977, "num_tokens": 18333282.0, "step": 22782 }, { "epoch": 6.033898305084746, "grad_norm": 2.607720375061035, "learning_rate": 6.983183262711864e-06, "loss": 1.0787, "mean_token_accuracy": 0.7274901568889618, "num_tokens": 18334912.0, "step": 22784 }, { "epoch": 6.034427966101695, "grad_norm": 2.188156843185425, "learning_rate": 6.982918432203391e-06, "loss": 1.1531, "mean_token_accuracy": 0.7112883478403091, "num_tokens": 18336476.0, "step": 22786 }, { "epoch": 6.0349576271186445, "grad_norm": 2.2533206939697266, "learning_rate": 6.982653601694915e-06, "loss": 1.2575, "mean_token_accuracy": 0.6890534833073616, "num_tokens": 18338024.0, "step": 22788 }, { "epoch": 6.035487288135593, "grad_norm": 2.4836130142211914, "learning_rate": 6.9823887711864416e-06, "loss": 1.3506, "mean_token_accuracy": 0.708750918507576, "num_tokens": 18339645.0, "step": 22790 }, { "epoch": 6.036016949152542, "grad_norm": 2.0082719326019287, "learning_rate": 6.9821239406779665e-06, "loss": 1.0421, "mean_token_accuracy": 0.7707682028412819, "num_tokens": 18341307.0, "step": 22792 }, { "epoch": 6.036546610169491, "grad_norm": 1.9093612432479858, "learning_rate": 6.981859110169492e-06, "loss": 0.9647, "mean_token_accuracy": 0.7877046763896942, "num_tokens": 18342777.0, "step": 22794 }, { "epoch": 6.03707627118644, "grad_norm": 2.8073575496673584, "learning_rate": 6.981594279661017e-06, "loss": 1.2431, "mean_token_accuracy": 0.7105376496911049, "num_tokens": 18344112.0, "step": 22796 }, { "epoch": 6.03760593220339, "grad_norm": 2.4839324951171875, "learning_rate": 6.981329449152543e-06, "loss": 1.2767, "mean_token_accuracy": 0.7231165319681168, "num_tokens": 18345846.0, "step": 22798 }, { "epoch": 6.038135593220339, "grad_norm": 2.7469959259033203, "learning_rate": 6.981064618644068e-06, "loss": 1.7011, "mean_token_accuracy": 0.6193550825119019, "num_tokens": 18347410.0, "step": 22800 }, { "epoch": 6.038665254237288, "grad_norm": 2.064633369445801, "learning_rate": 6.980799788135594e-06, "loss": 1.3111, "mean_token_accuracy": 0.6968388110399246, "num_tokens": 18349192.0, "step": 22802 }, { "epoch": 6.039194915254237, "grad_norm": 1.7935659885406494, "learning_rate": 6.9805349576271195e-06, "loss": 1.0354, "mean_token_accuracy": 0.7671541571617126, "num_tokens": 18350891.0, "step": 22804 }, { "epoch": 6.039724576271187, "grad_norm": 1.9025124311447144, "learning_rate": 6.980270127118644e-06, "loss": 0.9179, "mean_token_accuracy": 0.76359523832798, "num_tokens": 18352716.0, "step": 22806 }, { "epoch": 6.040254237288136, "grad_norm": 2.3941147327423096, "learning_rate": 6.98000529661017e-06, "loss": 1.1676, "mean_token_accuracy": 0.7032816708087921, "num_tokens": 18354320.0, "step": 22808 }, { "epoch": 6.040783898305085, "grad_norm": 2.0552570819854736, "learning_rate": 6.979740466101695e-06, "loss": 1.135, "mean_token_accuracy": 0.7328610569238663, "num_tokens": 18355824.0, "step": 22810 }, { "epoch": 6.041313559322034, "grad_norm": 1.9562045335769653, "learning_rate": 6.979475635593222e-06, "loss": 0.9034, "mean_token_accuracy": 0.769096240401268, "num_tokens": 18357485.0, "step": 22812 }, { "epoch": 6.041843220338983, "grad_norm": 2.7125720977783203, "learning_rate": 6.979210805084747e-06, "loss": 1.323, "mean_token_accuracy": 0.7151872590184212, "num_tokens": 18359051.0, "step": 22814 }, { "epoch": 6.0423728813559325, "grad_norm": 2.020531177520752, "learning_rate": 6.978945974576272e-06, "loss": 0.9463, "mean_token_accuracy": 0.7627162039279938, "num_tokens": 18360476.0, "step": 22816 }, { "epoch": 6.0429025423728815, "grad_norm": 2.3622827529907227, "learning_rate": 6.978681144067797e-06, "loss": 1.3887, "mean_token_accuracy": 0.6959055364131927, "num_tokens": 18362319.0, "step": 22818 }, { "epoch": 6.0434322033898304, "grad_norm": 2.240086317062378, "learning_rate": 6.978416313559323e-06, "loss": 1.271, "mean_token_accuracy": 0.6950574442744255, "num_tokens": 18363899.0, "step": 22820 }, { "epoch": 6.043961864406779, "grad_norm": 1.983669400215149, "learning_rate": 6.978151483050848e-06, "loss": 1.0362, "mean_token_accuracy": 0.7526394799351692, "num_tokens": 18365414.0, "step": 22822 }, { "epoch": 6.044491525423729, "grad_norm": 2.0609216690063477, "learning_rate": 6.977886652542374e-06, "loss": 0.9128, "mean_token_accuracy": 0.7938492596149445, "num_tokens": 18366945.0, "step": 22824 }, { "epoch": 6.045021186440678, "grad_norm": 1.8053810596466064, "learning_rate": 6.977621822033899e-06, "loss": 1.1763, "mean_token_accuracy": 0.7236929386854172, "num_tokens": 18368842.0, "step": 22826 }, { "epoch": 6.045550847457627, "grad_norm": 2.5039010047912598, "learning_rate": 6.9773569915254245e-06, "loss": 1.2154, "mean_token_accuracy": 0.7086671143770218, "num_tokens": 18370567.0, "step": 22828 }, { "epoch": 6.046080508474576, "grad_norm": 2.0842068195343018, "learning_rate": 6.9770921610169495e-06, "loss": 1.2899, "mean_token_accuracy": 0.6855890154838562, "num_tokens": 18372294.0, "step": 22830 }, { "epoch": 6.046610169491525, "grad_norm": 2.0789926052093506, "learning_rate": 6.976827330508475e-06, "loss": 1.2114, "mean_token_accuracy": 0.7017146199941635, "num_tokens": 18374012.0, "step": 22832 }, { "epoch": 6.047139830508475, "grad_norm": 2.5846261978149414, "learning_rate": 6.9765625e-06, "loss": 1.0235, "mean_token_accuracy": 0.7558225691318512, "num_tokens": 18375341.0, "step": 22834 }, { "epoch": 6.047669491525424, "grad_norm": 2.2998969554901123, "learning_rate": 6.976297669491526e-06, "loss": 1.0887, "mean_token_accuracy": 0.7610421776771545, "num_tokens": 18377186.0, "step": 22836 }, { "epoch": 6.048199152542373, "grad_norm": 2.5620880126953125, "learning_rate": 6.976032838983051e-06, "loss": 1.0933, "mean_token_accuracy": 0.7499971836805344, "num_tokens": 18378664.0, "step": 22838 }, { "epoch": 6.048728813559322, "grad_norm": 2.3473098278045654, "learning_rate": 6.9757680084745775e-06, "loss": 0.874, "mean_token_accuracy": 0.783006563782692, "num_tokens": 18380308.0, "step": 22840 }, { "epoch": 6.049258474576271, "grad_norm": 2.433145523071289, "learning_rate": 6.975503177966102e-06, "loss": 1.6206, "mean_token_accuracy": 0.6459216251969337, "num_tokens": 18382138.0, "step": 22842 }, { "epoch": 6.049788135593221, "grad_norm": 2.4912502765655518, "learning_rate": 6.975238347457628e-06, "loss": 1.3839, "mean_token_accuracy": 0.6727806404232979, "num_tokens": 18383785.0, "step": 22844 }, { "epoch": 6.0503177966101696, "grad_norm": 2.4496002197265625, "learning_rate": 6.974973516949153e-06, "loss": 1.3605, "mean_token_accuracy": 0.707860991358757, "num_tokens": 18385358.0, "step": 22846 }, { "epoch": 6.0508474576271185, "grad_norm": 2.2945544719696045, "learning_rate": 6.974708686440679e-06, "loss": 1.0568, "mean_token_accuracy": 0.7322108000516891, "num_tokens": 18387100.0, "step": 22848 }, { "epoch": 6.0513771186440675, "grad_norm": 3.026240348815918, "learning_rate": 6.974443855932204e-06, "loss": 1.4999, "mean_token_accuracy": 0.6612568572163582, "num_tokens": 18388822.0, "step": 22850 }, { "epoch": 6.051906779661017, "grad_norm": 2.0029983520507812, "learning_rate": 6.97417902542373e-06, "loss": 1.0617, "mean_token_accuracy": 0.7327733933925629, "num_tokens": 18390755.0, "step": 22852 }, { "epoch": 6.052436440677966, "grad_norm": 2.5362062454223633, "learning_rate": 6.9739141949152545e-06, "loss": 1.535, "mean_token_accuracy": 0.6581420972943306, "num_tokens": 18392293.0, "step": 22854 }, { "epoch": 6.052966101694915, "grad_norm": 2.668416738510132, "learning_rate": 6.97364936440678e-06, "loss": 1.1059, "mean_token_accuracy": 0.7215132340788841, "num_tokens": 18393627.0, "step": 22856 }, { "epoch": 6.053495762711864, "grad_norm": 2.6715686321258545, "learning_rate": 6.973384533898305e-06, "loss": 1.3456, "mean_token_accuracy": 0.6825295090675354, "num_tokens": 18395209.0, "step": 22858 }, { "epoch": 6.054025423728813, "grad_norm": 2.395549774169922, "learning_rate": 6.973119703389831e-06, "loss": 1.6302, "mean_token_accuracy": 0.6328653246164322, "num_tokens": 18396812.0, "step": 22860 }, { "epoch": 6.054555084745763, "grad_norm": 2.243114948272705, "learning_rate": 6.972854872881356e-06, "loss": 1.3209, "mean_token_accuracy": 0.6738615036010742, "num_tokens": 18398665.0, "step": 22862 }, { "epoch": 6.055084745762712, "grad_norm": 2.2951955795288086, "learning_rate": 6.972590042372882e-06, "loss": 0.9826, "mean_token_accuracy": 0.7720101997256279, "num_tokens": 18400274.0, "step": 22864 }, { "epoch": 6.055614406779661, "grad_norm": 2.6517679691314697, "learning_rate": 6.972325211864407e-06, "loss": 1.1502, "mean_token_accuracy": 0.7488043680787086, "num_tokens": 18401480.0, "step": 22866 }, { "epoch": 6.05614406779661, "grad_norm": 2.1543145179748535, "learning_rate": 6.972060381355933e-06, "loss": 1.4859, "mean_token_accuracy": 0.6853055581450462, "num_tokens": 18403285.0, "step": 22868 }, { "epoch": 6.05667372881356, "grad_norm": 2.572117805480957, "learning_rate": 6.971795550847457e-06, "loss": 1.4113, "mean_token_accuracy": 0.6726323589682579, "num_tokens": 18404944.0, "step": 22870 }, { "epoch": 6.057203389830509, "grad_norm": 2.523709774017334, "learning_rate": 6.971530720338984e-06, "loss": 1.2443, "mean_token_accuracy": 0.753544956445694, "num_tokens": 18407262.0, "step": 22872 }, { "epoch": 6.057733050847458, "grad_norm": 2.3071022033691406, "learning_rate": 6.971265889830509e-06, "loss": 1.1204, "mean_token_accuracy": 0.7431028783321381, "num_tokens": 18408643.0, "step": 22874 }, { "epoch": 6.058262711864407, "grad_norm": 2.587294101715088, "learning_rate": 6.971001059322035e-06, "loss": 0.935, "mean_token_accuracy": 0.7838485687971115, "num_tokens": 18410028.0, "step": 22876 }, { "epoch": 6.0587923728813555, "grad_norm": 2.307602643966675, "learning_rate": 6.97073622881356e-06, "loss": 1.0082, "mean_token_accuracy": 0.7720860689878464, "num_tokens": 18411582.0, "step": 22878 }, { "epoch": 6.059322033898305, "grad_norm": 1.9655722379684448, "learning_rate": 6.970471398305085e-06, "loss": 0.8955, "mean_token_accuracy": 0.7696609869599342, "num_tokens": 18413371.0, "step": 22880 }, { "epoch": 6.059851694915254, "grad_norm": 2.4600062370300293, "learning_rate": 6.97020656779661e-06, "loss": 1.0696, "mean_token_accuracy": 0.7222950980067253, "num_tokens": 18415047.0, "step": 22882 }, { "epoch": 6.060381355932203, "grad_norm": 2.2401676177978516, "learning_rate": 6.969941737288136e-06, "loss": 1.1987, "mean_token_accuracy": 0.7007563412189484, "num_tokens": 18417198.0, "step": 22884 }, { "epoch": 6.060911016949152, "grad_norm": 2.558349370956421, "learning_rate": 6.969676906779662e-06, "loss": 1.169, "mean_token_accuracy": 0.7366512641310692, "num_tokens": 18418425.0, "step": 22886 }, { "epoch": 6.061440677966102, "grad_norm": 2.024317502975464, "learning_rate": 6.969412076271187e-06, "loss": 1.0381, "mean_token_accuracy": 0.7258470803499222, "num_tokens": 18420291.0, "step": 22888 }, { "epoch": 6.061970338983051, "grad_norm": 2.7011492252349854, "learning_rate": 6.969147245762713e-06, "loss": 1.5152, "mean_token_accuracy": 0.6635299623012543, "num_tokens": 18421951.0, "step": 22890 }, { "epoch": 6.0625, "grad_norm": 2.123291254043579, "learning_rate": 6.9688824152542375e-06, "loss": 0.801, "mean_token_accuracy": 0.8102157190442085, "num_tokens": 18423635.0, "step": 22892 }, { "epoch": 6.063029661016949, "grad_norm": 2.8220739364624023, "learning_rate": 6.968617584745764e-06, "loss": 1.4858, "mean_token_accuracy": 0.6667748913168907, "num_tokens": 18425145.0, "step": 22894 }, { "epoch": 6.063559322033898, "grad_norm": 2.2463269233703613, "learning_rate": 6.968352754237288e-06, "loss": 1.1065, "mean_token_accuracy": 0.7523846700787544, "num_tokens": 18426456.0, "step": 22896 }, { "epoch": 6.064088983050848, "grad_norm": 3.287533760070801, "learning_rate": 6.968087923728815e-06, "loss": 1.7458, "mean_token_accuracy": 0.6392702460289001, "num_tokens": 18428012.0, "step": 22898 }, { "epoch": 6.064618644067797, "grad_norm": 1.8443344831466675, "learning_rate": 6.96782309322034e-06, "loss": 1.1344, "mean_token_accuracy": 0.7252663522958755, "num_tokens": 18429650.0, "step": 22900 }, { "epoch": 6.065148305084746, "grad_norm": 2.5819084644317627, "learning_rate": 6.9675582627118656e-06, "loss": 0.8463, "mean_token_accuracy": 0.7864669263362885, "num_tokens": 18431219.0, "step": 22902 }, { "epoch": 6.065677966101695, "grad_norm": 2.683628797531128, "learning_rate": 6.9672934322033905e-06, "loss": 1.2755, "mean_token_accuracy": 0.7103812918066978, "num_tokens": 18432850.0, "step": 22904 }, { "epoch": 6.0662076271186445, "grad_norm": 2.4400718212127686, "learning_rate": 6.967028601694916e-06, "loss": 1.362, "mean_token_accuracy": 0.6973079517483711, "num_tokens": 18434361.0, "step": 22906 }, { "epoch": 6.066737288135593, "grad_norm": 2.5199472904205322, "learning_rate": 6.966763771186441e-06, "loss": 1.3878, "mean_token_accuracy": 0.7076080143451691, "num_tokens": 18436073.0, "step": 22908 }, { "epoch": 6.067266949152542, "grad_norm": 2.749264717102051, "learning_rate": 6.966498940677967e-06, "loss": 1.4979, "mean_token_accuracy": 0.6638188436627388, "num_tokens": 18437593.0, "step": 22910 }, { "epoch": 6.067796610169491, "grad_norm": 2.2086501121520996, "learning_rate": 6.966234110169492e-06, "loss": 1.071, "mean_token_accuracy": 0.7526113614439964, "num_tokens": 18439174.0, "step": 22912 }, { "epoch": 6.06832627118644, "grad_norm": 3.216773509979248, "learning_rate": 6.965969279661018e-06, "loss": 1.2749, "mean_token_accuracy": 0.7016913294792175, "num_tokens": 18440791.0, "step": 22914 }, { "epoch": 6.06885593220339, "grad_norm": 2.806544065475464, "learning_rate": 6.965704449152543e-06, "loss": 1.2336, "mean_token_accuracy": 0.7372823059558868, "num_tokens": 18442296.0, "step": 22916 }, { "epoch": 6.069385593220339, "grad_norm": 2.3049535751342773, "learning_rate": 6.965439618644068e-06, "loss": 1.3195, "mean_token_accuracy": 0.6941627413034439, "num_tokens": 18443913.0, "step": 22918 }, { "epoch": 6.069915254237288, "grad_norm": 2.33815336227417, "learning_rate": 6.965174788135593e-06, "loss": 1.4235, "mean_token_accuracy": 0.665984470397234, "num_tokens": 18445942.0, "step": 22920 }, { "epoch": 6.070444915254237, "grad_norm": 2.2015485763549805, "learning_rate": 6.96490995762712e-06, "loss": 1.1661, "mean_token_accuracy": 0.7267181873321533, "num_tokens": 18447817.0, "step": 22922 }, { "epoch": 6.070974576271187, "grad_norm": 2.8348538875579834, "learning_rate": 6.964645127118644e-06, "loss": 1.2109, "mean_token_accuracy": 0.7178734913468361, "num_tokens": 18449263.0, "step": 22924 }, { "epoch": 6.071504237288136, "grad_norm": 2.4811220169067383, "learning_rate": 6.964380296610171e-06, "loss": 1.0863, "mean_token_accuracy": 0.7410764172673225, "num_tokens": 18450845.0, "step": 22926 }, { "epoch": 6.072033898305085, "grad_norm": 1.790549397468567, "learning_rate": 6.9641154661016956e-06, "loss": 0.9994, "mean_token_accuracy": 0.7730643376708031, "num_tokens": 18452406.0, "step": 22928 }, { "epoch": 6.072563559322034, "grad_norm": 2.0420002937316895, "learning_rate": 6.963850635593221e-06, "loss": 1.1961, "mean_token_accuracy": 0.7068299204111099, "num_tokens": 18454762.0, "step": 22930 }, { "epoch": 6.073093220338983, "grad_norm": 2.2859842777252197, "learning_rate": 6.963585805084746e-06, "loss": 1.2109, "mean_token_accuracy": 0.7051228955388069, "num_tokens": 18456503.0, "step": 22932 }, { "epoch": 6.0736228813559325, "grad_norm": 2.553532361984253, "learning_rate": 6.963320974576272e-06, "loss": 1.3122, "mean_token_accuracy": 0.6929296553134918, "num_tokens": 18458069.0, "step": 22934 }, { "epoch": 6.0741525423728815, "grad_norm": 2.4399261474609375, "learning_rate": 6.963056144067797e-06, "loss": 1.3092, "mean_token_accuracy": 0.6737145483493805, "num_tokens": 18459916.0, "step": 22936 }, { "epoch": 6.0746822033898304, "grad_norm": 2.3628363609313965, "learning_rate": 6.962791313559323e-06, "loss": 1.3725, "mean_token_accuracy": 0.698775477707386, "num_tokens": 18461619.0, "step": 22938 }, { "epoch": 6.075211864406779, "grad_norm": 2.2567923069000244, "learning_rate": 6.962526483050848e-06, "loss": 1.3577, "mean_token_accuracy": 0.6936095617711544, "num_tokens": 18463143.0, "step": 22940 }, { "epoch": 6.075741525423729, "grad_norm": 2.295804262161255, "learning_rate": 6.9622616525423735e-06, "loss": 0.8149, "mean_token_accuracy": 0.7680394798517227, "num_tokens": 18464815.0, "step": 22942 }, { "epoch": 6.076271186440678, "grad_norm": 2.7590396404266357, "learning_rate": 6.961996822033898e-06, "loss": 1.3477, "mean_token_accuracy": 0.703076496720314, "num_tokens": 18466457.0, "step": 22944 }, { "epoch": 6.076800847457627, "grad_norm": 2.5598866939544678, "learning_rate": 6.961731991525424e-06, "loss": 1.857, "mean_token_accuracy": 0.6088762730360031, "num_tokens": 18468022.0, "step": 22946 }, { "epoch": 6.077330508474576, "grad_norm": 2.257223606109619, "learning_rate": 6.961467161016949e-06, "loss": 1.5038, "mean_token_accuracy": 0.6636846736073494, "num_tokens": 18469977.0, "step": 22948 }, { "epoch": 6.077860169491525, "grad_norm": 2.501096725463867, "learning_rate": 6.961202330508475e-06, "loss": 1.176, "mean_token_accuracy": 0.7297312691807747, "num_tokens": 18471332.0, "step": 22950 }, { "epoch": 6.078389830508475, "grad_norm": 2.8927366733551025, "learning_rate": 6.9609375e-06, "loss": 1.2253, "mean_token_accuracy": 0.730523481965065, "num_tokens": 18472643.0, "step": 22952 }, { "epoch": 6.078919491525424, "grad_norm": 2.535736560821533, "learning_rate": 6.960672669491526e-06, "loss": 1.1501, "mean_token_accuracy": 0.7278566509485245, "num_tokens": 18474197.0, "step": 22954 }, { "epoch": 6.079449152542373, "grad_norm": 2.885831356048584, "learning_rate": 6.960407838983051e-06, "loss": 1.571, "mean_token_accuracy": 0.6643776595592499, "num_tokens": 18475896.0, "step": 22956 }, { "epoch": 6.079978813559322, "grad_norm": 2.8308322429656982, "learning_rate": 6.960143008474577e-06, "loss": 1.4462, "mean_token_accuracy": 0.6655824333429337, "num_tokens": 18477537.0, "step": 22958 }, { "epoch": 6.080508474576271, "grad_norm": 2.2685768604278564, "learning_rate": 6.959878177966102e-06, "loss": 1.1653, "mean_token_accuracy": 0.7286293730139732, "num_tokens": 18479366.0, "step": 22960 }, { "epoch": 6.081038135593221, "grad_norm": 2.488499402999878, "learning_rate": 6.959613347457628e-06, "loss": 1.272, "mean_token_accuracy": 0.6973516717553139, "num_tokens": 18480943.0, "step": 22962 }, { "epoch": 6.0815677966101696, "grad_norm": 2.3716108798980713, "learning_rate": 6.959348516949153e-06, "loss": 1.0711, "mean_token_accuracy": 0.7440257892012596, "num_tokens": 18482499.0, "step": 22964 }, { "epoch": 6.0820974576271185, "grad_norm": 1.6074979305267334, "learning_rate": 6.9590836864406785e-06, "loss": 1.103, "mean_token_accuracy": 0.7581752017140388, "num_tokens": 18484191.0, "step": 22966 }, { "epoch": 6.0826271186440675, "grad_norm": 2.5982367992401123, "learning_rate": 6.9588188559322035e-06, "loss": 0.9742, "mean_token_accuracy": 0.7725231796503067, "num_tokens": 18485567.0, "step": 22968 }, { "epoch": 6.083156779661017, "grad_norm": 2.804899215698242, "learning_rate": 6.958554025423729e-06, "loss": 1.015, "mean_token_accuracy": 0.7453111484646797, "num_tokens": 18487709.0, "step": 22970 }, { "epoch": 6.083686440677966, "grad_norm": 2.2189230918884277, "learning_rate": 6.958289194915255e-06, "loss": 0.8997, "mean_token_accuracy": 0.7734099328517914, "num_tokens": 18489196.0, "step": 22972 }, { "epoch": 6.084216101694915, "grad_norm": 2.0787739753723145, "learning_rate": 6.95802436440678e-06, "loss": 1.3775, "mean_token_accuracy": 0.6723069697618484, "num_tokens": 18491187.0, "step": 22974 }, { "epoch": 6.084745762711864, "grad_norm": 2.7231833934783936, "learning_rate": 6.9577595338983066e-06, "loss": 1.3546, "mean_token_accuracy": 0.7061783000826836, "num_tokens": 18492527.0, "step": 22976 }, { "epoch": 6.085275423728813, "grad_norm": 1.7718995809555054, "learning_rate": 6.957494703389831e-06, "loss": 0.8765, "mean_token_accuracy": 0.7479668408632278, "num_tokens": 18494266.0, "step": 22978 }, { "epoch": 6.085805084745763, "grad_norm": 2.5870370864868164, "learning_rate": 6.957229872881357e-06, "loss": 1.1985, "mean_token_accuracy": 0.7277874872088432, "num_tokens": 18495788.0, "step": 22980 }, { "epoch": 6.086334745762712, "grad_norm": 2.6882481575012207, "learning_rate": 6.956965042372882e-06, "loss": 1.0336, "mean_token_accuracy": 0.7548670992255211, "num_tokens": 18497171.0, "step": 22982 }, { "epoch": 6.086864406779661, "grad_norm": 2.058067560195923, "learning_rate": 6.956700211864408e-06, "loss": 1.1842, "mean_token_accuracy": 0.7526871562004089, "num_tokens": 18498915.0, "step": 22984 }, { "epoch": 6.08739406779661, "grad_norm": 2.3380775451660156, "learning_rate": 6.956435381355933e-06, "loss": 1.1407, "mean_token_accuracy": 0.7312034666538239, "num_tokens": 18500630.0, "step": 22986 }, { "epoch": 6.08792372881356, "grad_norm": 2.649851083755493, "learning_rate": 6.956170550847459e-06, "loss": 0.9642, "mean_token_accuracy": 0.7692203149199486, "num_tokens": 18502040.0, "step": 22988 }, { "epoch": 6.088453389830509, "grad_norm": 2.1203787326812744, "learning_rate": 6.955905720338984e-06, "loss": 1.1782, "mean_token_accuracy": 0.7398412004113197, "num_tokens": 18503657.0, "step": 22990 }, { "epoch": 6.088983050847458, "grad_norm": 2.56816029548645, "learning_rate": 6.955640889830509e-06, "loss": 1.1504, "mean_token_accuracy": 0.7181318178772926, "num_tokens": 18505181.0, "step": 22992 }, { "epoch": 6.089512711864407, "grad_norm": 2.523618698120117, "learning_rate": 6.955376059322034e-06, "loss": 1.363, "mean_token_accuracy": 0.6871452182531357, "num_tokens": 18506800.0, "step": 22994 }, { "epoch": 6.0900423728813555, "grad_norm": 2.352944850921631, "learning_rate": 6.95511122881356e-06, "loss": 1.2969, "mean_token_accuracy": 0.7043258249759674, "num_tokens": 18508212.0, "step": 22996 }, { "epoch": 6.090572033898305, "grad_norm": 2.277186393737793, "learning_rate": 6.954846398305085e-06, "loss": 1.2501, "mean_token_accuracy": 0.7341964393854141, "num_tokens": 18509791.0, "step": 22998 }, { "epoch": 6.091101694915254, "grad_norm": 2.7347922325134277, "learning_rate": 6.954581567796611e-06, "loss": 1.2984, "step": 23000 }, { "epoch": 6.091101694915254, "eval_loss": 1.3277587890625, "eval_mean_token_accuracy": 0.7003303013451687, "eval_num_tokens": 18511321.0, "eval_runtime": 48.3162, "eval_samples_per_second": 6.375, "eval_steps_per_second": 6.375, "step": 23000 }, { "epoch": 6.091631355932203, "grad_norm": 2.5321767330169678, "learning_rate": 6.954316737288136e-06, "loss": 1.1984, "mean_token_accuracy": 0.7226456627249718, "num_tokens": 18512768.0, "step": 23002 }, { "epoch": 6.092161016949152, "grad_norm": 3.044489860534668, "learning_rate": 6.9540519067796615e-06, "loss": 1.0135, "mean_token_accuracy": 0.7495708614587784, "num_tokens": 18514208.0, "step": 23004 }, { "epoch": 6.092690677966102, "grad_norm": 2.3872673511505127, "learning_rate": 6.9537870762711864e-06, "loss": 1.155, "mean_token_accuracy": 0.7291387319564819, "num_tokens": 18515744.0, "step": 23006 }, { "epoch": 6.093220338983051, "grad_norm": 2.4902501106262207, "learning_rate": 6.953522245762713e-06, "loss": 1.0542, "mean_token_accuracy": 0.7586928606033325, "num_tokens": 18517311.0, "step": 23008 }, { "epoch": 6.09375, "grad_norm": 2.994374990463257, "learning_rate": 6.953257415254238e-06, "loss": 1.2951, "mean_token_accuracy": 0.7027008384466171, "num_tokens": 18518817.0, "step": 23010 }, { "epoch": 6.094279661016949, "grad_norm": 2.5937814712524414, "learning_rate": 6.952992584745764e-06, "loss": 1.0241, "mean_token_accuracy": 0.7717193365097046, "num_tokens": 18520271.0, "step": 23012 }, { "epoch": 6.094809322033898, "grad_norm": 2.3443403244018555, "learning_rate": 6.952727754237289e-06, "loss": 1.1368, "mean_token_accuracy": 0.7215256243944168, "num_tokens": 18521945.0, "step": 23014 }, { "epoch": 6.095338983050848, "grad_norm": 2.3650689125061035, "learning_rate": 6.9524629237288145e-06, "loss": 0.9573, "mean_token_accuracy": 0.7622342333197594, "num_tokens": 18523303.0, "step": 23016 }, { "epoch": 6.095868644067797, "grad_norm": 2.6759157180786133, "learning_rate": 6.952198093220339e-06, "loss": 1.3463, "mean_token_accuracy": 0.7168524414300919, "num_tokens": 18524980.0, "step": 23018 }, { "epoch": 6.096398305084746, "grad_norm": 2.2963569164276123, "learning_rate": 6.951933262711865e-06, "loss": 1.0477, "mean_token_accuracy": 0.7327721863985062, "num_tokens": 18526637.0, "step": 23020 }, { "epoch": 6.096927966101695, "grad_norm": 2.3721463680267334, "learning_rate": 6.95166843220339e-06, "loss": 1.0102, "mean_token_accuracy": 0.7539874091744423, "num_tokens": 18528133.0, "step": 23022 }, { "epoch": 6.0974576271186445, "grad_norm": 2.2791991233825684, "learning_rate": 6.951403601694916e-06, "loss": 1.1026, "mean_token_accuracy": 0.726395845413208, "num_tokens": 18529783.0, "step": 23024 }, { "epoch": 6.097987288135593, "grad_norm": 3.051759958267212, "learning_rate": 6.951138771186441e-06, "loss": 1.2406, "mean_token_accuracy": 0.7140245363116264, "num_tokens": 18531436.0, "step": 23026 }, { "epoch": 6.098516949152542, "grad_norm": 2.6622698307037354, "learning_rate": 6.950873940677967e-06, "loss": 1.2874, "mean_token_accuracy": 0.6938390359282494, "num_tokens": 18532932.0, "step": 23028 }, { "epoch": 6.099046610169491, "grad_norm": 2.3663885593414307, "learning_rate": 6.9506091101694915e-06, "loss": 1.3406, "mean_token_accuracy": 0.6926813051104546, "num_tokens": 18534602.0, "step": 23030 }, { "epoch": 6.09957627118644, "grad_norm": 2.975694417953491, "learning_rate": 6.950344279661017e-06, "loss": 1.3509, "mean_token_accuracy": 0.6639594286680222, "num_tokens": 18536283.0, "step": 23032 }, { "epoch": 6.10010593220339, "grad_norm": 2.3675904273986816, "learning_rate": 6.950079449152542e-06, "loss": 1.149, "mean_token_accuracy": 0.7272706180810928, "num_tokens": 18537768.0, "step": 23034 }, { "epoch": 6.100635593220339, "grad_norm": 2.1523213386535645, "learning_rate": 6.949814618644069e-06, "loss": 1.1654, "mean_token_accuracy": 0.7358067184686661, "num_tokens": 18539271.0, "step": 23036 }, { "epoch": 6.101165254237288, "grad_norm": 2.1122047901153564, "learning_rate": 6.949549788135593e-06, "loss": 1.085, "mean_token_accuracy": 0.7393346354365349, "num_tokens": 18540643.0, "step": 23038 }, { "epoch": 6.101694915254237, "grad_norm": 2.5749258995056152, "learning_rate": 6.9492849576271195e-06, "loss": 1.2144, "mean_token_accuracy": 0.7092649191617966, "num_tokens": 18542379.0, "step": 23040 }, { "epoch": 6.102224576271187, "grad_norm": 2.2678136825561523, "learning_rate": 6.9490201271186445e-06, "loss": 1.2151, "mean_token_accuracy": 0.7145836129784584, "num_tokens": 18544106.0, "step": 23042 }, { "epoch": 6.102754237288136, "grad_norm": 2.5830140113830566, "learning_rate": 6.94875529661017e-06, "loss": 1.5865, "mean_token_accuracy": 0.6563457399606705, "num_tokens": 18545751.0, "step": 23044 }, { "epoch": 6.103283898305085, "grad_norm": 2.766092300415039, "learning_rate": 6.948490466101695e-06, "loss": 1.0166, "mean_token_accuracy": 0.7511115297675133, "num_tokens": 18547333.0, "step": 23046 }, { "epoch": 6.103813559322034, "grad_norm": 2.248318910598755, "learning_rate": 6.948225635593221e-06, "loss": 0.8181, "mean_token_accuracy": 0.8176547959446907, "num_tokens": 18548600.0, "step": 23048 }, { "epoch": 6.104343220338983, "grad_norm": 2.5474531650543213, "learning_rate": 6.947960805084746e-06, "loss": 1.5561, "mean_token_accuracy": 0.6562730818986893, "num_tokens": 18550203.0, "step": 23050 }, { "epoch": 6.1048728813559325, "grad_norm": 2.045687198638916, "learning_rate": 6.947695974576272e-06, "loss": 1.025, "mean_token_accuracy": 0.7472586035728455, "num_tokens": 18551749.0, "step": 23052 }, { "epoch": 6.1054025423728815, "grad_norm": 3.2178139686584473, "learning_rate": 6.9474311440677974e-06, "loss": 1.3512, "mean_token_accuracy": 0.7123097702860832, "num_tokens": 18553115.0, "step": 23054 }, { "epoch": 6.1059322033898304, "grad_norm": 2.418562889099121, "learning_rate": 6.947166313559322e-06, "loss": 0.9194, "mean_token_accuracy": 0.7804892435669899, "num_tokens": 18554605.0, "step": 23056 }, { "epoch": 6.106461864406779, "grad_norm": 2.0434231758117676, "learning_rate": 6.946901483050848e-06, "loss": 0.9434, "mean_token_accuracy": 0.75019371509552, "num_tokens": 18556166.0, "step": 23058 }, { "epoch": 6.106991525423729, "grad_norm": 2.499847888946533, "learning_rate": 6.946636652542373e-06, "loss": 1.3456, "mean_token_accuracy": 0.6887791305780411, "num_tokens": 18557663.0, "step": 23060 }, { "epoch": 6.107521186440678, "grad_norm": 2.469733715057373, "learning_rate": 6.9463718220339e-06, "loss": 1.7672, "mean_token_accuracy": 0.6163491681218147, "num_tokens": 18559530.0, "step": 23062 }, { "epoch": 6.108050847457627, "grad_norm": 2.967013120651245, "learning_rate": 6.946106991525425e-06, "loss": 1.1073, "mean_token_accuracy": 0.73954788595438, "num_tokens": 18560754.0, "step": 23064 }, { "epoch": 6.108580508474576, "grad_norm": 2.77319598197937, "learning_rate": 6.94584216101695e-06, "loss": 1.395, "mean_token_accuracy": 0.6922168657183647, "num_tokens": 18562209.0, "step": 23066 }, { "epoch": 6.109110169491525, "grad_norm": 2.091132879257202, "learning_rate": 6.945577330508475e-06, "loss": 1.3708, "mean_token_accuracy": 0.6807648912072182, "num_tokens": 18564211.0, "step": 23068 }, { "epoch": 6.109639830508475, "grad_norm": 2.471846342086792, "learning_rate": 6.945312500000001e-06, "loss": 1.0565, "mean_token_accuracy": 0.7722417935729027, "num_tokens": 18565653.0, "step": 23070 }, { "epoch": 6.110169491525424, "grad_norm": 2.537108898162842, "learning_rate": 6.945047669491526e-06, "loss": 1.4666, "mean_token_accuracy": 0.6816977486014366, "num_tokens": 18567486.0, "step": 23072 }, { "epoch": 6.110699152542373, "grad_norm": 2.410463809967041, "learning_rate": 6.944782838983052e-06, "loss": 1.0266, "mean_token_accuracy": 0.7569002136588097, "num_tokens": 18569009.0, "step": 23074 }, { "epoch": 6.111228813559322, "grad_norm": 2.1878280639648438, "learning_rate": 6.944518008474577e-06, "loss": 1.5089, "mean_token_accuracy": 0.6898298189043999, "num_tokens": 18570824.0, "step": 23076 }, { "epoch": 6.111758474576271, "grad_norm": 2.4727914333343506, "learning_rate": 6.9442531779661025e-06, "loss": 1.0883, "mean_token_accuracy": 0.7429462298750877, "num_tokens": 18572519.0, "step": 23078 }, { "epoch": 6.112288135593221, "grad_norm": 2.6098639965057373, "learning_rate": 6.9439883474576274e-06, "loss": 1.4447, "mean_token_accuracy": 0.6609898582100868, "num_tokens": 18574050.0, "step": 23080 }, { "epoch": 6.1128177966101696, "grad_norm": 2.11118483543396, "learning_rate": 6.943723516949153e-06, "loss": 1.3006, "mean_token_accuracy": 0.7251740768551826, "num_tokens": 18575532.0, "step": 23082 }, { "epoch": 6.1133474576271185, "grad_norm": 2.2778518199920654, "learning_rate": 6.943458686440678e-06, "loss": 1.1022, "mean_token_accuracy": 0.7709722742438316, "num_tokens": 18577276.0, "step": 23084 }, { "epoch": 6.1138771186440675, "grad_norm": 1.7731850147247314, "learning_rate": 6.943193855932204e-06, "loss": 1.1085, "mean_token_accuracy": 0.7413138896226883, "num_tokens": 18579121.0, "step": 23086 }, { "epoch": 6.114406779661017, "grad_norm": 2.9538495540618896, "learning_rate": 6.942929025423729e-06, "loss": 1.4418, "mean_token_accuracy": 0.6752357855439186, "num_tokens": 18580850.0, "step": 23088 }, { "epoch": 6.114936440677966, "grad_norm": 2.24653959274292, "learning_rate": 6.9426641949152555e-06, "loss": 1.0674, "mean_token_accuracy": 0.7321488335728645, "num_tokens": 18582360.0, "step": 23090 }, { "epoch": 6.115466101694915, "grad_norm": 1.843230962753296, "learning_rate": 6.9423993644067796e-06, "loss": 0.9474, "mean_token_accuracy": 0.7528459206223488, "num_tokens": 18583868.0, "step": 23092 }, { "epoch": 6.115995762711864, "grad_norm": 2.3571760654449463, "learning_rate": 6.942134533898306e-06, "loss": 1.1614, "mean_token_accuracy": 0.7015600502490997, "num_tokens": 18585472.0, "step": 23094 }, { "epoch": 6.116525423728813, "grad_norm": 1.9390026330947876, "learning_rate": 6.941869703389831e-06, "loss": 1.1369, "mean_token_accuracy": 0.7382712364196777, "num_tokens": 18587286.0, "step": 23096 }, { "epoch": 6.117055084745763, "grad_norm": 2.4750466346740723, "learning_rate": 6.941604872881357e-06, "loss": 1.3587, "mean_token_accuracy": 0.6912946254014969, "num_tokens": 18588649.0, "step": 23098 }, { "epoch": 6.117584745762712, "grad_norm": 2.513911724090576, "learning_rate": 6.941340042372882e-06, "loss": 1.1192, "mean_token_accuracy": 0.7082154005765915, "num_tokens": 18590125.0, "step": 23100 }, { "epoch": 6.118114406779661, "grad_norm": 2.817692756652832, "learning_rate": 6.941075211864408e-06, "loss": 1.3502, "mean_token_accuracy": 0.6976244747638702, "num_tokens": 18591743.0, "step": 23102 }, { "epoch": 6.11864406779661, "grad_norm": 2.2530808448791504, "learning_rate": 6.9408103813559325e-06, "loss": 1.0804, "mean_token_accuracy": 0.7278892695903778, "num_tokens": 18593128.0, "step": 23104 }, { "epoch": 6.11917372881356, "grad_norm": 2.3411221504211426, "learning_rate": 6.940545550847458e-06, "loss": 1.1653, "mean_token_accuracy": 0.730872243642807, "num_tokens": 18594627.0, "step": 23106 }, { "epoch": 6.119703389830509, "grad_norm": 2.851260185241699, "learning_rate": 6.940280720338983e-06, "loss": 1.2311, "mean_token_accuracy": 0.7186440750956535, "num_tokens": 18596290.0, "step": 23108 }, { "epoch": 6.120233050847458, "grad_norm": 2.6819920539855957, "learning_rate": 6.940015889830509e-06, "loss": 1.1649, "mean_token_accuracy": 0.7540516257286072, "num_tokens": 18597769.0, "step": 23110 }, { "epoch": 6.120762711864407, "grad_norm": 2.4385173320770264, "learning_rate": 6.939751059322034e-06, "loss": 0.9955, "mean_token_accuracy": 0.7546398639678955, "num_tokens": 18599291.0, "step": 23112 }, { "epoch": 6.1212923728813555, "grad_norm": 2.4843549728393555, "learning_rate": 6.93948622881356e-06, "loss": 1.444, "mean_token_accuracy": 0.6849084198474884, "num_tokens": 18600951.0, "step": 23114 }, { "epoch": 6.121822033898305, "grad_norm": 2.884876012802124, "learning_rate": 6.939221398305085e-06, "loss": 1.7645, "mean_token_accuracy": 0.6429139226675034, "num_tokens": 18602740.0, "step": 23116 }, { "epoch": 6.122351694915254, "grad_norm": 2.3113696575164795, "learning_rate": 6.938956567796611e-06, "loss": 1.1847, "mean_token_accuracy": 0.7183944210410118, "num_tokens": 18604248.0, "step": 23118 }, { "epoch": 6.122881355932203, "grad_norm": 2.720611333847046, "learning_rate": 6.938691737288135e-06, "loss": 1.4709, "mean_token_accuracy": 0.6680345609784126, "num_tokens": 18606115.0, "step": 23120 }, { "epoch": 6.123411016949152, "grad_norm": 2.7457616329193115, "learning_rate": 6.938426906779662e-06, "loss": 1.5502, "mean_token_accuracy": 0.6534125804901123, "num_tokens": 18607692.0, "step": 23122 }, { "epoch": 6.123940677966102, "grad_norm": 2.506873607635498, "learning_rate": 6.938162076271187e-06, "loss": 1.6007, "mean_token_accuracy": 0.6464970260858536, "num_tokens": 18609358.0, "step": 23124 }, { "epoch": 6.124470338983051, "grad_norm": 3.062511920928955, "learning_rate": 6.937897245762713e-06, "loss": 0.769, "mean_token_accuracy": 0.7938511967658997, "num_tokens": 18611019.0, "step": 23126 }, { "epoch": 6.125, "grad_norm": 2.497267961502075, "learning_rate": 6.937632415254238e-06, "loss": 0.9709, "mean_token_accuracy": 0.766015462577343, "num_tokens": 18612891.0, "step": 23128 }, { "epoch": 6.125529661016949, "grad_norm": 2.5381133556365967, "learning_rate": 6.937367584745763e-06, "loss": 1.2719, "mean_token_accuracy": 0.7265798673033714, "num_tokens": 18614593.0, "step": 23130 }, { "epoch": 6.126059322033898, "grad_norm": 2.8287394046783447, "learning_rate": 6.937102754237288e-06, "loss": 0.943, "mean_token_accuracy": 0.7615025267004967, "num_tokens": 18616266.0, "step": 23132 }, { "epoch": 6.126588983050848, "grad_norm": 2.2490577697753906, "learning_rate": 6.936837923728814e-06, "loss": 1.404, "mean_token_accuracy": 0.6704963594675064, "num_tokens": 18618044.0, "step": 23134 }, { "epoch": 6.127118644067797, "grad_norm": 3.9792535305023193, "learning_rate": 6.936573093220339e-06, "loss": 1.365, "mean_token_accuracy": 0.6944333389401436, "num_tokens": 18619437.0, "step": 23136 }, { "epoch": 6.127648305084746, "grad_norm": 2.471264123916626, "learning_rate": 6.936308262711865e-06, "loss": 1.285, "mean_token_accuracy": 0.7244506254792213, "num_tokens": 18621025.0, "step": 23138 }, { "epoch": 6.128177966101695, "grad_norm": 2.668896436691284, "learning_rate": 6.9360434322033906e-06, "loss": 0.997, "mean_token_accuracy": 0.7392809242010117, "num_tokens": 18622788.0, "step": 23140 }, { "epoch": 6.1287076271186445, "grad_norm": 2.4521732330322266, "learning_rate": 6.9357786016949155e-06, "loss": 1.3292, "mean_token_accuracy": 0.6885180398821831, "num_tokens": 18624286.0, "step": 23142 }, { "epoch": 6.129237288135593, "grad_norm": 2.3260133266448975, "learning_rate": 6.935513771186442e-06, "loss": 1.3638, "mean_token_accuracy": 0.6707251667976379, "num_tokens": 18626123.0, "step": 23144 }, { "epoch": 6.129766949152542, "grad_norm": 2.2387959957122803, "learning_rate": 6.935248940677966e-06, "loss": 1.2527, "mean_token_accuracy": 0.7197348922491074, "num_tokens": 18627596.0, "step": 23146 }, { "epoch": 6.130296610169491, "grad_norm": 2.0681324005126953, "learning_rate": 6.934984110169493e-06, "loss": 0.8671, "mean_token_accuracy": 0.7679257839918137, "num_tokens": 18629129.0, "step": 23148 }, { "epoch": 6.13082627118644, "grad_norm": 2.406195640563965, "learning_rate": 6.934719279661018e-06, "loss": 1.3151, "mean_token_accuracy": 0.7048500627279282, "num_tokens": 18630515.0, "step": 23150 }, { "epoch": 6.13135593220339, "grad_norm": 2.243021011352539, "learning_rate": 6.9344544491525435e-06, "loss": 0.9741, "mean_token_accuracy": 0.76028873026371, "num_tokens": 18632032.0, "step": 23152 }, { "epoch": 6.131885593220339, "grad_norm": 2.058786630630493, "learning_rate": 6.9341896186440685e-06, "loss": 0.9592, "mean_token_accuracy": 0.7622461542487144, "num_tokens": 18633600.0, "step": 23154 }, { "epoch": 6.132415254237288, "grad_norm": 2.692943572998047, "learning_rate": 6.933924788135594e-06, "loss": 1.2751, "mean_token_accuracy": 0.7198360487818718, "num_tokens": 18634996.0, "step": 23156 }, { "epoch": 6.132944915254237, "grad_norm": 2.107020139694214, "learning_rate": 6.933659957627119e-06, "loss": 0.7535, "mean_token_accuracy": 0.8104835450649261, "num_tokens": 18636403.0, "step": 23158 }, { "epoch": 6.133474576271187, "grad_norm": 2.3035130500793457, "learning_rate": 6.933395127118645e-06, "loss": 1.2289, "mean_token_accuracy": 0.7108698710799217, "num_tokens": 18638067.0, "step": 23160 }, { "epoch": 6.134004237288136, "grad_norm": 2.160454034805298, "learning_rate": 6.93313029661017e-06, "loss": 1.2799, "mean_token_accuracy": 0.7264761179685593, "num_tokens": 18639738.0, "step": 23162 }, { "epoch": 6.134533898305085, "grad_norm": 2.697854995727539, "learning_rate": 6.932865466101696e-06, "loss": 1.1235, "mean_token_accuracy": 0.7336765304207802, "num_tokens": 18641315.0, "step": 23164 }, { "epoch": 6.135063559322034, "grad_norm": 2.7706568241119385, "learning_rate": 6.932600635593221e-06, "loss": 1.2105, "mean_token_accuracy": 0.7169479727745056, "num_tokens": 18642904.0, "step": 23166 }, { "epoch": 6.135593220338983, "grad_norm": 3.793713092803955, "learning_rate": 6.932335805084746e-06, "loss": 1.0039, "mean_token_accuracy": 0.751623198390007, "num_tokens": 18644222.0, "step": 23168 }, { "epoch": 6.1361228813559325, "grad_norm": 2.6209523677825928, "learning_rate": 6.932070974576271e-06, "loss": 1.2255, "mean_token_accuracy": 0.6952485293149948, "num_tokens": 18645622.0, "step": 23170 }, { "epoch": 6.1366525423728815, "grad_norm": 1.8349816799163818, "learning_rate": 6.931806144067798e-06, "loss": 0.9522, "mean_token_accuracy": 0.7630483359098434, "num_tokens": 18647619.0, "step": 23172 }, { "epoch": 6.1371822033898304, "grad_norm": 2.439521074295044, "learning_rate": 6.931541313559322e-06, "loss": 1.1139, "mean_token_accuracy": 0.7331978008151054, "num_tokens": 18649105.0, "step": 23174 }, { "epoch": 6.137711864406779, "grad_norm": 2.607196807861328, "learning_rate": 6.931276483050849e-06, "loss": 1.2794, "mean_token_accuracy": 0.720499262213707, "num_tokens": 18650586.0, "step": 23176 }, { "epoch": 6.138241525423728, "grad_norm": 1.948682188987732, "learning_rate": 6.9310116525423735e-06, "loss": 0.9648, "mean_token_accuracy": 0.767629586160183, "num_tokens": 18652143.0, "step": 23178 }, { "epoch": 6.138771186440678, "grad_norm": 2.4864912033081055, "learning_rate": 6.930746822033899e-06, "loss": 1.0457, "mean_token_accuracy": 0.7581591010093689, "num_tokens": 18653539.0, "step": 23180 }, { "epoch": 6.139300847457627, "grad_norm": 2.3137524127960205, "learning_rate": 6.930481991525424e-06, "loss": 1.2607, "mean_token_accuracy": 0.7074775695800781, "num_tokens": 18655224.0, "step": 23182 }, { "epoch": 6.139830508474576, "grad_norm": 2.7147507667541504, "learning_rate": 6.93021716101695e-06, "loss": 1.2355, "mean_token_accuracy": 0.7130965515971184, "num_tokens": 18656701.0, "step": 23184 }, { "epoch": 6.140360169491525, "grad_norm": 2.6764678955078125, "learning_rate": 6.929952330508475e-06, "loss": 0.8461, "mean_token_accuracy": 0.7834631949663162, "num_tokens": 18658000.0, "step": 23186 }, { "epoch": 6.140889830508475, "grad_norm": 2.4506571292877197, "learning_rate": 6.929687500000001e-06, "loss": 1.0699, "mean_token_accuracy": 0.7307594567537308, "num_tokens": 18659920.0, "step": 23188 }, { "epoch": 6.141419491525424, "grad_norm": 3.0230183601379395, "learning_rate": 6.929422669491526e-06, "loss": 1.1803, "mean_token_accuracy": 0.7216105163097382, "num_tokens": 18661563.0, "step": 23190 }, { "epoch": 6.141949152542373, "grad_norm": 2.446596384048462, "learning_rate": 6.9291578389830514e-06, "loss": 1.0747, "mean_token_accuracy": 0.7514866143465042, "num_tokens": 18663080.0, "step": 23192 }, { "epoch": 6.142478813559322, "grad_norm": 2.2003750801086426, "learning_rate": 6.928893008474576e-06, "loss": 0.9037, "mean_token_accuracy": 0.7839109301567078, "num_tokens": 18664460.0, "step": 23194 }, { "epoch": 6.143008474576272, "grad_norm": 3.1457791328430176, "learning_rate": 6.928628177966102e-06, "loss": 1.0903, "mean_token_accuracy": 0.7399129346013069, "num_tokens": 18665774.0, "step": 23196 }, { "epoch": 6.143538135593221, "grad_norm": 2.6758389472961426, "learning_rate": 6.928363347457627e-06, "loss": 1.1834, "mean_token_accuracy": 0.7630862221121788, "num_tokens": 18667175.0, "step": 23198 }, { "epoch": 6.1440677966101696, "grad_norm": 2.049798011779785, "learning_rate": 6.928098516949153e-06, "loss": 0.9753, "mean_token_accuracy": 0.7525909170508385, "num_tokens": 18668787.0, "step": 23200 }, { "epoch": 6.1445974576271185, "grad_norm": 2.444605827331543, "learning_rate": 6.927833686440678e-06, "loss": 0.9259, "mean_token_accuracy": 0.7782526835799217, "num_tokens": 18670161.0, "step": 23202 }, { "epoch": 6.1451271186440675, "grad_norm": 2.9008102416992188, "learning_rate": 6.927568855932204e-06, "loss": 1.4054, "mean_token_accuracy": 0.6685155630111694, "num_tokens": 18671722.0, "step": 23204 }, { "epoch": 6.145656779661017, "grad_norm": 2.573488712310791, "learning_rate": 6.927304025423729e-06, "loss": 1.1686, "mean_token_accuracy": 0.7152599394321442, "num_tokens": 18673336.0, "step": 23206 }, { "epoch": 6.146186440677966, "grad_norm": 2.515996217727661, "learning_rate": 6.927039194915255e-06, "loss": 1.216, "mean_token_accuracy": 0.726162850856781, "num_tokens": 18674731.0, "step": 23208 }, { "epoch": 6.146716101694915, "grad_norm": 2.2444262504577637, "learning_rate": 6.92677436440678e-06, "loss": 1.6595, "mean_token_accuracy": 0.6500895544886589, "num_tokens": 18676399.0, "step": 23210 }, { "epoch": 6.147245762711864, "grad_norm": 2.1581504344940186, "learning_rate": 6.926509533898306e-06, "loss": 1.0012, "mean_token_accuracy": 0.7593012005090714, "num_tokens": 18678013.0, "step": 23212 }, { "epoch": 6.147775423728813, "grad_norm": 2.2079193592071533, "learning_rate": 6.926244703389831e-06, "loss": 1.1036, "mean_token_accuracy": 0.7464310228824615, "num_tokens": 18679651.0, "step": 23214 }, { "epoch": 6.148305084745763, "grad_norm": 2.5453381538391113, "learning_rate": 6.9259798728813565e-06, "loss": 1.2128, "mean_token_accuracy": 0.7046682685613632, "num_tokens": 18681226.0, "step": 23216 }, { "epoch": 6.148834745762712, "grad_norm": 2.4942188262939453, "learning_rate": 6.9257150423728814e-06, "loss": 1.1678, "mean_token_accuracy": 0.7210837826132774, "num_tokens": 18682650.0, "step": 23218 }, { "epoch": 6.149364406779661, "grad_norm": 2.145043134689331, "learning_rate": 6.925450211864407e-06, "loss": 0.8998, "mean_token_accuracy": 0.7784665077924728, "num_tokens": 18684173.0, "step": 23220 }, { "epoch": 6.14989406779661, "grad_norm": 2.7770562171936035, "learning_rate": 6.925185381355933e-06, "loss": 1.3141, "mean_token_accuracy": 0.7173024788498878, "num_tokens": 18685757.0, "step": 23222 }, { "epoch": 6.15042372881356, "grad_norm": 2.0622317790985107, "learning_rate": 6.924920550847458e-06, "loss": 1.0421, "mean_token_accuracy": 0.7451939582824707, "num_tokens": 18687093.0, "step": 23224 }, { "epoch": 6.150953389830509, "grad_norm": 1.879895806312561, "learning_rate": 6.9246557203389845e-06, "loss": 1.0864, "mean_token_accuracy": 0.750078409910202, "num_tokens": 18688552.0, "step": 23226 }, { "epoch": 6.151483050847458, "grad_norm": 2.4187376499176025, "learning_rate": 6.924390889830509e-06, "loss": 1.324, "mean_token_accuracy": 0.7131329104304314, "num_tokens": 18690123.0, "step": 23228 }, { "epoch": 6.152012711864407, "grad_norm": 2.6629483699798584, "learning_rate": 6.924126059322035e-06, "loss": 1.4674, "mean_token_accuracy": 0.6635672822594643, "num_tokens": 18691775.0, "step": 23230 }, { "epoch": 6.1525423728813555, "grad_norm": 2.6629178524017334, "learning_rate": 6.92386122881356e-06, "loss": 1.2053, "mean_token_accuracy": 0.7261915877461433, "num_tokens": 18693380.0, "step": 23232 }, { "epoch": 6.153072033898305, "grad_norm": 2.5740303993225098, "learning_rate": 6.923596398305086e-06, "loss": 1.4293, "mean_token_accuracy": 0.6906053572893143, "num_tokens": 18694852.0, "step": 23234 }, { "epoch": 6.153601694915254, "grad_norm": 2.329555034637451, "learning_rate": 6.923331567796611e-06, "loss": 1.2384, "mean_token_accuracy": 0.7165599465370178, "num_tokens": 18696310.0, "step": 23236 }, { "epoch": 6.154131355932203, "grad_norm": 2.4834697246551514, "learning_rate": 6.923066737288137e-06, "loss": 1.0804, "mean_token_accuracy": 0.7461207360029221, "num_tokens": 18697765.0, "step": 23238 }, { "epoch": 6.154661016949152, "grad_norm": 2.0977742671966553, "learning_rate": 6.922801906779662e-06, "loss": 0.8454, "mean_token_accuracy": 0.7802083939313889, "num_tokens": 18699512.0, "step": 23240 }, { "epoch": 6.155190677966102, "grad_norm": 2.429424285888672, "learning_rate": 6.922537076271187e-06, "loss": 1.0469, "mean_token_accuracy": 0.7536505684256554, "num_tokens": 18701030.0, "step": 23242 }, { "epoch": 6.155720338983051, "grad_norm": 2.3985276222229004, "learning_rate": 6.922272245762712e-06, "loss": 1.194, "mean_token_accuracy": 0.694269485771656, "num_tokens": 18702865.0, "step": 23244 }, { "epoch": 6.15625, "grad_norm": 2.091036081314087, "learning_rate": 6.922007415254238e-06, "loss": 1.2622, "mean_token_accuracy": 0.7153987437486649, "num_tokens": 18704664.0, "step": 23246 }, { "epoch": 6.156779661016949, "grad_norm": 2.4268579483032227, "learning_rate": 6.921742584745763e-06, "loss": 1.2459, "mean_token_accuracy": 0.7240122556686401, "num_tokens": 18706400.0, "step": 23248 }, { "epoch": 6.157309322033898, "grad_norm": 2.2997610569000244, "learning_rate": 6.921477754237289e-06, "loss": 1.3908, "step": 23250 }, { "epoch": 6.157309322033898, "eval_loss": 1.3245999813079834, "eval_mean_token_accuracy": 0.7005485299152213, "eval_num_tokens": 18708051.0, "eval_runtime": 48.6055, "eval_samples_per_second": 6.337, "eval_steps_per_second": 6.337, "step": 23250 }, { "epoch": 6.157838983050848, "grad_norm": 2.5312552452087402, "learning_rate": 6.921212923728814e-06, "loss": 1.2835, "mean_token_accuracy": 0.6930287182331085, "num_tokens": 18709618.0, "step": 23252 }, { "epoch": 6.158368644067797, "grad_norm": 2.3110885620117188, "learning_rate": 6.9209480932203395e-06, "loss": 1.3196, "mean_token_accuracy": 0.6918823197484016, "num_tokens": 18711259.0, "step": 23254 }, { "epoch": 6.158898305084746, "grad_norm": 2.391155958175659, "learning_rate": 6.920683262711864e-06, "loss": 1.0457, "mean_token_accuracy": 0.7627784907817841, "num_tokens": 18712759.0, "step": 23256 }, { "epoch": 6.159427966101695, "grad_norm": 2.421116590499878, "learning_rate": 6.920418432203391e-06, "loss": 1.1471, "mean_token_accuracy": 0.7110109776258469, "num_tokens": 18714416.0, "step": 23258 }, { "epoch": 6.1599576271186445, "grad_norm": 2.47483491897583, "learning_rate": 6.920153601694916e-06, "loss": 1.3414, "mean_token_accuracy": 0.6994969695806503, "num_tokens": 18715939.0, "step": 23260 }, { "epoch": 6.160487288135593, "grad_norm": 2.3442561626434326, "learning_rate": 6.919888771186442e-06, "loss": 1.082, "mean_token_accuracy": 0.7645442113280296, "num_tokens": 18717621.0, "step": 23262 }, { "epoch": 6.161016949152542, "grad_norm": 1.9138565063476562, "learning_rate": 6.919623940677967e-06, "loss": 0.9832, "mean_token_accuracy": 0.7723972722887993, "num_tokens": 18719208.0, "step": 23264 }, { "epoch": 6.161546610169491, "grad_norm": 3.112593412399292, "learning_rate": 6.9193591101694924e-06, "loss": 1.6402, "mean_token_accuracy": 0.6452385187149048, "num_tokens": 18720817.0, "step": 23266 }, { "epoch": 6.16207627118644, "grad_norm": 2.1465377807617188, "learning_rate": 6.919094279661017e-06, "loss": 1.1206, "mean_token_accuracy": 0.728853352367878, "num_tokens": 18722354.0, "step": 23268 }, { "epoch": 6.16260593220339, "grad_norm": 2.630924940109253, "learning_rate": 6.918829449152543e-06, "loss": 1.4052, "mean_token_accuracy": 0.6732200905680656, "num_tokens": 18723937.0, "step": 23270 }, { "epoch": 6.163135593220339, "grad_norm": 2.413588285446167, "learning_rate": 6.918564618644068e-06, "loss": 1.454, "mean_token_accuracy": 0.6784090772271156, "num_tokens": 18725420.0, "step": 23272 }, { "epoch": 6.163665254237288, "grad_norm": 2.4966835975646973, "learning_rate": 6.918299788135594e-06, "loss": 1.1382, "mean_token_accuracy": 0.7344326078891754, "num_tokens": 18726644.0, "step": 23274 }, { "epoch": 6.164194915254237, "grad_norm": 2.330418348312378, "learning_rate": 6.918034957627119e-06, "loss": 1.0363, "mean_token_accuracy": 0.73813396692276, "num_tokens": 18728223.0, "step": 23276 }, { "epoch": 6.164724576271187, "grad_norm": 2.07242751121521, "learning_rate": 6.9177701271186446e-06, "loss": 1.1516, "mean_token_accuracy": 0.7347381114959717, "num_tokens": 18729883.0, "step": 23278 }, { "epoch": 6.165254237288136, "grad_norm": 2.2754018306732178, "learning_rate": 6.9175052966101695e-06, "loss": 1.3457, "mean_token_accuracy": 0.6993855014443398, "num_tokens": 18731379.0, "step": 23280 }, { "epoch": 6.165783898305085, "grad_norm": 2.33895206451416, "learning_rate": 6.917240466101695e-06, "loss": 1.3036, "mean_token_accuracy": 0.6892705857753754, "num_tokens": 18732947.0, "step": 23282 }, { "epoch": 6.166313559322034, "grad_norm": 2.0388596057891846, "learning_rate": 6.91697563559322e-06, "loss": 1.2039, "mean_token_accuracy": 0.7394380271434784, "num_tokens": 18734690.0, "step": 23284 }, { "epoch": 6.166843220338983, "grad_norm": 2.735849618911743, "learning_rate": 6.916710805084747e-06, "loss": 1.272, "mean_token_accuracy": 0.7226453721523285, "num_tokens": 18736491.0, "step": 23286 }, { "epoch": 6.1673728813559325, "grad_norm": 2.315216302871704, "learning_rate": 6.916445974576271e-06, "loss": 1.0507, "mean_token_accuracy": 0.7613738104701042, "num_tokens": 18738450.0, "step": 23288 }, { "epoch": 6.1679025423728815, "grad_norm": 2.7109625339508057, "learning_rate": 6.9161811440677975e-06, "loss": 1.4512, "mean_token_accuracy": 0.6602008417248726, "num_tokens": 18740043.0, "step": 23290 }, { "epoch": 6.1684322033898304, "grad_norm": 1.8428746461868286, "learning_rate": 6.9159163135593225e-06, "loss": 0.6672, "mean_token_accuracy": 0.8233186304569244, "num_tokens": 18741532.0, "step": 23292 }, { "epoch": 6.168961864406779, "grad_norm": 2.2980055809020996, "learning_rate": 6.915651483050848e-06, "loss": 1.2936, "mean_token_accuracy": 0.7018981575965881, "num_tokens": 18743106.0, "step": 23294 }, { "epoch": 6.169491525423728, "grad_norm": 2.7997424602508545, "learning_rate": 6.915386652542373e-06, "loss": 1.217, "mean_token_accuracy": 0.724400207400322, "num_tokens": 18744660.0, "step": 23296 }, { "epoch": 6.170021186440678, "grad_norm": 2.79620623588562, "learning_rate": 6.915121822033899e-06, "loss": 1.3911, "mean_token_accuracy": 0.7027194276452065, "num_tokens": 18746279.0, "step": 23298 }, { "epoch": 6.170550847457627, "grad_norm": 2.676234006881714, "learning_rate": 6.914856991525424e-06, "loss": 0.9721, "mean_token_accuracy": 0.7745082601904869, "num_tokens": 18747558.0, "step": 23300 }, { "epoch": 6.171080508474576, "grad_norm": 2.2657358646392822, "learning_rate": 6.91459216101695e-06, "loss": 1.2563, "mean_token_accuracy": 0.6995747983455658, "num_tokens": 18749163.0, "step": 23302 }, { "epoch": 6.171610169491525, "grad_norm": 2.3406124114990234, "learning_rate": 6.9143273305084746e-06, "loss": 1.3126, "mean_token_accuracy": 0.7149225398898125, "num_tokens": 18750666.0, "step": 23304 }, { "epoch": 6.172139830508475, "grad_norm": 2.616119384765625, "learning_rate": 6.9140625e-06, "loss": 1.4662, "mean_token_accuracy": 0.6829463839530945, "num_tokens": 18752201.0, "step": 23306 }, { "epoch": 6.172669491525424, "grad_norm": 2.487471580505371, "learning_rate": 6.913797669491526e-06, "loss": 1.096, "mean_token_accuracy": 0.7405272424221039, "num_tokens": 18753873.0, "step": 23308 }, { "epoch": 6.173199152542373, "grad_norm": 2.2613561153411865, "learning_rate": 6.913532838983051e-06, "loss": 1.2145, "mean_token_accuracy": 0.7492467127740383, "num_tokens": 18755386.0, "step": 23310 }, { "epoch": 6.173728813559322, "grad_norm": 3.168292284011841, "learning_rate": 6.913268008474578e-06, "loss": 1.6991, "mean_token_accuracy": 0.6400327906012535, "num_tokens": 18756828.0, "step": 23312 }, { "epoch": 6.174258474576272, "grad_norm": 2.828679084777832, "learning_rate": 6.913003177966103e-06, "loss": 1.209, "mean_token_accuracy": 0.7023315653204918, "num_tokens": 18758607.0, "step": 23314 }, { "epoch": 6.174788135593221, "grad_norm": 2.121258020401001, "learning_rate": 6.912738347457628e-06, "loss": 1.1479, "mean_token_accuracy": 0.727052815258503, "num_tokens": 18760384.0, "step": 23316 }, { "epoch": 6.1753177966101696, "grad_norm": 1.9724805355072021, "learning_rate": 6.912473516949153e-06, "loss": 0.9385, "mean_token_accuracy": 0.7803866341710091, "num_tokens": 18762103.0, "step": 23318 }, { "epoch": 6.1758474576271185, "grad_norm": 2.0810959339141846, "learning_rate": 6.912208686440679e-06, "loss": 1.0254, "mean_token_accuracy": 0.760213203728199, "num_tokens": 18763713.0, "step": 23320 }, { "epoch": 6.1763771186440675, "grad_norm": 2.5660159587860107, "learning_rate": 6.911943855932204e-06, "loss": 1.6674, "mean_token_accuracy": 0.6303233578801155, "num_tokens": 18765534.0, "step": 23322 }, { "epoch": 6.176906779661017, "grad_norm": 2.9587955474853516, "learning_rate": 6.91167902542373e-06, "loss": 1.5042, "mean_token_accuracy": 0.662571556866169, "num_tokens": 18766942.0, "step": 23324 }, { "epoch": 6.177436440677966, "grad_norm": 2.644906997680664, "learning_rate": 6.911414194915255e-06, "loss": 0.8779, "mean_token_accuracy": 0.7751384079456329, "num_tokens": 18768890.0, "step": 23326 }, { "epoch": 6.177966101694915, "grad_norm": 2.5660901069641113, "learning_rate": 6.9111493644067805e-06, "loss": 1.3192, "mean_token_accuracy": 0.7195360958576202, "num_tokens": 18770464.0, "step": 23328 }, { "epoch": 6.178495762711864, "grad_norm": 2.604508638381958, "learning_rate": 6.9108845338983054e-06, "loss": 1.7941, "mean_token_accuracy": 0.6081357449293137, "num_tokens": 18771966.0, "step": 23330 }, { "epoch": 6.179025423728813, "grad_norm": 2.295650005340576, "learning_rate": 6.910619703389831e-06, "loss": 1.5489, "mean_token_accuracy": 0.6770453676581383, "num_tokens": 18773657.0, "step": 23332 }, { "epoch": 6.179555084745763, "grad_norm": 1.8784008026123047, "learning_rate": 6.910354872881356e-06, "loss": 1.3925, "mean_token_accuracy": 0.690018467605114, "num_tokens": 18775351.0, "step": 23334 }, { "epoch": 6.180084745762712, "grad_norm": 2.056462049484253, "learning_rate": 6.910090042372882e-06, "loss": 1.3243, "mean_token_accuracy": 0.7049618884921074, "num_tokens": 18777041.0, "step": 23336 }, { "epoch": 6.180614406779661, "grad_norm": 3.547969102859497, "learning_rate": 6.909825211864407e-06, "loss": 0.9151, "mean_token_accuracy": 0.7511352971196175, "num_tokens": 18778939.0, "step": 23338 }, { "epoch": 6.18114406779661, "grad_norm": 2.3932814598083496, "learning_rate": 6.9095603813559335e-06, "loss": 1.0514, "mean_token_accuracy": 0.7562949806451797, "num_tokens": 18780380.0, "step": 23340 }, { "epoch": 6.18167372881356, "grad_norm": 2.515904426574707, "learning_rate": 6.9092955508474575e-06, "loss": 1.0563, "mean_token_accuracy": 0.7465176358819008, "num_tokens": 18782072.0, "step": 23342 }, { "epoch": 6.182203389830509, "grad_norm": 2.5734105110168457, "learning_rate": 6.909030720338984e-06, "loss": 1.1845, "mean_token_accuracy": 0.7338306531310081, "num_tokens": 18783509.0, "step": 23344 }, { "epoch": 6.182733050847458, "grad_norm": 1.9467591047286987, "learning_rate": 6.908765889830509e-06, "loss": 1.1727, "mean_token_accuracy": 0.7280617579817772, "num_tokens": 18785215.0, "step": 23346 }, { "epoch": 6.183262711864407, "grad_norm": 2.561863899230957, "learning_rate": 6.908501059322035e-06, "loss": 1.0425, "mean_token_accuracy": 0.7362741753458977, "num_tokens": 18786889.0, "step": 23348 }, { "epoch": 6.1837923728813555, "grad_norm": 2.0337305068969727, "learning_rate": 6.90823622881356e-06, "loss": 1.0369, "mean_token_accuracy": 0.7552799805998802, "num_tokens": 18788798.0, "step": 23350 }, { "epoch": 6.184322033898305, "grad_norm": 1.6791231632232666, "learning_rate": 6.907971398305086e-06, "loss": 1.0257, "mean_token_accuracy": 0.7364461123943329, "num_tokens": 18790665.0, "step": 23352 }, { "epoch": 6.184851694915254, "grad_norm": 2.060791015625, "learning_rate": 6.9077065677966105e-06, "loss": 0.8477, "mean_token_accuracy": 0.7878431603312492, "num_tokens": 18792173.0, "step": 23354 }, { "epoch": 6.185381355932203, "grad_norm": 2.4609737396240234, "learning_rate": 6.907441737288136e-06, "loss": 1.2481, "mean_token_accuracy": 0.7186056226491928, "num_tokens": 18793727.0, "step": 23356 }, { "epoch": 6.185911016949152, "grad_norm": 2.105473279953003, "learning_rate": 6.907176906779661e-06, "loss": 0.9595, "mean_token_accuracy": 0.7654151916503906, "num_tokens": 18795255.0, "step": 23358 }, { "epoch": 6.186440677966102, "grad_norm": 2.3334217071533203, "learning_rate": 6.906912076271187e-06, "loss": 0.8392, "mean_token_accuracy": 0.7940494790673256, "num_tokens": 18796826.0, "step": 23360 }, { "epoch": 6.186970338983051, "grad_norm": 2.1387383937835693, "learning_rate": 6.906647245762712e-06, "loss": 0.9865, "mean_token_accuracy": 0.7664254605770111, "num_tokens": 18798414.0, "step": 23362 }, { "epoch": 6.1875, "grad_norm": 2.9017550945281982, "learning_rate": 6.906382415254238e-06, "loss": 1.2865, "mean_token_accuracy": 0.6979459449648857, "num_tokens": 18799865.0, "step": 23364 }, { "epoch": 6.188029661016949, "grad_norm": 2.0012094974517822, "learning_rate": 6.906117584745763e-06, "loss": 0.9788, "mean_token_accuracy": 0.76700284704566, "num_tokens": 18801491.0, "step": 23366 }, { "epoch": 6.188559322033898, "grad_norm": 2.2660646438598633, "learning_rate": 6.905852754237289e-06, "loss": 0.7603, "mean_token_accuracy": 0.8112938702106476, "num_tokens": 18803292.0, "step": 23368 }, { "epoch": 6.189088983050848, "grad_norm": 2.6795871257781982, "learning_rate": 6.905587923728813e-06, "loss": 1.2253, "mean_token_accuracy": 0.701420471072197, "num_tokens": 18805010.0, "step": 23370 }, { "epoch": 6.189618644067797, "grad_norm": 2.547091007232666, "learning_rate": 6.90532309322034e-06, "loss": 1.3193, "mean_token_accuracy": 0.7279687747359276, "num_tokens": 18806560.0, "step": 23372 }, { "epoch": 6.190148305084746, "grad_norm": 2.522620916366577, "learning_rate": 6.905058262711865e-06, "loss": 1.1053, "mean_token_accuracy": 0.7265864685177803, "num_tokens": 18808153.0, "step": 23374 }, { "epoch": 6.190677966101695, "grad_norm": 2.37393856048584, "learning_rate": 6.904793432203391e-06, "loss": 1.1926, "mean_token_accuracy": 0.7222293838858604, "num_tokens": 18809922.0, "step": 23376 }, { "epoch": 6.1912076271186445, "grad_norm": 2.0243475437164307, "learning_rate": 6.904528601694916e-06, "loss": 0.8523, "mean_token_accuracy": 0.8000351563096046, "num_tokens": 18811552.0, "step": 23378 }, { "epoch": 6.191737288135593, "grad_norm": 2.3235201835632324, "learning_rate": 6.904263771186441e-06, "loss": 1.0025, "mean_token_accuracy": 0.74830162525177, "num_tokens": 18813360.0, "step": 23380 }, { "epoch": 6.192266949152542, "grad_norm": 2.5505905151367188, "learning_rate": 6.903998940677966e-06, "loss": 0.8495, "mean_token_accuracy": 0.7961575910449028, "num_tokens": 18814836.0, "step": 23382 }, { "epoch": 6.192796610169491, "grad_norm": 2.242443323135376, "learning_rate": 6.903734110169492e-06, "loss": 1.413, "mean_token_accuracy": 0.6810521557927132, "num_tokens": 18816448.0, "step": 23384 }, { "epoch": 6.19332627118644, "grad_norm": 2.4005074501037598, "learning_rate": 6.903469279661017e-06, "loss": 1.5515, "mean_token_accuracy": 0.7046966217458248, "num_tokens": 18818103.0, "step": 23386 }, { "epoch": 6.19385593220339, "grad_norm": 2.4052369594573975, "learning_rate": 6.903204449152543e-06, "loss": 1.179, "mean_token_accuracy": 0.7056542560458183, "num_tokens": 18819690.0, "step": 23388 }, { "epoch": 6.194385593220339, "grad_norm": 2.6048755645751953, "learning_rate": 6.902939618644068e-06, "loss": 1.0601, "mean_token_accuracy": 0.7388182953000069, "num_tokens": 18821042.0, "step": 23390 }, { "epoch": 6.194915254237288, "grad_norm": 1.9250617027282715, "learning_rate": 6.9026747881355935e-06, "loss": 1.044, "mean_token_accuracy": 0.7493315264582634, "num_tokens": 18822481.0, "step": 23392 }, { "epoch": 6.195444915254237, "grad_norm": 2.2579262256622314, "learning_rate": 6.90240995762712e-06, "loss": 1.3181, "mean_token_accuracy": 0.6921656429767609, "num_tokens": 18824266.0, "step": 23394 }, { "epoch": 6.195974576271187, "grad_norm": 2.128857135772705, "learning_rate": 6.902145127118644e-06, "loss": 1.0782, "mean_token_accuracy": 0.742162212729454, "num_tokens": 18825848.0, "step": 23396 }, { "epoch": 6.196504237288136, "grad_norm": 1.726523518562317, "learning_rate": 6.901880296610171e-06, "loss": 0.8545, "mean_token_accuracy": 0.7959001883864403, "num_tokens": 18827551.0, "step": 23398 }, { "epoch": 6.197033898305085, "grad_norm": 2.5387790203094482, "learning_rate": 6.901615466101696e-06, "loss": 1.206, "mean_token_accuracy": 0.7076945155858994, "num_tokens": 18829382.0, "step": 23400 }, { "epoch": 6.197563559322034, "grad_norm": 2.7328643798828125, "learning_rate": 6.9013506355932215e-06, "loss": 1.2964, "mean_token_accuracy": 0.703799344599247, "num_tokens": 18831062.0, "step": 23402 }, { "epoch": 6.198093220338983, "grad_norm": 2.776780843734741, "learning_rate": 6.9010858050847464e-06, "loss": 1.8279, "mean_token_accuracy": 0.6084116324782372, "num_tokens": 18832578.0, "step": 23404 }, { "epoch": 6.1986228813559325, "grad_norm": 2.602553606033325, "learning_rate": 6.900820974576272e-06, "loss": 1.0778, "mean_token_accuracy": 0.7322108522057533, "num_tokens": 18834190.0, "step": 23406 }, { "epoch": 6.1991525423728815, "grad_norm": 2.9253976345062256, "learning_rate": 6.900556144067797e-06, "loss": 0.9498, "mean_token_accuracy": 0.7855686619877815, "num_tokens": 18835511.0, "step": 23408 }, { "epoch": 6.1996822033898304, "grad_norm": 2.4893393516540527, "learning_rate": 6.900291313559323e-06, "loss": 1.4014, "mean_token_accuracy": 0.6867160275578499, "num_tokens": 18837108.0, "step": 23410 }, { "epoch": 6.200211864406779, "grad_norm": 2.1646077632904053, "learning_rate": 6.900026483050848e-06, "loss": 1.2328, "mean_token_accuracy": 0.7024250477552414, "num_tokens": 18838830.0, "step": 23412 }, { "epoch": 6.200741525423728, "grad_norm": 2.4475796222686768, "learning_rate": 6.899761652542374e-06, "loss": 1.004, "mean_token_accuracy": 0.7757871896028519, "num_tokens": 18840096.0, "step": 23414 }, { "epoch": 6.201271186440678, "grad_norm": 2.195322036743164, "learning_rate": 6.8994968220338986e-06, "loss": 1.1979, "mean_token_accuracy": 0.7052600234746933, "num_tokens": 18841549.0, "step": 23416 }, { "epoch": 6.201800847457627, "grad_norm": 2.4632997512817383, "learning_rate": 6.899231991525424e-06, "loss": 1.3005, "mean_token_accuracy": 0.7014549523591995, "num_tokens": 18842853.0, "step": 23418 }, { "epoch": 6.202330508474576, "grad_norm": 2.2553932666778564, "learning_rate": 6.898967161016949e-06, "loss": 0.9197, "mean_token_accuracy": 0.7746913433074951, "num_tokens": 18844470.0, "step": 23420 }, { "epoch": 6.202860169491525, "grad_norm": 2.5927398204803467, "learning_rate": 6.898702330508476e-06, "loss": 1.1691, "mean_token_accuracy": 0.7390670329332352, "num_tokens": 18845955.0, "step": 23422 }, { "epoch": 6.203389830508475, "grad_norm": 2.436758279800415, "learning_rate": 6.8984375e-06, "loss": 0.9258, "mean_token_accuracy": 0.7749988883733749, "num_tokens": 18847569.0, "step": 23424 }, { "epoch": 6.203919491525424, "grad_norm": 1.9886620044708252, "learning_rate": 6.898172669491527e-06, "loss": 1.2197, "mean_token_accuracy": 0.7229725793004036, "num_tokens": 18849543.0, "step": 23426 }, { "epoch": 6.204449152542373, "grad_norm": 2.6750881671905518, "learning_rate": 6.8979078389830515e-06, "loss": 1.1802, "mean_token_accuracy": 0.7128347754478455, "num_tokens": 18851040.0, "step": 23428 }, { "epoch": 6.204978813559322, "grad_norm": 2.1626105308532715, "learning_rate": 6.897643008474577e-06, "loss": 1.1081, "mean_token_accuracy": 0.7603476196527481, "num_tokens": 18852577.0, "step": 23430 }, { "epoch": 6.205508474576272, "grad_norm": 2.427457332611084, "learning_rate": 6.897378177966102e-06, "loss": 1.435, "mean_token_accuracy": 0.6765811294317245, "num_tokens": 18854181.0, "step": 23432 }, { "epoch": 6.206038135593221, "grad_norm": 2.246673345565796, "learning_rate": 6.897113347457628e-06, "loss": 1.1803, "mean_token_accuracy": 0.7205460965633392, "num_tokens": 18855991.0, "step": 23434 }, { "epoch": 6.2065677966101696, "grad_norm": 2.585782289505005, "learning_rate": 6.896848516949153e-06, "loss": 1.3373, "mean_token_accuracy": 0.6941710934042931, "num_tokens": 18857667.0, "step": 23436 }, { "epoch": 6.2070974576271185, "grad_norm": 2.7991344928741455, "learning_rate": 6.896583686440679e-06, "loss": 1.4007, "mean_token_accuracy": 0.6871449798345566, "num_tokens": 18859199.0, "step": 23438 }, { "epoch": 6.2076271186440675, "grad_norm": 2.1220221519470215, "learning_rate": 6.896318855932204e-06, "loss": 1.3289, "mean_token_accuracy": 0.7190974578261375, "num_tokens": 18860747.0, "step": 23440 }, { "epoch": 6.208156779661017, "grad_norm": 2.2542853355407715, "learning_rate": 6.896054025423729e-06, "loss": 1.4447, "mean_token_accuracy": 0.6741702258586884, "num_tokens": 18862466.0, "step": 23442 }, { "epoch": 6.208686440677966, "grad_norm": 2.570446729660034, "learning_rate": 6.895789194915254e-06, "loss": 1.3558, "mean_token_accuracy": 0.7000296711921692, "num_tokens": 18864132.0, "step": 23444 }, { "epoch": 6.209216101694915, "grad_norm": 2.5983569622039795, "learning_rate": 6.89552436440678e-06, "loss": 1.3341, "mean_token_accuracy": 0.7032175622880459, "num_tokens": 18865642.0, "step": 23446 }, { "epoch": 6.209745762711864, "grad_norm": 2.8558921813964844, "learning_rate": 6.895259533898305e-06, "loss": 1.2932, "mean_token_accuracy": 0.700022354722023, "num_tokens": 18867116.0, "step": 23448 }, { "epoch": 6.210275423728813, "grad_norm": 2.7575175762176514, "learning_rate": 6.894994703389831e-06, "loss": 1.1468, "mean_token_accuracy": 0.7295709550380707, "num_tokens": 18868815.0, "step": 23450 }, { "epoch": 6.210805084745763, "grad_norm": 2.610278606414795, "learning_rate": 6.894729872881356e-06, "loss": 1.3026, "mean_token_accuracy": 0.6979547515511513, "num_tokens": 18870438.0, "step": 23452 }, { "epoch": 6.211334745762712, "grad_norm": 2.4737889766693115, "learning_rate": 6.894465042372882e-06, "loss": 1.1752, "mean_token_accuracy": 0.7236137092113495, "num_tokens": 18871869.0, "step": 23454 }, { "epoch": 6.211864406779661, "grad_norm": 2.063000440597534, "learning_rate": 6.894200211864407e-06, "loss": 1.1282, "mean_token_accuracy": 0.7514843642711639, "num_tokens": 18873569.0, "step": 23456 }, { "epoch": 6.21239406779661, "grad_norm": 3.219416618347168, "learning_rate": 6.893935381355933e-06, "loss": 1.5984, "mean_token_accuracy": 0.6323413848876953, "num_tokens": 18875070.0, "step": 23458 }, { "epoch": 6.21292372881356, "grad_norm": 2.6460156440734863, "learning_rate": 6.893670550847458e-06, "loss": 1.0263, "mean_token_accuracy": 0.7644603326916695, "num_tokens": 18876632.0, "step": 23460 }, { "epoch": 6.213453389830509, "grad_norm": 2.321256637573242, "learning_rate": 6.893405720338984e-06, "loss": 1.2307, "mean_token_accuracy": 0.7124987691640854, "num_tokens": 18878091.0, "step": 23462 }, { "epoch": 6.213983050847458, "grad_norm": 2.7410192489624023, "learning_rate": 6.893140889830509e-06, "loss": 1.5832, "mean_token_accuracy": 0.654525600373745, "num_tokens": 18879621.0, "step": 23464 }, { "epoch": 6.214512711864407, "grad_norm": 2.3675198554992676, "learning_rate": 6.8928760593220345e-06, "loss": 0.9172, "mean_token_accuracy": 0.7704235687851906, "num_tokens": 18881173.0, "step": 23466 }, { "epoch": 6.2150423728813555, "grad_norm": 1.9802888631820679, "learning_rate": 6.892611228813559e-06, "loss": 1.3507, "mean_token_accuracy": 0.6984046325087547, "num_tokens": 18883017.0, "step": 23468 }, { "epoch": 6.215572033898305, "grad_norm": 2.684297561645508, "learning_rate": 6.892346398305085e-06, "loss": 1.1453, "mean_token_accuracy": 0.7321194857358932, "num_tokens": 18884386.0, "step": 23470 }, { "epoch": 6.216101694915254, "grad_norm": 2.2022345066070557, "learning_rate": 6.89208156779661e-06, "loss": 0.729, "mean_token_accuracy": 0.8154002726078033, "num_tokens": 18885738.0, "step": 23472 }, { "epoch": 6.216631355932203, "grad_norm": 2.4697775840759277, "learning_rate": 6.891816737288136e-06, "loss": 1.1479, "mean_token_accuracy": 0.7302949503064156, "num_tokens": 18887377.0, "step": 23474 }, { "epoch": 6.217161016949152, "grad_norm": 1.8073585033416748, "learning_rate": 6.8915519067796625e-06, "loss": 0.8511, "mean_token_accuracy": 0.7693746611475945, "num_tokens": 18889044.0, "step": 23476 }, { "epoch": 6.217690677966102, "grad_norm": 1.9760361909866333, "learning_rate": 6.891287076271187e-06, "loss": 1.0962, "mean_token_accuracy": 0.742322102189064, "num_tokens": 18890642.0, "step": 23478 }, { "epoch": 6.218220338983051, "grad_norm": 2.2439677715301514, "learning_rate": 6.891022245762713e-06, "loss": 1.101, "mean_token_accuracy": 0.7210014164447784, "num_tokens": 18892305.0, "step": 23480 }, { "epoch": 6.21875, "grad_norm": 2.278838634490967, "learning_rate": 6.890757415254238e-06, "loss": 1.076, "mean_token_accuracy": 0.7698581144213676, "num_tokens": 18893790.0, "step": 23482 }, { "epoch": 6.219279661016949, "grad_norm": 2.159059762954712, "learning_rate": 6.890492584745764e-06, "loss": 0.9678, "mean_token_accuracy": 0.7635764703154564, "num_tokens": 18895671.0, "step": 23484 }, { "epoch": 6.219809322033898, "grad_norm": 2.683769941329956, "learning_rate": 6.890227754237289e-06, "loss": 1.3933, "mean_token_accuracy": 0.7164430841803551, "num_tokens": 18897232.0, "step": 23486 }, { "epoch": 6.220338983050848, "grad_norm": 2.378023386001587, "learning_rate": 6.889962923728815e-06, "loss": 1.066, "mean_token_accuracy": 0.761969655752182, "num_tokens": 18898873.0, "step": 23488 }, { "epoch": 6.220868644067797, "grad_norm": 2.611496925354004, "learning_rate": 6.8896980932203396e-06, "loss": 1.0328, "mean_token_accuracy": 0.7444509714841843, "num_tokens": 18900653.0, "step": 23490 }, { "epoch": 6.221398305084746, "grad_norm": 2.4379069805145264, "learning_rate": 6.889433262711865e-06, "loss": 0.9412, "mean_token_accuracy": 0.769690990447998, "num_tokens": 18902112.0, "step": 23492 }, { "epoch": 6.221927966101695, "grad_norm": 2.222566604614258, "learning_rate": 6.88916843220339e-06, "loss": 1.2334, "mean_token_accuracy": 0.7265581041574478, "num_tokens": 18903684.0, "step": 23494 }, { "epoch": 6.2224576271186445, "grad_norm": 2.1479272842407227, "learning_rate": 6.888903601694916e-06, "loss": 0.9355, "mean_token_accuracy": 0.7676321640610695, "num_tokens": 18905138.0, "step": 23496 }, { "epoch": 6.222987288135593, "grad_norm": 2.414196014404297, "learning_rate": 6.888638771186441e-06, "loss": 1.1096, "mean_token_accuracy": 0.7503720447421074, "num_tokens": 18906892.0, "step": 23498 }, { "epoch": 6.223516949152542, "grad_norm": 2.992964744567871, "learning_rate": 6.888373940677967e-06, "loss": 1.4452, "step": 23500 }, { "epoch": 6.223516949152542, "eval_loss": 1.328089952468872, "eval_mean_token_accuracy": 0.700929385017265, "eval_num_tokens": 18908625.0, "eval_runtime": 48.5999, "eval_samples_per_second": 6.337, "eval_steps_per_second": 6.337, "step": 23500 }, { "epoch": 6.224046610169491, "grad_norm": 2.1927223205566406, "learning_rate": 6.888109110169492e-06, "loss": 1.5454, "mean_token_accuracy": 0.6580788716673851, "num_tokens": 18910087.0, "step": 23502 }, { "epoch": 6.22457627118644, "grad_norm": 1.9374243021011353, "learning_rate": 6.8878442796610175e-06, "loss": 0.6959, "mean_token_accuracy": 0.8306451812386513, "num_tokens": 18911673.0, "step": 23504 }, { "epoch": 6.22510593220339, "grad_norm": 2.4222567081451416, "learning_rate": 6.887579449152542e-06, "loss": 1.2891, "mean_token_accuracy": 0.7121362090110779, "num_tokens": 18913376.0, "step": 23506 }, { "epoch": 6.225635593220339, "grad_norm": 2.427029609680176, "learning_rate": 6.887314618644069e-06, "loss": 1.2909, "mean_token_accuracy": 0.7095412984490395, "num_tokens": 18914746.0, "step": 23508 }, { "epoch": 6.226165254237288, "grad_norm": 2.2554574012756348, "learning_rate": 6.887049788135594e-06, "loss": 1.6394, "mean_token_accuracy": 0.629866786301136, "num_tokens": 18916191.0, "step": 23510 }, { "epoch": 6.226694915254237, "grad_norm": 2.2626118659973145, "learning_rate": 6.88678495762712e-06, "loss": 1.1268, "mean_token_accuracy": 0.7187352702021599, "num_tokens": 18918262.0, "step": 23512 }, { "epoch": 6.227224576271187, "grad_norm": 2.5873749256134033, "learning_rate": 6.886520127118645e-06, "loss": 1.1454, "mean_token_accuracy": 0.7342748865485191, "num_tokens": 18919855.0, "step": 23514 }, { "epoch": 6.227754237288136, "grad_norm": 3.0284509658813477, "learning_rate": 6.8862552966101704e-06, "loss": 1.6678, "mean_token_accuracy": 0.6471857614815235, "num_tokens": 18921416.0, "step": 23516 }, { "epoch": 6.228283898305085, "grad_norm": 1.980718731880188, "learning_rate": 6.885990466101695e-06, "loss": 0.9021, "mean_token_accuracy": 0.7834478244185448, "num_tokens": 18923252.0, "step": 23518 }, { "epoch": 6.228813559322034, "grad_norm": 2.445084571838379, "learning_rate": 6.885725635593221e-06, "loss": 1.2664, "mean_token_accuracy": 0.7349399700760841, "num_tokens": 18924797.0, "step": 23520 }, { "epoch": 6.229343220338983, "grad_norm": 2.1505346298217773, "learning_rate": 6.885460805084746e-06, "loss": 1.649, "mean_token_accuracy": 0.6504701375961304, "num_tokens": 18926673.0, "step": 23522 }, { "epoch": 6.2298728813559325, "grad_norm": 2.0132434368133545, "learning_rate": 6.885195974576272e-06, "loss": 1.0934, "mean_token_accuracy": 0.7380069270730019, "num_tokens": 18928275.0, "step": 23524 }, { "epoch": 6.2304025423728815, "grad_norm": 2.7290375232696533, "learning_rate": 6.884931144067797e-06, "loss": 0.9499, "mean_token_accuracy": 0.7706289365887642, "num_tokens": 18929944.0, "step": 23526 }, { "epoch": 6.2309322033898304, "grad_norm": 3.1406004428863525, "learning_rate": 6.8846663135593225e-06, "loss": 0.9357, "mean_token_accuracy": 0.7734242305159569, "num_tokens": 18931429.0, "step": 23528 }, { "epoch": 6.231461864406779, "grad_norm": 2.020177125930786, "learning_rate": 6.8844014830508475e-06, "loss": 1.121, "mean_token_accuracy": 0.7204577922821045, "num_tokens": 18933543.0, "step": 23530 }, { "epoch": 6.231991525423728, "grad_norm": 2.3492517471313477, "learning_rate": 6.884136652542373e-06, "loss": 1.2296, "mean_token_accuracy": 0.7267884388566017, "num_tokens": 18935501.0, "step": 23532 }, { "epoch": 6.232521186440678, "grad_norm": 2.740835189819336, "learning_rate": 6.883871822033898e-06, "loss": 1.0684, "mean_token_accuracy": 0.7427431643009186, "num_tokens": 18936983.0, "step": 23534 }, { "epoch": 6.233050847457627, "grad_norm": 2.334651231765747, "learning_rate": 6.883606991525425e-06, "loss": 1.0394, "mean_token_accuracy": 0.7496548071503639, "num_tokens": 18938390.0, "step": 23536 }, { "epoch": 6.233580508474576, "grad_norm": 2.473789930343628, "learning_rate": 6.883342161016949e-06, "loss": 1.1135, "mean_token_accuracy": 0.7558166012167931, "num_tokens": 18939852.0, "step": 23538 }, { "epoch": 6.234110169491525, "grad_norm": 2.7265079021453857, "learning_rate": 6.8830773305084755e-06, "loss": 1.4025, "mean_token_accuracy": 0.7023884057998657, "num_tokens": 18941271.0, "step": 23540 }, { "epoch": 6.234639830508475, "grad_norm": 2.393083095550537, "learning_rate": 6.8828125000000004e-06, "loss": 1.2226, "mean_token_accuracy": 0.730535488575697, "num_tokens": 18942680.0, "step": 23542 }, { "epoch": 6.235169491525424, "grad_norm": 2.829026222229004, "learning_rate": 6.882547669491526e-06, "loss": 1.1901, "mean_token_accuracy": 0.7183569520711899, "num_tokens": 18944120.0, "step": 23544 }, { "epoch": 6.235699152542373, "grad_norm": 1.91561758518219, "learning_rate": 6.882282838983051e-06, "loss": 0.9119, "mean_token_accuracy": 0.7781347259879112, "num_tokens": 18946020.0, "step": 23546 }, { "epoch": 6.236228813559322, "grad_norm": 2.4488320350646973, "learning_rate": 6.882018008474577e-06, "loss": 1.3637, "mean_token_accuracy": 0.688386008143425, "num_tokens": 18947623.0, "step": 23548 }, { "epoch": 6.236758474576272, "grad_norm": 2.2427220344543457, "learning_rate": 6.881753177966102e-06, "loss": 1.0419, "mean_token_accuracy": 0.7510306239128113, "num_tokens": 18949083.0, "step": 23550 }, { "epoch": 6.237288135593221, "grad_norm": 3.917433500289917, "learning_rate": 6.881488347457628e-06, "loss": 1.1762, "mean_token_accuracy": 0.7339547425508499, "num_tokens": 18950597.0, "step": 23552 }, { "epoch": 6.2378177966101696, "grad_norm": 2.158252716064453, "learning_rate": 6.8812235169491526e-06, "loss": 0.997, "mean_token_accuracy": 0.7475185915827751, "num_tokens": 18952373.0, "step": 23554 }, { "epoch": 6.2383474576271185, "grad_norm": 2.57147479057312, "learning_rate": 6.880958686440678e-06, "loss": 1.3597, "mean_token_accuracy": 0.6849860846996307, "num_tokens": 18953985.0, "step": 23556 }, { "epoch": 6.2388771186440675, "grad_norm": 2.753762722015381, "learning_rate": 6.880693855932203e-06, "loss": 1.3556, "mean_token_accuracy": 0.686424121260643, "num_tokens": 18955805.0, "step": 23558 }, { "epoch": 6.239406779661017, "grad_norm": 3.0766501426696777, "learning_rate": 6.880429025423729e-06, "loss": 1.214, "mean_token_accuracy": 0.7215148881077766, "num_tokens": 18957164.0, "step": 23560 }, { "epoch": 6.239936440677966, "grad_norm": 2.78840708732605, "learning_rate": 6.880164194915256e-06, "loss": 1.5373, "mean_token_accuracy": 0.6559665873646736, "num_tokens": 18958694.0, "step": 23562 }, { "epoch": 6.240466101694915, "grad_norm": 2.6103641986846924, "learning_rate": 6.879899364406781e-06, "loss": 1.468, "mean_token_accuracy": 0.6942695714533329, "num_tokens": 18960634.0, "step": 23564 }, { "epoch": 6.240995762711864, "grad_norm": 1.725119948387146, "learning_rate": 6.879634533898306e-06, "loss": 0.7718, "mean_token_accuracy": 0.7877020686864853, "num_tokens": 18962383.0, "step": 23566 }, { "epoch": 6.241525423728813, "grad_norm": 2.307833433151245, "learning_rate": 6.879369703389831e-06, "loss": 1.2223, "mean_token_accuracy": 0.7091676667332649, "num_tokens": 18964124.0, "step": 23568 }, { "epoch": 6.242055084745763, "grad_norm": 2.2399682998657227, "learning_rate": 6.879104872881357e-06, "loss": 1.2114, "mean_token_accuracy": 0.739598885178566, "num_tokens": 18965698.0, "step": 23570 }, { "epoch": 6.242584745762712, "grad_norm": 2.5437440872192383, "learning_rate": 6.878840042372882e-06, "loss": 1.0269, "mean_token_accuracy": 0.7538695260882378, "num_tokens": 18967227.0, "step": 23572 }, { "epoch": 6.243114406779661, "grad_norm": 2.1795341968536377, "learning_rate": 6.878575211864408e-06, "loss": 0.7984, "mean_token_accuracy": 0.7952342480421066, "num_tokens": 18968716.0, "step": 23574 }, { "epoch": 6.24364406779661, "grad_norm": 2.153775215148926, "learning_rate": 6.878310381355933e-06, "loss": 1.3698, "mean_token_accuracy": 0.6946200504899025, "num_tokens": 18970299.0, "step": 23576 }, { "epoch": 6.24417372881356, "grad_norm": 2.39072322845459, "learning_rate": 6.8780455508474585e-06, "loss": 1.4963, "mean_token_accuracy": 0.7011492177844048, "num_tokens": 18971786.0, "step": 23578 }, { "epoch": 6.244703389830509, "grad_norm": 3.3337883949279785, "learning_rate": 6.877780720338983e-06, "loss": 0.8387, "mean_token_accuracy": 0.7708645537495613, "num_tokens": 18973256.0, "step": 23580 }, { "epoch": 6.245233050847458, "grad_norm": 1.988869309425354, "learning_rate": 6.877515889830509e-06, "loss": 0.8983, "mean_token_accuracy": 0.7509589940309525, "num_tokens": 18974717.0, "step": 23582 }, { "epoch": 6.245762711864407, "grad_norm": 2.2763493061065674, "learning_rate": 6.877251059322034e-06, "loss": 1.2116, "mean_token_accuracy": 0.7318190708756447, "num_tokens": 18976322.0, "step": 23584 }, { "epoch": 6.2462923728813555, "grad_norm": 2.4399476051330566, "learning_rate": 6.87698622881356e-06, "loss": 1.112, "mean_token_accuracy": 0.726888433098793, "num_tokens": 18977888.0, "step": 23586 }, { "epoch": 6.246822033898305, "grad_norm": 1.8108229637145996, "learning_rate": 6.876721398305085e-06, "loss": 0.8852, "mean_token_accuracy": 0.7767230495810509, "num_tokens": 18979536.0, "step": 23588 }, { "epoch": 6.247351694915254, "grad_norm": 2.7396597862243652, "learning_rate": 6.8764565677966114e-06, "loss": 1.0354, "mean_token_accuracy": 0.7736340463161469, "num_tokens": 18980757.0, "step": 23590 }, { "epoch": 6.247881355932203, "grad_norm": 2.325131893157959, "learning_rate": 6.8761917372881355e-06, "loss": 1.089, "mean_token_accuracy": 0.7359857261180878, "num_tokens": 18982383.0, "step": 23592 }, { "epoch": 6.248411016949152, "grad_norm": 2.6797287464141846, "learning_rate": 6.875926906779662e-06, "loss": 1.1552, "mean_token_accuracy": 0.7242193594574928, "num_tokens": 18983821.0, "step": 23594 }, { "epoch": 6.248940677966102, "grad_norm": 2.2332563400268555, "learning_rate": 6.875662076271187e-06, "loss": 1.3301, "mean_token_accuracy": 0.714467778801918, "num_tokens": 18985234.0, "step": 23596 }, { "epoch": 6.249470338983051, "grad_norm": 2.788205146789551, "learning_rate": 6.875397245762713e-06, "loss": 1.3642, "mean_token_accuracy": 0.721844308078289, "num_tokens": 18986687.0, "step": 23598 }, { "epoch": 6.25, "grad_norm": 2.614842176437378, "learning_rate": 6.875132415254238e-06, "loss": 0.8065, "mean_token_accuracy": 0.8040224760770798, "num_tokens": 18988059.0, "step": 23600 }, { "epoch": 6.250529661016949, "grad_norm": 2.7422690391540527, "learning_rate": 6.8748675847457636e-06, "loss": 1.2223, "mean_token_accuracy": 0.741818368434906, "num_tokens": 18989696.0, "step": 23602 }, { "epoch": 6.251059322033898, "grad_norm": 2.304683208465576, "learning_rate": 6.8746027542372885e-06, "loss": 1.2933, "mean_token_accuracy": 0.7016461789608002, "num_tokens": 18991365.0, "step": 23604 }, { "epoch": 6.251588983050848, "grad_norm": 2.543492555618286, "learning_rate": 6.874337923728814e-06, "loss": 1.1614, "mean_token_accuracy": 0.7080052196979523, "num_tokens": 18993149.0, "step": 23606 }, { "epoch": 6.252118644067797, "grad_norm": 2.739957571029663, "learning_rate": 6.874073093220339e-06, "loss": 1.2379, "mean_token_accuracy": 0.7164287939667702, "num_tokens": 18994565.0, "step": 23608 }, { "epoch": 6.252648305084746, "grad_norm": 2.698610782623291, "learning_rate": 6.873808262711865e-06, "loss": 1.649, "mean_token_accuracy": 0.6322730034589767, "num_tokens": 18996482.0, "step": 23610 }, { "epoch": 6.253177966101695, "grad_norm": 2.473139524459839, "learning_rate": 6.87354343220339e-06, "loss": 1.0133, "mean_token_accuracy": 0.771349124610424, "num_tokens": 18997890.0, "step": 23612 }, { "epoch": 6.2537076271186445, "grad_norm": 2.3906989097595215, "learning_rate": 6.873278601694916e-06, "loss": 1.2363, "mean_token_accuracy": 0.7105414569377899, "num_tokens": 18999430.0, "step": 23614 }, { "epoch": 6.254237288135593, "grad_norm": 2.2409443855285645, "learning_rate": 6.873013771186441e-06, "loss": 1.2173, "mean_token_accuracy": 0.7150374725461006, "num_tokens": 19001225.0, "step": 23616 }, { "epoch": 6.254766949152542, "grad_norm": 2.08612322807312, "learning_rate": 6.872748940677967e-06, "loss": 1.6069, "mean_token_accuracy": 0.6732399016618729, "num_tokens": 19002995.0, "step": 23618 }, { "epoch": 6.255296610169491, "grad_norm": 2.36411714553833, "learning_rate": 6.872484110169491e-06, "loss": 1.3596, "mean_token_accuracy": 0.7221879735589027, "num_tokens": 19004442.0, "step": 23620 }, { "epoch": 6.25582627118644, "grad_norm": 2.5937485694885254, "learning_rate": 6.872219279661018e-06, "loss": 1.5727, "mean_token_accuracy": 0.6325289532542229, "num_tokens": 19006076.0, "step": 23622 }, { "epoch": 6.25635593220339, "grad_norm": 1.7744117975234985, "learning_rate": 6.871954449152543e-06, "loss": 1.2795, "mean_token_accuracy": 0.7058842442929745, "num_tokens": 19007812.0, "step": 23624 }, { "epoch": 6.256885593220339, "grad_norm": 2.003828287124634, "learning_rate": 6.871689618644069e-06, "loss": 1.4391, "mean_token_accuracy": 0.6917689740657806, "num_tokens": 19009542.0, "step": 23626 }, { "epoch": 6.257415254237288, "grad_norm": 2.7383928298950195, "learning_rate": 6.8714247881355936e-06, "loss": 1.1496, "mean_token_accuracy": 0.7170062065124512, "num_tokens": 19011035.0, "step": 23628 }, { "epoch": 6.257944915254237, "grad_norm": 2.72440242767334, "learning_rate": 6.871159957627119e-06, "loss": 1.483, "mean_token_accuracy": 0.7100159823894501, "num_tokens": 19012392.0, "step": 23630 }, { "epoch": 6.258474576271187, "grad_norm": 2.3850820064544678, "learning_rate": 6.870895127118644e-06, "loss": 1.2065, "mean_token_accuracy": 0.7374257743358612, "num_tokens": 19013881.0, "step": 23632 }, { "epoch": 6.259004237288136, "grad_norm": 2.501046895980835, "learning_rate": 6.87063029661017e-06, "loss": 1.1398, "mean_token_accuracy": 0.7518542744219303, "num_tokens": 19015317.0, "step": 23634 }, { "epoch": 6.259533898305085, "grad_norm": 2.4626388549804688, "learning_rate": 6.870365466101695e-06, "loss": 1.2525, "mean_token_accuracy": 0.7263255342841148, "num_tokens": 19016779.0, "step": 23636 }, { "epoch": 6.260063559322034, "grad_norm": 2.458414077758789, "learning_rate": 6.870100635593221e-06, "loss": 1.2203, "mean_token_accuracy": 0.7293453067541122, "num_tokens": 19018185.0, "step": 23638 }, { "epoch": 6.260593220338983, "grad_norm": 2.7134926319122314, "learning_rate": 6.869835805084746e-06, "loss": 1.1692, "mean_token_accuracy": 0.7218040004372597, "num_tokens": 19019956.0, "step": 23640 }, { "epoch": 6.2611228813559325, "grad_norm": 2.4979987144470215, "learning_rate": 6.8695709745762715e-06, "loss": 0.7967, "mean_token_accuracy": 0.7956327125430107, "num_tokens": 19021504.0, "step": 23642 }, { "epoch": 6.2616525423728815, "grad_norm": 2.0939977169036865, "learning_rate": 6.869306144067798e-06, "loss": 1.0143, "mean_token_accuracy": 0.764806441962719, "num_tokens": 19022923.0, "step": 23644 }, { "epoch": 6.2621822033898304, "grad_norm": 2.088395595550537, "learning_rate": 6.869041313559322e-06, "loss": 0.9068, "mean_token_accuracy": 0.7586642727255821, "num_tokens": 19024322.0, "step": 23646 }, { "epoch": 6.262711864406779, "grad_norm": 2.05873441696167, "learning_rate": 6.868776483050849e-06, "loss": 1.0139, "mean_token_accuracy": 0.7247593849897385, "num_tokens": 19025909.0, "step": 23648 }, { "epoch": 6.263241525423728, "grad_norm": 2.561876058578491, "learning_rate": 6.868511652542374e-06, "loss": 1.2386, "mean_token_accuracy": 0.7081401124596596, "num_tokens": 19027297.0, "step": 23650 }, { "epoch": 6.263771186440678, "grad_norm": 2.3622865676879883, "learning_rate": 6.8682468220338995e-06, "loss": 0.9521, "mean_token_accuracy": 0.7719614207744598, "num_tokens": 19028877.0, "step": 23652 }, { "epoch": 6.264300847457627, "grad_norm": 2.066115140914917, "learning_rate": 6.867981991525424e-06, "loss": 1.0751, "mean_token_accuracy": 0.7443999014794827, "num_tokens": 19030474.0, "step": 23654 }, { "epoch": 6.264830508474576, "grad_norm": 2.6458654403686523, "learning_rate": 6.86771716101695e-06, "loss": 1.3424, "mean_token_accuracy": 0.6877105385065079, "num_tokens": 19031887.0, "step": 23656 }, { "epoch": 6.265360169491525, "grad_norm": 2.530027151107788, "learning_rate": 6.867452330508475e-06, "loss": 1.5009, "mean_token_accuracy": 0.6762919537723064, "num_tokens": 19033403.0, "step": 23658 }, { "epoch": 6.265889830508475, "grad_norm": 1.8330320119857788, "learning_rate": 6.867187500000001e-06, "loss": 0.9782, "mean_token_accuracy": 0.7550478726625443, "num_tokens": 19035796.0, "step": 23660 }, { "epoch": 6.266419491525424, "grad_norm": 1.8944011926651, "learning_rate": 6.866922669491526e-06, "loss": 0.9023, "mean_token_accuracy": 0.7504293397068977, "num_tokens": 19037310.0, "step": 23662 }, { "epoch": 6.266949152542373, "grad_norm": 2.183913230895996, "learning_rate": 6.866657838983052e-06, "loss": 1.0245, "mean_token_accuracy": 0.7532176896929741, "num_tokens": 19038797.0, "step": 23664 }, { "epoch": 6.267478813559322, "grad_norm": 2.255143642425537, "learning_rate": 6.8663930084745765e-06, "loss": 1.2049, "mean_token_accuracy": 0.7130619138479233, "num_tokens": 19040545.0, "step": 23666 }, { "epoch": 6.268008474576272, "grad_norm": 2.471339225769043, "learning_rate": 6.866128177966102e-06, "loss": 1.3549, "mean_token_accuracy": 0.7122226990759373, "num_tokens": 19042262.0, "step": 23668 }, { "epoch": 6.268538135593221, "grad_norm": 2.525560140609741, "learning_rate": 6.865863347457627e-06, "loss": 1.0415, "mean_token_accuracy": 0.7316663786768913, "num_tokens": 19044102.0, "step": 23670 }, { "epoch": 6.2690677966101696, "grad_norm": 1.983251929283142, "learning_rate": 6.865598516949154e-06, "loss": 1.1519, "mean_token_accuracy": 0.7349314764142036, "num_tokens": 19045837.0, "step": 23672 }, { "epoch": 6.2695974576271185, "grad_norm": 2.5388901233673096, "learning_rate": 6.865333686440678e-06, "loss": 1.4565, "mean_token_accuracy": 0.6683029755949974, "num_tokens": 19047477.0, "step": 23674 }, { "epoch": 6.2701271186440675, "grad_norm": 2.151512384414673, "learning_rate": 6.8650688559322046e-06, "loss": 1.4041, "mean_token_accuracy": 0.6925978288054466, "num_tokens": 19049415.0, "step": 23676 }, { "epoch": 6.270656779661017, "grad_norm": 1.9045255184173584, "learning_rate": 6.8648040254237295e-06, "loss": 0.977, "mean_token_accuracy": 0.7700632661581039, "num_tokens": 19051082.0, "step": 23678 }, { "epoch": 6.271186440677966, "grad_norm": 3.0269174575805664, "learning_rate": 6.864539194915255e-06, "loss": 1.2038, "mean_token_accuracy": 0.731176383793354, "num_tokens": 19052867.0, "step": 23680 }, { "epoch": 6.271716101694915, "grad_norm": 2.46804141998291, "learning_rate": 6.86427436440678e-06, "loss": 1.1806, "mean_token_accuracy": 0.7237693443894386, "num_tokens": 19054317.0, "step": 23682 }, { "epoch": 6.272245762711864, "grad_norm": 2.187290906906128, "learning_rate": 6.864009533898306e-06, "loss": 1.5923, "mean_token_accuracy": 0.6722119525074959, "num_tokens": 19055901.0, "step": 23684 }, { "epoch": 6.272775423728813, "grad_norm": 1.893794059753418, "learning_rate": 6.863744703389831e-06, "loss": 0.6414, "mean_token_accuracy": 0.8217777386307716, "num_tokens": 19057476.0, "step": 23686 }, { "epoch": 6.273305084745763, "grad_norm": 2.377744674682617, "learning_rate": 6.863479872881357e-06, "loss": 1.3705, "mean_token_accuracy": 0.686233788728714, "num_tokens": 19059339.0, "step": 23688 }, { "epoch": 6.273834745762712, "grad_norm": 2.1808924674987793, "learning_rate": 6.863215042372882e-06, "loss": 1.1142, "mean_token_accuracy": 0.7214305028319359, "num_tokens": 19060796.0, "step": 23690 }, { "epoch": 6.274364406779661, "grad_norm": 2.4904210567474365, "learning_rate": 6.862950211864407e-06, "loss": 1.0822, "mean_token_accuracy": 0.7578399255871773, "num_tokens": 19062330.0, "step": 23692 }, { "epoch": 6.27489406779661, "grad_norm": 2.4179344177246094, "learning_rate": 6.862685381355932e-06, "loss": 1.014, "mean_token_accuracy": 0.7490353435277939, "num_tokens": 19064072.0, "step": 23694 }, { "epoch": 6.27542372881356, "grad_norm": 2.7789833545684814, "learning_rate": 6.862420550847458e-06, "loss": 1.1963, "mean_token_accuracy": 0.7255723103880882, "num_tokens": 19065706.0, "step": 23696 }, { "epoch": 6.275953389830509, "grad_norm": 3.0446360111236572, "learning_rate": 6.862155720338983e-06, "loss": 1.402, "mean_token_accuracy": 0.7089183777570724, "num_tokens": 19067521.0, "step": 23698 }, { "epoch": 6.276483050847458, "grad_norm": 2.5615627765655518, "learning_rate": 6.861890889830509e-06, "loss": 1.2472, "mean_token_accuracy": 0.7287187315523624, "num_tokens": 19069161.0, "step": 23700 }, { "epoch": 6.277012711864407, "grad_norm": 2.329612970352173, "learning_rate": 6.861626059322034e-06, "loss": 1.1248, "mean_token_accuracy": 0.7393853440880775, "num_tokens": 19070700.0, "step": 23702 }, { "epoch": 6.2775423728813555, "grad_norm": 2.559237003326416, "learning_rate": 6.86136122881356e-06, "loss": 1.2683, "mean_token_accuracy": 0.7002177610993385, "num_tokens": 19072394.0, "step": 23704 }, { "epoch": 6.278072033898305, "grad_norm": 1.9681589603424072, "learning_rate": 6.861096398305085e-06, "loss": 1.1929, "mean_token_accuracy": 0.7212597206234932, "num_tokens": 19074251.0, "step": 23706 }, { "epoch": 6.278601694915254, "grad_norm": 2.4790842533111572, "learning_rate": 6.860831567796611e-06, "loss": 1.159, "mean_token_accuracy": 0.7361353188753128, "num_tokens": 19075585.0, "step": 23708 }, { "epoch": 6.279131355932203, "grad_norm": 2.3162765502929688, "learning_rate": 6.860566737288136e-06, "loss": 0.9664, "mean_token_accuracy": 0.7496310174465179, "num_tokens": 19077203.0, "step": 23710 }, { "epoch": 6.279661016949152, "grad_norm": 2.4419796466827393, "learning_rate": 6.860301906779662e-06, "loss": 0.8239, "mean_token_accuracy": 0.7879745811223984, "num_tokens": 19078737.0, "step": 23712 }, { "epoch": 6.280190677966102, "grad_norm": 2.192275047302246, "learning_rate": 6.860037076271187e-06, "loss": 1.446, "mean_token_accuracy": 0.7045784816145897, "num_tokens": 19080596.0, "step": 23714 }, { "epoch": 6.280720338983051, "grad_norm": 2.7727105617523193, "learning_rate": 6.8597722457627125e-06, "loss": 1.7096, "mean_token_accuracy": 0.6303550750017166, "num_tokens": 19082257.0, "step": 23716 }, { "epoch": 6.28125, "grad_norm": 2.4547019004821777, "learning_rate": 6.859507415254237e-06, "loss": 1.1361, "mean_token_accuracy": 0.7314660623669624, "num_tokens": 19083582.0, "step": 23718 }, { "epoch": 6.281779661016949, "grad_norm": 2.257617235183716, "learning_rate": 6.859242584745763e-06, "loss": 1.141, "mean_token_accuracy": 0.7323609367012978, "num_tokens": 19085156.0, "step": 23720 }, { "epoch": 6.282309322033898, "grad_norm": 2.2189555168151855, "learning_rate": 6.858977754237288e-06, "loss": 1.2956, "mean_token_accuracy": 0.709165595471859, "num_tokens": 19086689.0, "step": 23722 }, { "epoch": 6.282838983050848, "grad_norm": 2.3365283012390137, "learning_rate": 6.858712923728814e-06, "loss": 1.5205, "mean_token_accuracy": 0.6579865291714668, "num_tokens": 19088391.0, "step": 23724 }, { "epoch": 6.283368644067797, "grad_norm": 2.433046340942383, "learning_rate": 6.858448093220339e-06, "loss": 1.4352, "mean_token_accuracy": 0.6955468356609344, "num_tokens": 19089917.0, "step": 23726 }, { "epoch": 6.283898305084746, "grad_norm": 2.391900062561035, "learning_rate": 6.858183262711865e-06, "loss": 1.528, "mean_token_accuracy": 0.6854164004325867, "num_tokens": 19091428.0, "step": 23728 }, { "epoch": 6.284427966101695, "grad_norm": 2.2267115116119385, "learning_rate": 6.857918432203391e-06, "loss": 0.7881, "mean_token_accuracy": 0.7941606119275093, "num_tokens": 19092884.0, "step": 23730 }, { "epoch": 6.2849576271186445, "grad_norm": 3.795219659805298, "learning_rate": 6.857653601694916e-06, "loss": 1.2849, "mean_token_accuracy": 0.7067897170782089, "num_tokens": 19094575.0, "step": 23732 }, { "epoch": 6.285487288135593, "grad_norm": 3.005913734436035, "learning_rate": 6.857388771186442e-06, "loss": 1.2233, "mean_token_accuracy": 0.7081910371780396, "num_tokens": 19095846.0, "step": 23734 }, { "epoch": 6.286016949152542, "grad_norm": 2.376433849334717, "learning_rate": 6.857123940677967e-06, "loss": 1.3263, "mean_token_accuracy": 0.7015671655535698, "num_tokens": 19097496.0, "step": 23736 }, { "epoch": 6.286546610169491, "grad_norm": 2.9183807373046875, "learning_rate": 6.856859110169493e-06, "loss": 1.2044, "mean_token_accuracy": 0.7378528192639351, "num_tokens": 19098970.0, "step": 23738 }, { "epoch": 6.28707627118644, "grad_norm": 2.2756927013397217, "learning_rate": 6.8565942796610176e-06, "loss": 1.2649, "mean_token_accuracy": 0.7039492949843407, "num_tokens": 19100809.0, "step": 23740 }, { "epoch": 6.28760593220339, "grad_norm": 2.597386360168457, "learning_rate": 6.856329449152543e-06, "loss": 1.4065, "mean_token_accuracy": 0.677451103925705, "num_tokens": 19102383.0, "step": 23742 }, { "epoch": 6.288135593220339, "grad_norm": 2.125544309616089, "learning_rate": 6.856064618644068e-06, "loss": 0.8704, "mean_token_accuracy": 0.7697978541254997, "num_tokens": 19104071.0, "step": 23744 }, { "epoch": 6.288665254237288, "grad_norm": 1.700151801109314, "learning_rate": 6.855799788135594e-06, "loss": 0.8823, "mean_token_accuracy": 0.7996710762381554, "num_tokens": 19105594.0, "step": 23746 }, { "epoch": 6.289194915254237, "grad_norm": 2.8088271617889404, "learning_rate": 6.855534957627119e-06, "loss": 1.3325, "mean_token_accuracy": 0.6974772289395332, "num_tokens": 19107187.0, "step": 23748 }, { "epoch": 6.289724576271187, "grad_norm": 2.718276023864746, "learning_rate": 6.855270127118645e-06, "loss": 1.0721, "step": 23750 }, { "epoch": 6.289724576271187, "eval_loss": 1.3291486501693726, "eval_mean_token_accuracy": 0.7006300405248419, "eval_num_tokens": 19108615.0, "eval_runtime": 48.6839, "eval_samples_per_second": 6.327, "eval_steps_per_second": 6.327, "step": 23750 }, { "epoch": 6.290254237288136, "grad_norm": 2.6352713108062744, "learning_rate": 6.85500529661017e-06, "loss": 0.9734, "mean_token_accuracy": 0.7504377514123917, "num_tokens": 19110068.0, "step": 23752 }, { "epoch": 6.290783898305085, "grad_norm": 2.9678092002868652, "learning_rate": 6.8547404661016954e-06, "loss": 1.2517, "mean_token_accuracy": 0.6984504349529743, "num_tokens": 19111519.0, "step": 23754 }, { "epoch": 6.291313559322034, "grad_norm": 1.9241441488265991, "learning_rate": 6.85447563559322e-06, "loss": 1.1622, "mean_token_accuracy": 0.715150959789753, "num_tokens": 19113121.0, "step": 23756 }, { "epoch": 6.291843220338983, "grad_norm": 1.9486126899719238, "learning_rate": 6.854210805084747e-06, "loss": 1.458, "mean_token_accuracy": 0.6572184935212135, "num_tokens": 19114928.0, "step": 23758 }, { "epoch": 6.2923728813559325, "grad_norm": 3.0350208282470703, "learning_rate": 6.853945974576272e-06, "loss": 1.3238, "mean_token_accuracy": 0.6880234107375145, "num_tokens": 19116302.0, "step": 23760 }, { "epoch": 6.2929025423728815, "grad_norm": 2.1384479999542236, "learning_rate": 6.853681144067798e-06, "loss": 1.2666, "mean_token_accuracy": 0.7155209928750992, "num_tokens": 19117930.0, "step": 23762 }, { "epoch": 6.2934322033898304, "grad_norm": 2.0814080238342285, "learning_rate": 6.853416313559323e-06, "loss": 0.794, "mean_token_accuracy": 0.7902712970972061, "num_tokens": 19119656.0, "step": 23764 }, { "epoch": 6.293961864406779, "grad_norm": 2.5205066204071045, "learning_rate": 6.853151483050848e-06, "loss": 1.1242, "mean_token_accuracy": 0.7535274773836136, "num_tokens": 19121235.0, "step": 23766 }, { "epoch": 6.294491525423728, "grad_norm": 2.6108155250549316, "learning_rate": 6.852886652542373e-06, "loss": 1.3659, "mean_token_accuracy": 0.6892019659280777, "num_tokens": 19122739.0, "step": 23768 }, { "epoch": 6.295021186440678, "grad_norm": 2.367926836013794, "learning_rate": 6.852621822033899e-06, "loss": 1.3455, "mean_token_accuracy": 0.7112544625997543, "num_tokens": 19124222.0, "step": 23770 }, { "epoch": 6.295550847457627, "grad_norm": 2.1926450729370117, "learning_rate": 6.852356991525424e-06, "loss": 1.7191, "mean_token_accuracy": 0.6283837556838989, "num_tokens": 19125946.0, "step": 23772 }, { "epoch": 6.296080508474576, "grad_norm": 2.206855297088623, "learning_rate": 6.85209216101695e-06, "loss": 1.0662, "mean_token_accuracy": 0.7593743056058884, "num_tokens": 19127663.0, "step": 23774 }, { "epoch": 6.296610169491525, "grad_norm": 2.2618701457977295, "learning_rate": 6.851827330508475e-06, "loss": 0.8059, "mean_token_accuracy": 0.7928135246038437, "num_tokens": 19129298.0, "step": 23776 }, { "epoch": 6.297139830508475, "grad_norm": 2.784972906112671, "learning_rate": 6.8515625000000005e-06, "loss": 1.3262, "mean_token_accuracy": 0.7036134675145149, "num_tokens": 19130670.0, "step": 23778 }, { "epoch": 6.297669491525424, "grad_norm": 2.5200304985046387, "learning_rate": 6.8512976694915255e-06, "loss": 0.9537, "mean_token_accuracy": 0.7603101134300232, "num_tokens": 19132291.0, "step": 23780 }, { "epoch": 6.298199152542373, "grad_norm": 2.1282968521118164, "learning_rate": 6.851032838983051e-06, "loss": 1.2929, "mean_token_accuracy": 0.716436043381691, "num_tokens": 19133949.0, "step": 23782 }, { "epoch": 6.298728813559322, "grad_norm": 2.421067714691162, "learning_rate": 6.850768008474576e-06, "loss": 0.9555, "mean_token_accuracy": 0.7628657147288322, "num_tokens": 19135349.0, "step": 23784 }, { "epoch": 6.299258474576272, "grad_norm": 2.3869636058807373, "learning_rate": 6.850503177966103e-06, "loss": 1.0318, "mean_token_accuracy": 0.736539214849472, "num_tokens": 19136915.0, "step": 23786 }, { "epoch": 6.299788135593221, "grad_norm": 1.999981164932251, "learning_rate": 6.850238347457627e-06, "loss": 1.1779, "mean_token_accuracy": 0.7353490367531776, "num_tokens": 19138666.0, "step": 23788 }, { "epoch": 6.3003177966101696, "grad_norm": 2.6971728801727295, "learning_rate": 6.8499735169491535e-06, "loss": 1.4241, "mean_token_accuracy": 0.6728104501962662, "num_tokens": 19140087.0, "step": 23790 }, { "epoch": 6.3008474576271185, "grad_norm": 2.8444554805755615, "learning_rate": 6.849708686440678e-06, "loss": 1.2311, "mean_token_accuracy": 0.7163712158799171, "num_tokens": 19141551.0, "step": 23792 }, { "epoch": 6.3013771186440675, "grad_norm": 2.525860071182251, "learning_rate": 6.849443855932204e-06, "loss": 1.0607, "mean_token_accuracy": 0.7577617093920708, "num_tokens": 19143090.0, "step": 23794 }, { "epoch": 6.301906779661017, "grad_norm": 2.574753522872925, "learning_rate": 6.849179025423729e-06, "loss": 1.254, "mean_token_accuracy": 0.7392324432730675, "num_tokens": 19144611.0, "step": 23796 }, { "epoch": 6.302436440677966, "grad_norm": 2.3091578483581543, "learning_rate": 6.848914194915255e-06, "loss": 0.7216, "mean_token_accuracy": 0.8095157667994499, "num_tokens": 19145980.0, "step": 23798 }, { "epoch": 6.302966101694915, "grad_norm": 2.5192089080810547, "learning_rate": 6.84864936440678e-06, "loss": 1.2696, "mean_token_accuracy": 0.6916539520025253, "num_tokens": 19147504.0, "step": 23800 }, { "epoch": 6.303495762711864, "grad_norm": 2.4954094886779785, "learning_rate": 6.848384533898306e-06, "loss": 1.5679, "mean_token_accuracy": 0.6366858631372452, "num_tokens": 19149124.0, "step": 23802 }, { "epoch": 6.304025423728813, "grad_norm": 2.334813356399536, "learning_rate": 6.8481197033898305e-06, "loss": 1.1584, "mean_token_accuracy": 0.7336350306868553, "num_tokens": 19150736.0, "step": 23804 }, { "epoch": 6.304555084745763, "grad_norm": 2.480513572692871, "learning_rate": 6.847854872881356e-06, "loss": 1.2571, "mean_token_accuracy": 0.694191463291645, "num_tokens": 19152363.0, "step": 23806 }, { "epoch": 6.305084745762712, "grad_norm": 2.621812343597412, "learning_rate": 6.847590042372881e-06, "loss": 1.1541, "mean_token_accuracy": 0.7303311228752136, "num_tokens": 19154113.0, "step": 23808 }, { "epoch": 6.305614406779661, "grad_norm": 2.3571135997772217, "learning_rate": 6.847325211864407e-06, "loss": 1.4966, "mean_token_accuracy": 0.6680369600653648, "num_tokens": 19155642.0, "step": 23810 }, { "epoch": 6.30614406779661, "grad_norm": 2.719101905822754, "learning_rate": 6.847060381355934e-06, "loss": 1.3869, "mean_token_accuracy": 0.717399425804615, "num_tokens": 19157107.0, "step": 23812 }, { "epoch": 6.30667372881356, "grad_norm": 2.4473321437835693, "learning_rate": 6.8467955508474586e-06, "loss": 1.0006, "mean_token_accuracy": 0.7636679261922836, "num_tokens": 19158449.0, "step": 23814 }, { "epoch": 6.307203389830509, "grad_norm": 2.395951509475708, "learning_rate": 6.846530720338984e-06, "loss": 1.0819, "mean_token_accuracy": 0.7312701791524887, "num_tokens": 19160025.0, "step": 23816 }, { "epoch": 6.307733050847458, "grad_norm": 2.0618269443511963, "learning_rate": 6.846265889830509e-06, "loss": 1.0025, "mean_token_accuracy": 0.7464650273323059, "num_tokens": 19161634.0, "step": 23818 }, { "epoch": 6.308262711864407, "grad_norm": 2.305236339569092, "learning_rate": 6.846001059322035e-06, "loss": 1.1206, "mean_token_accuracy": 0.7269309163093567, "num_tokens": 19163212.0, "step": 23820 }, { "epoch": 6.3087923728813555, "grad_norm": 2.1636955738067627, "learning_rate": 6.84573622881356e-06, "loss": 1.3868, "mean_token_accuracy": 0.7072046473622322, "num_tokens": 19164929.0, "step": 23822 }, { "epoch": 6.309322033898305, "grad_norm": 1.90862238407135, "learning_rate": 6.845471398305086e-06, "loss": 0.8335, "mean_token_accuracy": 0.7932320460677147, "num_tokens": 19166425.0, "step": 23824 }, { "epoch": 6.309851694915254, "grad_norm": 1.9240959882736206, "learning_rate": 6.845206567796611e-06, "loss": 1.2781, "mean_token_accuracy": 0.715439036488533, "num_tokens": 19168182.0, "step": 23826 }, { "epoch": 6.310381355932203, "grad_norm": 2.8353018760681152, "learning_rate": 6.8449417372881365e-06, "loss": 1.3514, "mean_token_accuracy": 0.6958174258470535, "num_tokens": 19169681.0, "step": 23828 }, { "epoch": 6.310911016949152, "grad_norm": 2.6130499839782715, "learning_rate": 6.844676906779661e-06, "loss": 1.2066, "mean_token_accuracy": 0.709141455590725, "num_tokens": 19171197.0, "step": 23830 }, { "epoch": 6.311440677966102, "grad_norm": 2.1148130893707275, "learning_rate": 6.844412076271187e-06, "loss": 1.2288, "mean_token_accuracy": 0.7369840368628502, "num_tokens": 19172909.0, "step": 23832 }, { "epoch": 6.311970338983051, "grad_norm": 2.4421989917755127, "learning_rate": 6.844147245762712e-06, "loss": 1.2487, "mean_token_accuracy": 0.6940703019499779, "num_tokens": 19174749.0, "step": 23834 }, { "epoch": 6.3125, "grad_norm": 2.5345585346221924, "learning_rate": 6.843882415254238e-06, "loss": 1.564, "mean_token_accuracy": 0.6408255845308304, "num_tokens": 19176436.0, "step": 23836 }, { "epoch": 6.313029661016949, "grad_norm": 2.884603261947632, "learning_rate": 6.843617584745763e-06, "loss": 1.1929, "mean_token_accuracy": 0.7216877862811089, "num_tokens": 19177776.0, "step": 23838 }, { "epoch": 6.313559322033898, "grad_norm": 2.505183219909668, "learning_rate": 6.843352754237289e-06, "loss": 1.1675, "mean_token_accuracy": 0.7250670939683914, "num_tokens": 19179361.0, "step": 23840 }, { "epoch": 6.314088983050848, "grad_norm": 3.257084608078003, "learning_rate": 6.8430879237288135e-06, "loss": 1.2794, "mean_token_accuracy": 0.7304497361183167, "num_tokens": 19180970.0, "step": 23842 }, { "epoch": 6.314618644067797, "grad_norm": 2.290759801864624, "learning_rate": 6.84282309322034e-06, "loss": 1.0487, "mean_token_accuracy": 0.733608216047287, "num_tokens": 19183507.0, "step": 23844 }, { "epoch": 6.315148305084746, "grad_norm": 2.1783957481384277, "learning_rate": 6.842558262711865e-06, "loss": 0.9021, "mean_token_accuracy": 0.7815262898802757, "num_tokens": 19184963.0, "step": 23846 }, { "epoch": 6.315677966101695, "grad_norm": 2.612949848175049, "learning_rate": 6.842293432203391e-06, "loss": 1.2524, "mean_token_accuracy": 0.7115747779607773, "num_tokens": 19186630.0, "step": 23848 }, { "epoch": 6.3162076271186445, "grad_norm": 2.587735176086426, "learning_rate": 6.842028601694916e-06, "loss": 1.3934, "mean_token_accuracy": 0.6847713887691498, "num_tokens": 19188088.0, "step": 23850 }, { "epoch": 6.316737288135593, "grad_norm": 2.544294834136963, "learning_rate": 6.8417637711864415e-06, "loss": 1.2804, "mean_token_accuracy": 0.7046748921275139, "num_tokens": 19189360.0, "step": 23852 }, { "epoch": 6.317266949152542, "grad_norm": 2.437927007675171, "learning_rate": 6.8414989406779665e-06, "loss": 1.035, "mean_token_accuracy": 0.7421136870980263, "num_tokens": 19190820.0, "step": 23854 }, { "epoch": 6.317796610169491, "grad_norm": 2.4593474864959717, "learning_rate": 6.841234110169492e-06, "loss": 1.3041, "mean_token_accuracy": 0.710190013051033, "num_tokens": 19192591.0, "step": 23856 }, { "epoch": 6.31832627118644, "grad_norm": 2.2072019577026367, "learning_rate": 6.840969279661017e-06, "loss": 1.4634, "mean_token_accuracy": 0.6619243808090687, "num_tokens": 19194310.0, "step": 23858 }, { "epoch": 6.31885593220339, "grad_norm": 2.1430063247680664, "learning_rate": 6.840704449152543e-06, "loss": 0.7891, "mean_token_accuracy": 0.7812969982624054, "num_tokens": 19195986.0, "step": 23860 }, { "epoch": 6.319385593220339, "grad_norm": 2.2016406059265137, "learning_rate": 6.840439618644068e-06, "loss": 1.1665, "mean_token_accuracy": 0.7182002291083336, "num_tokens": 19197427.0, "step": 23862 }, { "epoch": 6.319915254237288, "grad_norm": 3.85273814201355, "learning_rate": 6.840174788135594e-06, "loss": 1.3492, "mean_token_accuracy": 0.6969024240970612, "num_tokens": 19198932.0, "step": 23864 }, { "epoch": 6.320444915254237, "grad_norm": 2.470958948135376, "learning_rate": 6.839909957627119e-06, "loss": 1.467, "mean_token_accuracy": 0.6761499494314194, "num_tokens": 19200531.0, "step": 23866 }, { "epoch": 6.320974576271187, "grad_norm": 2.339216947555542, "learning_rate": 6.839645127118645e-06, "loss": 1.2969, "mean_token_accuracy": 0.6902319118380547, "num_tokens": 19202017.0, "step": 23868 }, { "epoch": 6.321504237288136, "grad_norm": 2.3134114742279053, "learning_rate": 6.839380296610169e-06, "loss": 1.2364, "mean_token_accuracy": 0.7016304954886436, "num_tokens": 19203704.0, "step": 23870 }, { "epoch": 6.322033898305085, "grad_norm": 2.0183980464935303, "learning_rate": 6.839115466101696e-06, "loss": 0.9421, "mean_token_accuracy": 0.7787271291017532, "num_tokens": 19205127.0, "step": 23872 }, { "epoch": 6.322563559322034, "grad_norm": 2.7476532459259033, "learning_rate": 6.838850635593221e-06, "loss": 1.2782, "mean_token_accuracy": 0.7361317202448845, "num_tokens": 19206722.0, "step": 23874 }, { "epoch": 6.323093220338983, "grad_norm": 2.271026372909546, "learning_rate": 6.838585805084747e-06, "loss": 1.1453, "mean_token_accuracy": 0.7292857393622398, "num_tokens": 19208536.0, "step": 23876 }, { "epoch": 6.3236228813559325, "grad_norm": 2.248952627182007, "learning_rate": 6.8383209745762715e-06, "loss": 1.4207, "mean_token_accuracy": 0.6745515242218971, "num_tokens": 19210277.0, "step": 23878 }, { "epoch": 6.3241525423728815, "grad_norm": 2.6650400161743164, "learning_rate": 6.838056144067797e-06, "loss": 1.6436, "mean_token_accuracy": 0.6532870382070541, "num_tokens": 19211806.0, "step": 23880 }, { "epoch": 6.3246822033898304, "grad_norm": 2.5060231685638428, "learning_rate": 6.837791313559322e-06, "loss": 1.1137, "mean_token_accuracy": 0.7190350890159607, "num_tokens": 19213269.0, "step": 23882 }, { "epoch": 6.325211864406779, "grad_norm": 2.7428226470947266, "learning_rate": 6.837526483050848e-06, "loss": 1.1491, "mean_token_accuracy": 0.7290853038430214, "num_tokens": 19214733.0, "step": 23884 }, { "epoch": 6.325741525423728, "grad_norm": 2.3271007537841797, "learning_rate": 6.837261652542373e-06, "loss": 0.831, "mean_token_accuracy": 0.7923755273222923, "num_tokens": 19216092.0, "step": 23886 }, { "epoch": 6.326271186440678, "grad_norm": 2.3241565227508545, "learning_rate": 6.836996822033899e-06, "loss": 1.0095, "mean_token_accuracy": 0.7692937254905701, "num_tokens": 19217538.0, "step": 23888 }, { "epoch": 6.326800847457627, "grad_norm": 2.6743247509002686, "learning_rate": 6.836731991525424e-06, "loss": 1.247, "mean_token_accuracy": 0.7158843874931335, "num_tokens": 19219286.0, "step": 23890 }, { "epoch": 6.327330508474576, "grad_norm": 2.841346502304077, "learning_rate": 6.8364671610169494e-06, "loss": 1.4245, "mean_token_accuracy": 0.6856581568717957, "num_tokens": 19220771.0, "step": 23892 }, { "epoch": 6.327860169491525, "grad_norm": 2.7799453735351562, "learning_rate": 6.836202330508474e-06, "loss": 1.1662, "mean_token_accuracy": 0.7104859501123428, "num_tokens": 19222232.0, "step": 23894 }, { "epoch": 6.328389830508475, "grad_norm": 2.0259006023406982, "learning_rate": 6.8359375e-06, "loss": 1.185, "mean_token_accuracy": 0.7149421386420727, "num_tokens": 19223852.0, "step": 23896 }, { "epoch": 6.328919491525424, "grad_norm": 2.4300520420074463, "learning_rate": 6.835672669491527e-06, "loss": 1.2347, "mean_token_accuracy": 0.6990278661251068, "num_tokens": 19225429.0, "step": 23898 }, { "epoch": 6.329449152542373, "grad_norm": 2.053077220916748, "learning_rate": 6.835407838983052e-06, "loss": 1.0196, "mean_token_accuracy": 0.7412048801779747, "num_tokens": 19227549.0, "step": 23900 }, { "epoch": 6.329978813559322, "grad_norm": 2.3239076137542725, "learning_rate": 6.8351430084745775e-06, "loss": 0.9307, "mean_token_accuracy": 0.791821613907814, "num_tokens": 19228999.0, "step": 23902 }, { "epoch": 6.330508474576272, "grad_norm": 2.7426977157592773, "learning_rate": 6.834878177966102e-06, "loss": 1.3992, "mean_token_accuracy": 0.6864711344242096, "num_tokens": 19230512.0, "step": 23904 }, { "epoch": 6.331038135593221, "grad_norm": 2.228135108947754, "learning_rate": 6.834613347457628e-06, "loss": 1.2695, "mean_token_accuracy": 0.7065981775522232, "num_tokens": 19232117.0, "step": 23906 }, { "epoch": 6.3315677966101696, "grad_norm": 2.2551870346069336, "learning_rate": 6.834348516949153e-06, "loss": 1.071, "mean_token_accuracy": 0.7181444615125656, "num_tokens": 19233628.0, "step": 23908 }, { "epoch": 6.3320974576271185, "grad_norm": 1.7961450815200806, "learning_rate": 6.834083686440679e-06, "loss": 1.3342, "mean_token_accuracy": 0.6717719286680222, "num_tokens": 19236339.0, "step": 23910 }, { "epoch": 6.3326271186440675, "grad_norm": 2.496400833129883, "learning_rate": 6.833818855932204e-06, "loss": 1.2227, "mean_token_accuracy": 0.715964563190937, "num_tokens": 19237791.0, "step": 23912 }, { "epoch": 6.333156779661017, "grad_norm": 2.3369884490966797, "learning_rate": 6.83355402542373e-06, "loss": 0.8636, "mean_token_accuracy": 0.7758531048893929, "num_tokens": 19239170.0, "step": 23914 }, { "epoch": 6.333686440677966, "grad_norm": 2.199042797088623, "learning_rate": 6.8332891949152545e-06, "loss": 1.1004, "mean_token_accuracy": 0.7226370722055435, "num_tokens": 19240860.0, "step": 23916 }, { "epoch": 6.334216101694915, "grad_norm": 2.1100704669952393, "learning_rate": 6.83302436440678e-06, "loss": 0.9999, "mean_token_accuracy": 0.7666072621941566, "num_tokens": 19242366.0, "step": 23918 }, { "epoch": 6.334745762711864, "grad_norm": 2.0280537605285645, "learning_rate": 6.832759533898305e-06, "loss": 0.8702, "mean_token_accuracy": 0.7729501351714134, "num_tokens": 19243966.0, "step": 23920 }, { "epoch": 6.335275423728813, "grad_norm": 2.391446828842163, "learning_rate": 6.832494703389832e-06, "loss": 1.6241, "mean_token_accuracy": 0.6479999125003815, "num_tokens": 19245497.0, "step": 23922 }, { "epoch": 6.335805084745763, "grad_norm": 2.1402440071105957, "learning_rate": 6.832229872881356e-06, "loss": 1.0652, "mean_token_accuracy": 0.7413925305008888, "num_tokens": 19247019.0, "step": 23924 }, { "epoch": 6.336334745762712, "grad_norm": 2.639599561691284, "learning_rate": 6.8319650423728826e-06, "loss": 1.4635, "mean_token_accuracy": 0.6618792191147804, "num_tokens": 19248891.0, "step": 23926 }, { "epoch": 6.336864406779661, "grad_norm": 2.162679672241211, "learning_rate": 6.8317002118644075e-06, "loss": 1.2011, "mean_token_accuracy": 0.7245667725801468, "num_tokens": 19250570.0, "step": 23928 }, { "epoch": 6.33739406779661, "grad_norm": 1.9150792360305786, "learning_rate": 6.831435381355933e-06, "loss": 1.0483, "mean_token_accuracy": 0.7371384799480438, "num_tokens": 19252225.0, "step": 23930 }, { "epoch": 6.33792372881356, "grad_norm": 2.673177480697632, "learning_rate": 6.831170550847458e-06, "loss": 1.3028, "mean_token_accuracy": 0.7021537199616432, "num_tokens": 19253679.0, "step": 23932 }, { "epoch": 6.338453389830509, "grad_norm": 1.9470925331115723, "learning_rate": 6.830905720338984e-06, "loss": 0.7937, "mean_token_accuracy": 0.7934956029057503, "num_tokens": 19256008.0, "step": 23934 }, { "epoch": 6.338983050847458, "grad_norm": 2.6173095703125, "learning_rate": 6.830640889830509e-06, "loss": 1.2897, "mean_token_accuracy": 0.7394687905907631, "num_tokens": 19257615.0, "step": 23936 }, { "epoch": 6.339512711864407, "grad_norm": 1.9458330869674683, "learning_rate": 6.830376059322035e-06, "loss": 0.9644, "mean_token_accuracy": 0.7789257764816284, "num_tokens": 19258970.0, "step": 23938 }, { "epoch": 6.3400423728813555, "grad_norm": 2.52730655670166, "learning_rate": 6.83011122881356e-06, "loss": 1.0981, "mean_token_accuracy": 0.7597807124257088, "num_tokens": 19260612.0, "step": 23940 }, { "epoch": 6.340572033898305, "grad_norm": 2.3792240619659424, "learning_rate": 6.829846398305085e-06, "loss": 1.0073, "mean_token_accuracy": 0.7481430396437645, "num_tokens": 19262130.0, "step": 23942 }, { "epoch": 6.341101694915254, "grad_norm": 2.002715826034546, "learning_rate": 6.82958156779661e-06, "loss": 1.1532, "mean_token_accuracy": 0.724163107573986, "num_tokens": 19263633.0, "step": 23944 }, { "epoch": 6.341631355932203, "grad_norm": 2.221600294113159, "learning_rate": 6.829316737288136e-06, "loss": 1.0588, "mean_token_accuracy": 0.7245604768395424, "num_tokens": 19265562.0, "step": 23946 }, { "epoch": 6.342161016949152, "grad_norm": 2.5923852920532227, "learning_rate": 6.829051906779661e-06, "loss": 1.3474, "mean_token_accuracy": 0.7056875228881836, "num_tokens": 19267059.0, "step": 23948 }, { "epoch": 6.342690677966102, "grad_norm": 2.023836135864258, "learning_rate": 6.828787076271187e-06, "loss": 1.069, "mean_token_accuracy": 0.7430330142378807, "num_tokens": 19268549.0, "step": 23950 }, { "epoch": 6.343220338983051, "grad_norm": 2.3417410850524902, "learning_rate": 6.828522245762712e-06, "loss": 1.1633, "mean_token_accuracy": 0.7144090309739113, "num_tokens": 19270145.0, "step": 23952 }, { "epoch": 6.34375, "grad_norm": 2.03171968460083, "learning_rate": 6.828257415254238e-06, "loss": 0.8087, "mean_token_accuracy": 0.7965403124690056, "num_tokens": 19271785.0, "step": 23954 }, { "epoch": 6.344279661016949, "grad_norm": 2.537386178970337, "learning_rate": 6.827992584745763e-06, "loss": 0.8377, "mean_token_accuracy": 0.7964235246181488, "num_tokens": 19273353.0, "step": 23956 }, { "epoch": 6.344809322033898, "grad_norm": 2.444471597671509, "learning_rate": 6.827727754237289e-06, "loss": 1.077, "mean_token_accuracy": 0.7387725412845612, "num_tokens": 19275050.0, "step": 23958 }, { "epoch": 6.345338983050848, "grad_norm": 3.507828950881958, "learning_rate": 6.827462923728814e-06, "loss": 1.7161, "mean_token_accuracy": 0.6169915497303009, "num_tokens": 19276375.0, "step": 23960 }, { "epoch": 6.345868644067797, "grad_norm": 2.305968761444092, "learning_rate": 6.82719809322034e-06, "loss": 1.3258, "mean_token_accuracy": 0.7081360444426537, "num_tokens": 19278069.0, "step": 23962 }, { "epoch": 6.346398305084746, "grad_norm": 2.1247737407684326, "learning_rate": 6.826933262711865e-06, "loss": 0.8491, "mean_token_accuracy": 0.762751467525959, "num_tokens": 19279583.0, "step": 23964 }, { "epoch": 6.346927966101695, "grad_norm": 2.462477922439575, "learning_rate": 6.8266684322033905e-06, "loss": 1.354, "mean_token_accuracy": 0.6897262707352638, "num_tokens": 19281296.0, "step": 23966 }, { "epoch": 6.3474576271186445, "grad_norm": 2.7249081134796143, "learning_rate": 6.826403601694915e-06, "loss": 1.4447, "mean_token_accuracy": 0.6845387741923332, "num_tokens": 19282839.0, "step": 23968 }, { "epoch": 6.347987288135593, "grad_norm": 2.1883866786956787, "learning_rate": 6.826138771186441e-06, "loss": 1.2364, "mean_token_accuracy": 0.7110615596175194, "num_tokens": 19284243.0, "step": 23970 }, { "epoch": 6.348516949152542, "grad_norm": 2.179044008255005, "learning_rate": 6.825873940677966e-06, "loss": 1.2656, "mean_token_accuracy": 0.6949782371520996, "num_tokens": 19286015.0, "step": 23972 }, { "epoch": 6.349046610169491, "grad_norm": 1.9127923250198364, "learning_rate": 6.825609110169492e-06, "loss": 1.2135, "mean_token_accuracy": 0.7293568029999733, "num_tokens": 19287514.0, "step": 23974 }, { "epoch": 6.34957627118644, "grad_norm": 2.736072063446045, "learning_rate": 6.825344279661017e-06, "loss": 1.2737, "mean_token_accuracy": 0.7390633672475815, "num_tokens": 19288943.0, "step": 23976 }, { "epoch": 6.35010593220339, "grad_norm": 2.037151575088501, "learning_rate": 6.8250794491525426e-06, "loss": 1.2041, "mean_token_accuracy": 0.7338463142514229, "num_tokens": 19290670.0, "step": 23978 }, { "epoch": 6.350635593220339, "grad_norm": 2.474705219268799, "learning_rate": 6.8248146186440675e-06, "loss": 1.2316, "mean_token_accuracy": 0.7024430483579636, "num_tokens": 19292293.0, "step": 23980 }, { "epoch": 6.351165254237288, "grad_norm": 1.9310715198516846, "learning_rate": 6.824549788135594e-06, "loss": 1.0753, "mean_token_accuracy": 0.7526706382632256, "num_tokens": 19294095.0, "step": 23982 }, { "epoch": 6.351694915254237, "grad_norm": 2.828136444091797, "learning_rate": 6.82428495762712e-06, "loss": 1.361, "mean_token_accuracy": 0.6940189301967621, "num_tokens": 19295687.0, "step": 23984 }, { "epoch": 6.352224576271187, "grad_norm": 2.0581822395324707, "learning_rate": 6.824020127118645e-06, "loss": 0.992, "mean_token_accuracy": 0.7692995667457581, "num_tokens": 19297306.0, "step": 23986 }, { "epoch": 6.352754237288136, "grad_norm": 2.4979724884033203, "learning_rate": 6.823755296610171e-06, "loss": 1.1249, "mean_token_accuracy": 0.7408038824796677, "num_tokens": 19298946.0, "step": 23988 }, { "epoch": 6.353283898305085, "grad_norm": 2.1202871799468994, "learning_rate": 6.8234904661016955e-06, "loss": 0.9424, "mean_token_accuracy": 0.7623268961906433, "num_tokens": 19300521.0, "step": 23990 }, { "epoch": 6.353813559322034, "grad_norm": 2.2435240745544434, "learning_rate": 6.823225635593221e-06, "loss": 0.8493, "mean_token_accuracy": 0.792126439511776, "num_tokens": 19302186.0, "step": 23992 }, { "epoch": 6.354343220338983, "grad_norm": 1.5970083475112915, "learning_rate": 6.822960805084746e-06, "loss": 1.052, "mean_token_accuracy": 0.7644711807370186, "num_tokens": 19303838.0, "step": 23994 }, { "epoch": 6.3548728813559325, "grad_norm": 2.417970895767212, "learning_rate": 6.822695974576272e-06, "loss": 1.2051, "mean_token_accuracy": 0.7133734971284866, "num_tokens": 19305650.0, "step": 23996 }, { "epoch": 6.3554025423728815, "grad_norm": 3.057626485824585, "learning_rate": 6.822431144067797e-06, "loss": 1.6384, "mean_token_accuracy": 0.6212326362729073, "num_tokens": 19307007.0, "step": 23998 }, { "epoch": 6.3559322033898304, "grad_norm": 1.9804754257202148, "learning_rate": 6.822166313559323e-06, "loss": 0.7867, "step": 24000 }, { "epoch": 6.3559322033898304, "eval_loss": 1.331301212310791, "eval_mean_token_accuracy": 0.7006934188403092, "eval_num_tokens": 19308825.0, "eval_runtime": 48.7199, "eval_samples_per_second": 6.322, "eval_steps_per_second": 6.322, "step": 24000 }, { "epoch": 6.356461864406779, "grad_norm": 2.2695693969726562, "learning_rate": 6.821901483050848e-06, "loss": 1.2341, "mean_token_accuracy": 0.7434728257358074, "num_tokens": 19310423.0, "step": 24002 }, { "epoch": 6.356991525423728, "grad_norm": 2.648824453353882, "learning_rate": 6.821636652542373e-06, "loss": 1.339, "mean_token_accuracy": 0.695980079472065, "num_tokens": 19312362.0, "step": 24004 }, { "epoch": 6.357521186440678, "grad_norm": 2.293031692504883, "learning_rate": 6.821371822033898e-06, "loss": 1.0313, "mean_token_accuracy": 0.7326461374759674, "num_tokens": 19313958.0, "step": 24006 }, { "epoch": 6.358050847457627, "grad_norm": 2.379838228225708, "learning_rate": 6.821106991525425e-06, "loss": 1.5182, "mean_token_accuracy": 0.6899387314915657, "num_tokens": 19315886.0, "step": 24008 }, { "epoch": 6.358580508474576, "grad_norm": 2.2080273628234863, "learning_rate": 6.82084216101695e-06, "loss": 1.0393, "mean_token_accuracy": 0.7293242141604424, "num_tokens": 19317327.0, "step": 24010 }, { "epoch": 6.359110169491525, "grad_norm": 1.935951828956604, "learning_rate": 6.820577330508476e-06, "loss": 0.8698, "mean_token_accuracy": 0.7854280844330788, "num_tokens": 19319073.0, "step": 24012 }, { "epoch": 6.359639830508475, "grad_norm": 2.264065742492676, "learning_rate": 6.820312500000001e-06, "loss": 1.1122, "mean_token_accuracy": 0.712089478969574, "num_tokens": 19320647.0, "step": 24014 }, { "epoch": 6.360169491525424, "grad_norm": 2.8165957927703857, "learning_rate": 6.820047669491526e-06, "loss": 1.4363, "mean_token_accuracy": 0.6736704781651497, "num_tokens": 19322137.0, "step": 24016 }, { "epoch": 6.360699152542373, "grad_norm": 2.2950692176818848, "learning_rate": 6.819782838983051e-06, "loss": 1.3469, "mean_token_accuracy": 0.6887073069810867, "num_tokens": 19324082.0, "step": 24018 }, { "epoch": 6.361228813559322, "grad_norm": 2.779921531677246, "learning_rate": 6.819518008474577e-06, "loss": 1.4141, "mean_token_accuracy": 0.6919577568769455, "num_tokens": 19325661.0, "step": 24020 }, { "epoch": 6.361758474576272, "grad_norm": 2.809131383895874, "learning_rate": 6.819253177966102e-06, "loss": 0.7571, "mean_token_accuracy": 0.8072427064180374, "num_tokens": 19327347.0, "step": 24022 }, { "epoch": 6.362288135593221, "grad_norm": 2.360050916671753, "learning_rate": 6.818988347457628e-06, "loss": 1.05, "mean_token_accuracy": 0.7440440133213997, "num_tokens": 19328725.0, "step": 24024 }, { "epoch": 6.3628177966101696, "grad_norm": 2.1777257919311523, "learning_rate": 6.818723516949153e-06, "loss": 1.0452, "mean_token_accuracy": 0.7573853507637978, "num_tokens": 19330185.0, "step": 24026 }, { "epoch": 6.3633474576271185, "grad_norm": 1.9263439178466797, "learning_rate": 6.8184586864406785e-06, "loss": 1.1084, "mean_token_accuracy": 0.7455243915319443, "num_tokens": 19331475.0, "step": 24028 }, { "epoch": 6.3638771186440675, "grad_norm": 2.481797456741333, "learning_rate": 6.8181938559322034e-06, "loss": 1.0653, "mean_token_accuracy": 0.7733855023980141, "num_tokens": 19332769.0, "step": 24030 }, { "epoch": 6.364406779661017, "grad_norm": 2.638730049133301, "learning_rate": 6.817929025423729e-06, "loss": 1.5221, "mean_token_accuracy": 0.6814552545547485, "num_tokens": 19334296.0, "step": 24032 }, { "epoch": 6.364936440677966, "grad_norm": 1.9497920274734497, "learning_rate": 6.817664194915254e-06, "loss": 1.3623, "mean_token_accuracy": 0.6911292597651482, "num_tokens": 19335839.0, "step": 24034 }, { "epoch": 6.365466101694915, "grad_norm": 2.5643036365509033, "learning_rate": 6.817399364406781e-06, "loss": 0.9659, "mean_token_accuracy": 0.773029588162899, "num_tokens": 19338134.0, "step": 24036 }, { "epoch": 6.365995762711864, "grad_norm": 1.7166444063186646, "learning_rate": 6.817134533898305e-06, "loss": 0.8158, "mean_token_accuracy": 0.7839746624231339, "num_tokens": 19339846.0, "step": 24038 }, { "epoch": 6.366525423728813, "grad_norm": 2.1606006622314453, "learning_rate": 6.8168697033898315e-06, "loss": 0.9516, "mean_token_accuracy": 0.7548790499567986, "num_tokens": 19341421.0, "step": 24040 }, { "epoch": 6.367055084745763, "grad_norm": 2.76118540763855, "learning_rate": 6.816604872881356e-06, "loss": 1.4434, "mean_token_accuracy": 0.6804265379905701, "num_tokens": 19342733.0, "step": 24042 }, { "epoch": 6.367584745762712, "grad_norm": 2.426217555999756, "learning_rate": 6.816340042372882e-06, "loss": 0.9742, "mean_token_accuracy": 0.759783647954464, "num_tokens": 19344241.0, "step": 24044 }, { "epoch": 6.368114406779661, "grad_norm": 2.790570020675659, "learning_rate": 6.816075211864407e-06, "loss": 1.6388, "mean_token_accuracy": 0.6425099745392799, "num_tokens": 19345797.0, "step": 24046 }, { "epoch": 6.36864406779661, "grad_norm": 2.4956984519958496, "learning_rate": 6.815810381355933e-06, "loss": 1.1579, "mean_token_accuracy": 0.7200310900807381, "num_tokens": 19347117.0, "step": 24048 }, { "epoch": 6.36917372881356, "grad_norm": 2.552222967147827, "learning_rate": 6.815545550847458e-06, "loss": 1.5857, "mean_token_accuracy": 0.6297173127532005, "num_tokens": 19348742.0, "step": 24050 }, { "epoch": 6.369703389830509, "grad_norm": 2.1462454795837402, "learning_rate": 6.815280720338984e-06, "loss": 0.7224, "mean_token_accuracy": 0.8068202808499336, "num_tokens": 19350373.0, "step": 24052 }, { "epoch": 6.370233050847458, "grad_norm": 2.750746726989746, "learning_rate": 6.8150158898305085e-06, "loss": 1.1215, "mean_token_accuracy": 0.7335666567087173, "num_tokens": 19351828.0, "step": 24054 }, { "epoch": 6.370762711864407, "grad_norm": 2.6883914470672607, "learning_rate": 6.814751059322034e-06, "loss": 1.6318, "mean_token_accuracy": 0.6395324617624283, "num_tokens": 19353648.0, "step": 24056 }, { "epoch": 6.3712923728813555, "grad_norm": 2.8710052967071533, "learning_rate": 6.814486228813559e-06, "loss": 1.4078, "mean_token_accuracy": 0.6945989355444908, "num_tokens": 19355112.0, "step": 24058 }, { "epoch": 6.371822033898305, "grad_norm": 2.16267466545105, "learning_rate": 6.814221398305085e-06, "loss": 1.1228, "mean_token_accuracy": 0.7163945138454437, "num_tokens": 19356623.0, "step": 24060 }, { "epoch": 6.372351694915254, "grad_norm": 2.4920096397399902, "learning_rate": 6.81395656779661e-06, "loss": 1.6626, "mean_token_accuracy": 0.6764540374279022, "num_tokens": 19358968.0, "step": 24062 }, { "epoch": 6.372881355932203, "grad_norm": 2.8311665058135986, "learning_rate": 6.8136917372881365e-06, "loss": 1.2302, "mean_token_accuracy": 0.688496395945549, "num_tokens": 19361189.0, "step": 24064 }, { "epoch": 6.373411016949152, "grad_norm": 2.0502781867980957, "learning_rate": 6.813426906779662e-06, "loss": 1.0467, "mean_token_accuracy": 0.7509712055325508, "num_tokens": 19362990.0, "step": 24066 }, { "epoch": 6.373940677966102, "grad_norm": 2.4823224544525146, "learning_rate": 6.813162076271187e-06, "loss": 1.674, "mean_token_accuracy": 0.611443042755127, "num_tokens": 19364543.0, "step": 24068 }, { "epoch": 6.374470338983051, "grad_norm": 2.5212507247924805, "learning_rate": 6.812897245762713e-06, "loss": 1.3295, "mean_token_accuracy": 0.6807988062500954, "num_tokens": 19366114.0, "step": 24070 }, { "epoch": 6.375, "grad_norm": 3.079073190689087, "learning_rate": 6.812632415254238e-06, "loss": 1.4761, "mean_token_accuracy": 0.6484077200293541, "num_tokens": 19367467.0, "step": 24072 }, { "epoch": 6.375529661016949, "grad_norm": 2.9075236320495605, "learning_rate": 6.812367584745764e-06, "loss": 1.4729, "mean_token_accuracy": 0.6802223026752472, "num_tokens": 19368850.0, "step": 24074 }, { "epoch": 6.376059322033898, "grad_norm": 2.103794813156128, "learning_rate": 6.812102754237289e-06, "loss": 1.2988, "mean_token_accuracy": 0.7147214412689209, "num_tokens": 19370525.0, "step": 24076 }, { "epoch": 6.376588983050848, "grad_norm": 2.4921860694885254, "learning_rate": 6.8118379237288144e-06, "loss": 1.4708, "mean_token_accuracy": 0.6673570275306702, "num_tokens": 19372450.0, "step": 24078 }, { "epoch": 6.377118644067797, "grad_norm": 1.9397820234298706, "learning_rate": 6.811573093220339e-06, "loss": 1.0107, "mean_token_accuracy": 0.7612814679741859, "num_tokens": 19373933.0, "step": 24080 }, { "epoch": 6.377648305084746, "grad_norm": 2.7058496475219727, "learning_rate": 6.811308262711865e-06, "loss": 1.4076, "mean_token_accuracy": 0.6844381392002106, "num_tokens": 19375461.0, "step": 24082 }, { "epoch": 6.378177966101695, "grad_norm": 2.2296218872070312, "learning_rate": 6.81104343220339e-06, "loss": 1.0965, "mean_token_accuracy": 0.7507707923650742, "num_tokens": 19376956.0, "step": 24084 }, { "epoch": 6.3787076271186445, "grad_norm": 2.48540997505188, "learning_rate": 6.810778601694916e-06, "loss": 1.4445, "mean_token_accuracy": 0.6858051866292953, "num_tokens": 19378413.0, "step": 24086 }, { "epoch": 6.379237288135593, "grad_norm": 2.987266778945923, "learning_rate": 6.810513771186441e-06, "loss": 1.7683, "mean_token_accuracy": 0.6388645768165588, "num_tokens": 19379929.0, "step": 24088 }, { "epoch": 6.379766949152542, "grad_norm": 2.4166386127471924, "learning_rate": 6.810248940677967e-06, "loss": 1.0623, "mean_token_accuracy": 0.7372709736227989, "num_tokens": 19381529.0, "step": 24090 }, { "epoch": 6.380296610169491, "grad_norm": 2.7623727321624756, "learning_rate": 6.8099841101694915e-06, "loss": 1.1721, "mean_token_accuracy": 0.7230429872870445, "num_tokens": 19383270.0, "step": 24092 }, { "epoch": 6.38082627118644, "grad_norm": 2.6309621334075928, "learning_rate": 6.809719279661018e-06, "loss": 1.2095, "mean_token_accuracy": 0.7055291682481766, "num_tokens": 19384739.0, "step": 24094 }, { "epoch": 6.38135593220339, "grad_norm": 2.1046533584594727, "learning_rate": 6.809454449152543e-06, "loss": 1.2605, "mean_token_accuracy": 0.704755499958992, "num_tokens": 19386281.0, "step": 24096 }, { "epoch": 6.381885593220339, "grad_norm": 2.587672472000122, "learning_rate": 6.809189618644069e-06, "loss": 1.4974, "mean_token_accuracy": 0.6688294112682343, "num_tokens": 19387855.0, "step": 24098 }, { "epoch": 6.382415254237288, "grad_norm": 2.270176649093628, "learning_rate": 6.808924788135594e-06, "loss": 1.263, "mean_token_accuracy": 0.7191416993737221, "num_tokens": 19389637.0, "step": 24100 }, { "epoch": 6.382944915254237, "grad_norm": 2.0957376956939697, "learning_rate": 6.8086599576271195e-06, "loss": 1.2577, "mean_token_accuracy": 0.7110297158360481, "num_tokens": 19391140.0, "step": 24102 }, { "epoch": 6.383474576271187, "grad_norm": 2.502326726913452, "learning_rate": 6.8083951271186444e-06, "loss": 1.6252, "mean_token_accuracy": 0.6329575553536415, "num_tokens": 19392917.0, "step": 24104 }, { "epoch": 6.384004237288136, "grad_norm": 2.8005447387695312, "learning_rate": 6.80813029661017e-06, "loss": 1.5398, "mean_token_accuracy": 0.6811496615409851, "num_tokens": 19394508.0, "step": 24106 }, { "epoch": 6.384533898305085, "grad_norm": 2.2072646617889404, "learning_rate": 6.807865466101695e-06, "loss": 0.9699, "mean_token_accuracy": 0.7572073340415955, "num_tokens": 19395927.0, "step": 24108 }, { "epoch": 6.385063559322034, "grad_norm": 2.828026294708252, "learning_rate": 6.807600635593221e-06, "loss": 1.3512, "mean_token_accuracy": 0.730609841644764, "num_tokens": 19397279.0, "step": 24110 }, { "epoch": 6.385593220338983, "grad_norm": 2.4397242069244385, "learning_rate": 6.807335805084746e-06, "loss": 1.3223, "mean_token_accuracy": 0.696414440870285, "num_tokens": 19398747.0, "step": 24112 }, { "epoch": 6.3861228813559325, "grad_norm": 2.2512056827545166, "learning_rate": 6.807070974576272e-06, "loss": 1.1328, "mean_token_accuracy": 0.7210691422224045, "num_tokens": 19400276.0, "step": 24114 }, { "epoch": 6.3866525423728815, "grad_norm": 2.4315052032470703, "learning_rate": 6.8068061440677966e-06, "loss": 1.1704, "mean_token_accuracy": 0.7183967754244804, "num_tokens": 19402238.0, "step": 24116 }, { "epoch": 6.3871822033898304, "grad_norm": 2.174100399017334, "learning_rate": 6.806541313559323e-06, "loss": 1.3073, "mean_token_accuracy": 0.6986727342009544, "num_tokens": 19403768.0, "step": 24118 }, { "epoch": 6.387711864406779, "grad_norm": 2.450990915298462, "learning_rate": 6.806276483050847e-06, "loss": 1.2149, "mean_token_accuracy": 0.7178156599402428, "num_tokens": 19405184.0, "step": 24120 }, { "epoch": 6.388241525423728, "grad_norm": 2.667755603790283, "learning_rate": 6.806011652542374e-06, "loss": 1.5165, "mean_token_accuracy": 0.6822083853185177, "num_tokens": 19406785.0, "step": 24122 }, { "epoch": 6.388771186440678, "grad_norm": 2.1435976028442383, "learning_rate": 6.805746822033899e-06, "loss": 1.2851, "mean_token_accuracy": 0.7216407470405102, "num_tokens": 19408330.0, "step": 24124 }, { "epoch": 6.389300847457627, "grad_norm": 2.1039414405822754, "learning_rate": 6.805481991525425e-06, "loss": 1.2988, "mean_token_accuracy": 0.7393625304102898, "num_tokens": 19410819.0, "step": 24126 }, { "epoch": 6.389830508474576, "grad_norm": 2.893047332763672, "learning_rate": 6.8052171610169495e-06, "loss": 1.1073, "mean_token_accuracy": 0.7366263195872307, "num_tokens": 19412162.0, "step": 24128 }, { "epoch": 6.390360169491525, "grad_norm": 2.9043562412261963, "learning_rate": 6.804952330508475e-06, "loss": 1.1082, "mean_token_accuracy": 0.7540041729807854, "num_tokens": 19413713.0, "step": 24130 }, { "epoch": 6.390889830508475, "grad_norm": 2.2709007263183594, "learning_rate": 6.8046875e-06, "loss": 0.8457, "mean_token_accuracy": 0.7827667817473412, "num_tokens": 19415007.0, "step": 24132 }, { "epoch": 6.391419491525424, "grad_norm": 2.3184432983398438, "learning_rate": 6.804422669491526e-06, "loss": 1.1609, "mean_token_accuracy": 0.7332914769649506, "num_tokens": 19416774.0, "step": 24134 }, { "epoch": 6.391949152542373, "grad_norm": 2.289686918258667, "learning_rate": 6.804157838983051e-06, "loss": 1.2044, "mean_token_accuracy": 0.725551001727581, "num_tokens": 19418511.0, "step": 24136 }, { "epoch": 6.392478813559322, "grad_norm": 2.9647865295410156, "learning_rate": 6.803893008474577e-06, "loss": 1.0342, "mean_token_accuracy": 0.7712633833289146, "num_tokens": 19419955.0, "step": 24138 }, { "epoch": 6.393008474576272, "grad_norm": 2.1588809490203857, "learning_rate": 6.803628177966102e-06, "loss": 1.1237, "mean_token_accuracy": 0.749637059867382, "num_tokens": 19421718.0, "step": 24140 }, { "epoch": 6.393538135593221, "grad_norm": 2.3759589195251465, "learning_rate": 6.803363347457627e-06, "loss": 1.5667, "mean_token_accuracy": 0.6686982475221157, "num_tokens": 19423092.0, "step": 24142 }, { "epoch": 6.3940677966101696, "grad_norm": 2.5411813259124756, "learning_rate": 6.803098516949152e-06, "loss": 1.4666, "mean_token_accuracy": 0.6495256274938583, "num_tokens": 19424741.0, "step": 24144 }, { "epoch": 6.3945974576271185, "grad_norm": 2.4355716705322266, "learning_rate": 6.802833686440678e-06, "loss": 1.3245, "mean_token_accuracy": 0.6910126507282257, "num_tokens": 19426291.0, "step": 24146 }, { "epoch": 6.3951271186440675, "grad_norm": 2.61802077293396, "learning_rate": 6.802568855932203e-06, "loss": 0.869, "mean_token_accuracy": 0.7542671039700508, "num_tokens": 19428063.0, "step": 24148 }, { "epoch": 6.395656779661017, "grad_norm": 2.9301929473876953, "learning_rate": 6.80230402542373e-06, "loss": 1.4068, "mean_token_accuracy": 0.6813434660434723, "num_tokens": 19429553.0, "step": 24150 }, { "epoch": 6.396186440677966, "grad_norm": 2.557217836380005, "learning_rate": 6.8020391949152555e-06, "loss": 0.9994, "mean_token_accuracy": 0.7411945462226868, "num_tokens": 19430957.0, "step": 24152 }, { "epoch": 6.396716101694915, "grad_norm": 2.2234957218170166, "learning_rate": 6.80177436440678e-06, "loss": 1.2607, "mean_token_accuracy": 0.7065361812710762, "num_tokens": 19432716.0, "step": 24154 }, { "epoch": 6.397245762711864, "grad_norm": 2.606738805770874, "learning_rate": 6.801509533898306e-06, "loss": 1.6215, "mean_token_accuracy": 0.6378103271126747, "num_tokens": 19434307.0, "step": 24156 }, { "epoch": 6.397775423728813, "grad_norm": 2.6577625274658203, "learning_rate": 6.801244703389831e-06, "loss": 1.038, "mean_token_accuracy": 0.7620697617530823, "num_tokens": 19435720.0, "step": 24158 }, { "epoch": 6.398305084745763, "grad_norm": 2.710766077041626, "learning_rate": 6.800979872881357e-06, "loss": 1.1651, "mean_token_accuracy": 0.7342714294791222, "num_tokens": 19437018.0, "step": 24160 }, { "epoch": 6.398834745762712, "grad_norm": 2.2546956539154053, "learning_rate": 6.800715042372882e-06, "loss": 1.0719, "mean_token_accuracy": 0.743299089372158, "num_tokens": 19438852.0, "step": 24162 }, { "epoch": 6.399364406779661, "grad_norm": 2.15634822845459, "learning_rate": 6.8004502118644076e-06, "loss": 1.2832, "mean_token_accuracy": 0.7077953889966011, "num_tokens": 19440279.0, "step": 24164 }, { "epoch": 6.39989406779661, "grad_norm": 3.0766873359680176, "learning_rate": 6.8001853813559325e-06, "loss": 1.5232, "mean_token_accuracy": 0.6749706491827965, "num_tokens": 19441767.0, "step": 24166 }, { "epoch": 6.40042372881356, "grad_norm": 2.6933529376983643, "learning_rate": 6.799920550847458e-06, "loss": 1.4068, "mean_token_accuracy": 0.6876146793365479, "num_tokens": 19443321.0, "step": 24168 }, { "epoch": 6.400953389830509, "grad_norm": 2.6033570766448975, "learning_rate": 6.799655720338983e-06, "loss": 0.869, "mean_token_accuracy": 0.778341494500637, "num_tokens": 19444768.0, "step": 24170 }, { "epoch": 6.401483050847458, "grad_norm": 2.369337558746338, "learning_rate": 6.79939088983051e-06, "loss": 1.4706, "mean_token_accuracy": 0.6923549920320511, "num_tokens": 19446397.0, "step": 24172 }, { "epoch": 6.402012711864407, "grad_norm": 2.7429089546203613, "learning_rate": 6.799126059322034e-06, "loss": 1.3294, "mean_token_accuracy": 0.7051246985793114, "num_tokens": 19448104.0, "step": 24174 }, { "epoch": 6.4025423728813555, "grad_norm": 2.8380849361419678, "learning_rate": 6.7988612288135605e-06, "loss": 0.9525, "mean_token_accuracy": 0.7775925695896149, "num_tokens": 19449621.0, "step": 24176 }, { "epoch": 6.403072033898305, "grad_norm": 2.7828898429870605, "learning_rate": 6.7985963983050855e-06, "loss": 1.2834, "mean_token_accuracy": 0.7001970633864403, "num_tokens": 19450793.0, "step": 24178 }, { "epoch": 6.403601694915254, "grad_norm": 2.309509515762329, "learning_rate": 6.798331567796611e-06, "loss": 1.4587, "mean_token_accuracy": 0.6585867330431938, "num_tokens": 19452719.0, "step": 24180 }, { "epoch": 6.404131355932203, "grad_norm": 2.767503261566162, "learning_rate": 6.798066737288136e-06, "loss": 1.2697, "mean_token_accuracy": 0.7082633599638939, "num_tokens": 19454113.0, "step": 24182 }, { "epoch": 6.404661016949152, "grad_norm": 2.7617597579956055, "learning_rate": 6.797801906779662e-06, "loss": 1.1219, "mean_token_accuracy": 0.7237064242362976, "num_tokens": 19455789.0, "step": 24184 }, { "epoch": 6.405190677966102, "grad_norm": 2.2943735122680664, "learning_rate": 6.797537076271187e-06, "loss": 1.1086, "mean_token_accuracy": 0.7398715727031231, "num_tokens": 19457341.0, "step": 24186 }, { "epoch": 6.405720338983051, "grad_norm": 2.762478828430176, "learning_rate": 6.797272245762713e-06, "loss": 1.3695, "mean_token_accuracy": 0.6857242360711098, "num_tokens": 19458836.0, "step": 24188 }, { "epoch": 6.40625, "grad_norm": 2.7513058185577393, "learning_rate": 6.7970074152542376e-06, "loss": 1.2125, "mean_token_accuracy": 0.7201855033636093, "num_tokens": 19460433.0, "step": 24190 }, { "epoch": 6.406779661016949, "grad_norm": 2.480607271194458, "learning_rate": 6.796742584745763e-06, "loss": 0.8329, "mean_token_accuracy": 0.7715223357081413, "num_tokens": 19461728.0, "step": 24192 }, { "epoch": 6.407309322033898, "grad_norm": 2.700357675552368, "learning_rate": 6.796477754237288e-06, "loss": 1.2475, "mean_token_accuracy": 0.6939724385738373, "num_tokens": 19463418.0, "step": 24194 }, { "epoch": 6.407838983050848, "grad_norm": 2.148740530014038, "learning_rate": 6.796212923728814e-06, "loss": 0.7901, "mean_token_accuracy": 0.7899703234434128, "num_tokens": 19464988.0, "step": 24196 }, { "epoch": 6.408368644067797, "grad_norm": 2.9855797290802, "learning_rate": 6.795948093220339e-06, "loss": 1.5117, "mean_token_accuracy": 0.679527647793293, "num_tokens": 19466517.0, "step": 24198 }, { "epoch": 6.408898305084746, "grad_norm": 2.266998052597046, "learning_rate": 6.795683262711865e-06, "loss": 1.0093, "mean_token_accuracy": 0.7748913541436195, "num_tokens": 19468117.0, "step": 24200 }, { "epoch": 6.409427966101695, "grad_norm": 2.1583175659179688, "learning_rate": 6.79541843220339e-06, "loss": 1.288, "mean_token_accuracy": 0.6990962401032448, "num_tokens": 19469796.0, "step": 24202 }, { "epoch": 6.4099576271186445, "grad_norm": 2.440063238143921, "learning_rate": 6.795153601694916e-06, "loss": 1.5285, "mean_token_accuracy": 0.646827220916748, "num_tokens": 19471308.0, "step": 24204 }, { "epoch": 6.410487288135593, "grad_norm": 2.016038417816162, "learning_rate": 6.794888771186441e-06, "loss": 1.0562, "mean_token_accuracy": 0.7469303235411644, "num_tokens": 19473229.0, "step": 24206 }, { "epoch": 6.411016949152542, "grad_norm": 2.471726894378662, "learning_rate": 6.794623940677967e-06, "loss": 1.4139, "mean_token_accuracy": 0.6800167039036751, "num_tokens": 19474809.0, "step": 24208 }, { "epoch": 6.411546610169491, "grad_norm": 2.2322943210601807, "learning_rate": 6.794359110169492e-06, "loss": 1.2382, "mean_token_accuracy": 0.7002443075180054, "num_tokens": 19476455.0, "step": 24210 }, { "epoch": 6.41207627118644, "grad_norm": 2.6430933475494385, "learning_rate": 6.794094279661018e-06, "loss": 0.9422, "mean_token_accuracy": 0.7502169385552406, "num_tokens": 19477930.0, "step": 24212 }, { "epoch": 6.41260593220339, "grad_norm": 2.2945902347564697, "learning_rate": 6.793829449152543e-06, "loss": 1.1847, "mean_token_accuracy": 0.6997945159673691, "num_tokens": 19479275.0, "step": 24214 }, { "epoch": 6.413135593220339, "grad_norm": 2.337557315826416, "learning_rate": 6.7935646186440684e-06, "loss": 1.1911, "mean_token_accuracy": 0.7173586785793304, "num_tokens": 19481033.0, "step": 24216 }, { "epoch": 6.413665254237288, "grad_norm": 2.7184596061706543, "learning_rate": 6.793299788135593e-06, "loss": 1.0841, "mean_token_accuracy": 0.7443658635020256, "num_tokens": 19482307.0, "step": 24218 }, { "epoch": 6.414194915254237, "grad_norm": 2.13639497756958, "learning_rate": 6.793034957627119e-06, "loss": 0.758, "mean_token_accuracy": 0.7925720065832138, "num_tokens": 19483949.0, "step": 24220 }, { "epoch": 6.414724576271187, "grad_norm": 2.3526499271392822, "learning_rate": 6.792770127118644e-06, "loss": 0.9908, "mean_token_accuracy": 0.7726400047540665, "num_tokens": 19485507.0, "step": 24222 }, { "epoch": 6.415254237288136, "grad_norm": 2.4620978832244873, "learning_rate": 6.79250529661017e-06, "loss": 1.4784, "mean_token_accuracy": 0.6937752738595009, "num_tokens": 19487056.0, "step": 24224 }, { "epoch": 6.415783898305085, "grad_norm": 2.7218616008758545, "learning_rate": 6.792240466101695e-06, "loss": 1.0555, "mean_token_accuracy": 0.7662727013230324, "num_tokens": 19488375.0, "step": 24226 }, { "epoch": 6.416313559322034, "grad_norm": 2.877288579940796, "learning_rate": 6.7919756355932205e-06, "loss": 1.1767, "mean_token_accuracy": 0.695123977959156, "num_tokens": 19489922.0, "step": 24228 }, { "epoch": 6.416843220338983, "grad_norm": 1.9876872301101685, "learning_rate": 6.7917108050847455e-06, "loss": 1.2536, "mean_token_accuracy": 0.7172472029924393, "num_tokens": 19491577.0, "step": 24230 }, { "epoch": 6.4173728813559325, "grad_norm": 2.1930551528930664, "learning_rate": 6.791445974576272e-06, "loss": 1.368, "mean_token_accuracy": 0.6721377745270729, "num_tokens": 19493417.0, "step": 24232 }, { "epoch": 6.4179025423728815, "grad_norm": 2.5686259269714355, "learning_rate": 6.791181144067798e-06, "loss": 1.1766, "mean_token_accuracy": 0.7377098053693771, "num_tokens": 19495018.0, "step": 24234 }, { "epoch": 6.4184322033898304, "grad_norm": 1.9787101745605469, "learning_rate": 6.790916313559323e-06, "loss": 0.9192, "mean_token_accuracy": 0.7869895994663239, "num_tokens": 19496867.0, "step": 24236 }, { "epoch": 6.418961864406779, "grad_norm": 2.543297529220581, "learning_rate": 6.790651483050849e-06, "loss": 1.2122, "mean_token_accuracy": 0.7118230685591698, "num_tokens": 19498413.0, "step": 24238 }, { "epoch": 6.419491525423728, "grad_norm": 2.791091203689575, "learning_rate": 6.7903866525423735e-06, "loss": 1.224, "mean_token_accuracy": 0.7001800909638405, "num_tokens": 19499876.0, "step": 24240 }, { "epoch": 6.420021186440678, "grad_norm": 2.5351099967956543, "learning_rate": 6.790121822033899e-06, "loss": 0.9748, "mean_token_accuracy": 0.7621178478002548, "num_tokens": 19501503.0, "step": 24242 }, { "epoch": 6.420550847457627, "grad_norm": 2.6219069957733154, "learning_rate": 6.789856991525424e-06, "loss": 1.4628, "mean_token_accuracy": 0.6664972603321075, "num_tokens": 19503158.0, "step": 24244 }, { "epoch": 6.421080508474576, "grad_norm": 2.614988327026367, "learning_rate": 6.78959216101695e-06, "loss": 1.5257, "mean_token_accuracy": 0.6617174223065376, "num_tokens": 19504906.0, "step": 24246 }, { "epoch": 6.421610169491525, "grad_norm": 2.404808282852173, "learning_rate": 6.789327330508475e-06, "loss": 0.9731, "mean_token_accuracy": 0.7536152005195618, "num_tokens": 19506361.0, "step": 24248 }, { "epoch": 6.422139830508475, "grad_norm": 2.8378868103027344, "learning_rate": 6.789062500000001e-06, "loss": 0.917, "step": 24250 }, { "epoch": 6.422139830508475, "eval_loss": 1.3263899087905884, "eval_mean_token_accuracy": 0.7002705089844666, "eval_num_tokens": 19507695.0, "eval_runtime": 48.8143, "eval_samples_per_second": 6.31, "eval_steps_per_second": 6.31, "step": 24250 }, { "epoch": 6.422669491525424, "grad_norm": 2.5315961837768555, "learning_rate": 6.788797669491526e-06, "loss": 1.4788, "mean_token_accuracy": 0.7176171280443668, "num_tokens": 19509491.0, "step": 24252 }, { "epoch": 6.423199152542373, "grad_norm": 2.3790576457977295, "learning_rate": 6.788532838983051e-06, "loss": 0.7952, "mean_token_accuracy": 0.807662233710289, "num_tokens": 19510886.0, "step": 24254 }, { "epoch": 6.423728813559322, "grad_norm": 2.8316352367401123, "learning_rate": 6.788268008474576e-06, "loss": 1.5134, "mean_token_accuracy": 0.664506807923317, "num_tokens": 19512415.0, "step": 24256 }, { "epoch": 6.424258474576272, "grad_norm": 2.390718936920166, "learning_rate": 6.788003177966103e-06, "loss": 1.0931, "mean_token_accuracy": 0.729166142642498, "num_tokens": 19514033.0, "step": 24258 }, { "epoch": 6.424788135593221, "grad_norm": 2.177273988723755, "learning_rate": 6.787738347457628e-06, "loss": 1.0827, "mean_token_accuracy": 0.7326474189758301, "num_tokens": 19515781.0, "step": 24260 }, { "epoch": 6.4253177966101696, "grad_norm": 2.3545923233032227, "learning_rate": 6.787473516949154e-06, "loss": 1.213, "mean_token_accuracy": 0.7446140646934509, "num_tokens": 19517630.0, "step": 24262 }, { "epoch": 6.4258474576271185, "grad_norm": 2.500408887863159, "learning_rate": 6.787208686440679e-06, "loss": 1.3715, "mean_token_accuracy": 0.7294250726699829, "num_tokens": 19519324.0, "step": 24264 }, { "epoch": 6.4263771186440675, "grad_norm": 2.213385820388794, "learning_rate": 6.786943855932204e-06, "loss": 1.1775, "mean_token_accuracy": 0.727726049721241, "num_tokens": 19520925.0, "step": 24266 }, { "epoch": 6.426906779661017, "grad_norm": 1.8901410102844238, "learning_rate": 6.786679025423729e-06, "loss": 0.935, "mean_token_accuracy": 0.7549051195383072, "num_tokens": 19522956.0, "step": 24268 }, { "epoch": 6.427436440677966, "grad_norm": 1.9019463062286377, "learning_rate": 6.786414194915255e-06, "loss": 1.2322, "mean_token_accuracy": 0.7309023216366768, "num_tokens": 19524487.0, "step": 24270 }, { "epoch": 6.427966101694915, "grad_norm": 2.275146961212158, "learning_rate": 6.78614936440678e-06, "loss": 1.1255, "mean_token_accuracy": 0.7537982761859894, "num_tokens": 19526700.0, "step": 24272 }, { "epoch": 6.428495762711864, "grad_norm": 2.492076873779297, "learning_rate": 6.785884533898306e-06, "loss": 1.5856, "mean_token_accuracy": 0.6620995998382568, "num_tokens": 19528317.0, "step": 24274 }, { "epoch": 6.429025423728813, "grad_norm": 2.5209577083587646, "learning_rate": 6.785619703389831e-06, "loss": 1.2104, "mean_token_accuracy": 0.6979650184512138, "num_tokens": 19530020.0, "step": 24276 }, { "epoch": 6.429555084745763, "grad_norm": 1.953433871269226, "learning_rate": 6.7853548728813565e-06, "loss": 1.0828, "mean_token_accuracy": 0.7275232076644897, "num_tokens": 19531550.0, "step": 24278 }, { "epoch": 6.430084745762712, "grad_norm": 2.3258259296417236, "learning_rate": 6.785090042372881e-06, "loss": 1.2973, "mean_token_accuracy": 0.7097348421812057, "num_tokens": 19533360.0, "step": 24280 }, { "epoch": 6.430614406779661, "grad_norm": 3.1143407821655273, "learning_rate": 6.784825211864407e-06, "loss": 1.1122, "mean_token_accuracy": 0.7522830069065094, "num_tokens": 19534626.0, "step": 24282 }, { "epoch": 6.43114406779661, "grad_norm": 2.581156015396118, "learning_rate": 6.784560381355932e-06, "loss": 1.3814, "mean_token_accuracy": 0.6827825307846069, "num_tokens": 19536439.0, "step": 24284 }, { "epoch": 6.43167372881356, "grad_norm": 2.7685608863830566, "learning_rate": 6.784295550847459e-06, "loss": 1.1002, "mean_token_accuracy": 0.7528838291764259, "num_tokens": 19537851.0, "step": 24286 }, { "epoch": 6.432203389830509, "grad_norm": 2.4920294284820557, "learning_rate": 6.784030720338983e-06, "loss": 1.2621, "mean_token_accuracy": 0.6919988393783569, "num_tokens": 19539799.0, "step": 24288 }, { "epoch": 6.432733050847458, "grad_norm": 2.397381544113159, "learning_rate": 6.7837658898305094e-06, "loss": 1.5751, "mean_token_accuracy": 0.6402341574430466, "num_tokens": 19541638.0, "step": 24290 }, { "epoch": 6.433262711864407, "grad_norm": 2.7492876052856445, "learning_rate": 6.783501059322034e-06, "loss": 1.6257, "mean_token_accuracy": 0.6372670531272888, "num_tokens": 19543563.0, "step": 24292 }, { "epoch": 6.4337923728813555, "grad_norm": 2.93333101272583, "learning_rate": 6.78323622881356e-06, "loss": 1.3561, "mean_token_accuracy": 0.673450656235218, "num_tokens": 19545017.0, "step": 24294 }, { "epoch": 6.434322033898305, "grad_norm": 2.526819944381714, "learning_rate": 6.782971398305085e-06, "loss": 0.9837, "mean_token_accuracy": 0.762599416077137, "num_tokens": 19546449.0, "step": 24296 }, { "epoch": 6.434851694915254, "grad_norm": 2.686910629272461, "learning_rate": 6.782706567796611e-06, "loss": 1.4662, "mean_token_accuracy": 0.6498583778738976, "num_tokens": 19548251.0, "step": 24298 }, { "epoch": 6.435381355932203, "grad_norm": 2.485882043838501, "learning_rate": 6.782441737288136e-06, "loss": 1.2205, "mean_token_accuracy": 0.7343806698918343, "num_tokens": 19550019.0, "step": 24300 }, { "epoch": 6.435911016949152, "grad_norm": 2.5125882625579834, "learning_rate": 6.7821769067796616e-06, "loss": 1.3777, "mean_token_accuracy": 0.6680825054645538, "num_tokens": 19551832.0, "step": 24302 }, { "epoch": 6.436440677966102, "grad_norm": 2.0904011726379395, "learning_rate": 6.7819120762711865e-06, "loss": 1.3064, "mean_token_accuracy": 0.6886448413133621, "num_tokens": 19553963.0, "step": 24304 }, { "epoch": 6.436970338983051, "grad_norm": 2.691100835800171, "learning_rate": 6.781647245762712e-06, "loss": 1.1497, "mean_token_accuracy": 0.7011266276240349, "num_tokens": 19555502.0, "step": 24306 }, { "epoch": 6.4375, "grad_norm": 2.3514668941497803, "learning_rate": 6.781382415254237e-06, "loss": 1.4111, "mean_token_accuracy": 0.6881202310323715, "num_tokens": 19557143.0, "step": 24308 }, { "epoch": 6.438029661016949, "grad_norm": 2.2456510066986084, "learning_rate": 6.781117584745763e-06, "loss": 1.178, "mean_token_accuracy": 0.7133205756545067, "num_tokens": 19558839.0, "step": 24310 }, { "epoch": 6.438559322033898, "grad_norm": 2.3935303688049316, "learning_rate": 6.780852754237288e-06, "loss": 1.1626, "mean_token_accuracy": 0.7419235780835152, "num_tokens": 19560245.0, "step": 24312 }, { "epoch": 6.439088983050848, "grad_norm": 2.215787410736084, "learning_rate": 6.7805879237288145e-06, "loss": 1.3273, "mean_token_accuracy": 0.6787092983722687, "num_tokens": 19561729.0, "step": 24314 }, { "epoch": 6.439618644067797, "grad_norm": 2.416637420654297, "learning_rate": 6.780323093220339e-06, "loss": 0.9492, "mean_token_accuracy": 0.7685559317469597, "num_tokens": 19563179.0, "step": 24316 }, { "epoch": 6.440148305084746, "grad_norm": 2.5329790115356445, "learning_rate": 6.780058262711865e-06, "loss": 0.986, "mean_token_accuracy": 0.7597011551260948, "num_tokens": 19564610.0, "step": 24318 }, { "epoch": 6.440677966101695, "grad_norm": 2.187624216079712, "learning_rate": 6.779793432203391e-06, "loss": 1.1264, "mean_token_accuracy": 0.7178450524806976, "num_tokens": 19566397.0, "step": 24320 }, { "epoch": 6.4412076271186445, "grad_norm": 2.1515345573425293, "learning_rate": 6.779528601694916e-06, "loss": 1.032, "mean_token_accuracy": 0.7671520859003067, "num_tokens": 19567821.0, "step": 24322 }, { "epoch": 6.441737288135593, "grad_norm": 3.084932327270508, "learning_rate": 6.779263771186442e-06, "loss": 1.5452, "mean_token_accuracy": 0.6563404873013496, "num_tokens": 19569210.0, "step": 24324 }, { "epoch": 6.442266949152542, "grad_norm": 2.396256923675537, "learning_rate": 6.778998940677967e-06, "loss": 1.1929, "mean_token_accuracy": 0.7114703431725502, "num_tokens": 19570857.0, "step": 24326 }, { "epoch": 6.442796610169491, "grad_norm": 5.092067718505859, "learning_rate": 6.778734110169492e-06, "loss": 1.2525, "mean_token_accuracy": 0.7309362292289734, "num_tokens": 19572357.0, "step": 24328 }, { "epoch": 6.44332627118644, "grad_norm": 2.9300425052642822, "learning_rate": 6.778469279661017e-06, "loss": 1.2554, "mean_token_accuracy": 0.7235875800251961, "num_tokens": 19573912.0, "step": 24330 }, { "epoch": 6.44385593220339, "grad_norm": 2.4754488468170166, "learning_rate": 6.778204449152543e-06, "loss": 1.346, "mean_token_accuracy": 0.7020739316940308, "num_tokens": 19575519.0, "step": 24332 }, { "epoch": 6.444385593220339, "grad_norm": 3.0275046825408936, "learning_rate": 6.777939618644068e-06, "loss": 1.5288, "mean_token_accuracy": 0.6352874934673309, "num_tokens": 19577257.0, "step": 24334 }, { "epoch": 6.444915254237288, "grad_norm": 1.8655004501342773, "learning_rate": 6.777674788135594e-06, "loss": 0.6469, "mean_token_accuracy": 0.8336842209100723, "num_tokens": 19578958.0, "step": 24336 }, { "epoch": 6.445444915254237, "grad_norm": 2.3159871101379395, "learning_rate": 6.777409957627119e-06, "loss": 1.0861, "mean_token_accuracy": 0.7191332206130028, "num_tokens": 19580573.0, "step": 24338 }, { "epoch": 6.445974576271187, "grad_norm": 2.443312644958496, "learning_rate": 6.777145127118645e-06, "loss": 1.1041, "mean_token_accuracy": 0.7333912029862404, "num_tokens": 19581845.0, "step": 24340 }, { "epoch": 6.446504237288136, "grad_norm": 2.6062302589416504, "learning_rate": 6.7768802966101695e-06, "loss": 1.2868, "mean_token_accuracy": 0.6819686889648438, "num_tokens": 19583507.0, "step": 24342 }, { "epoch": 6.447033898305085, "grad_norm": 2.118797540664673, "learning_rate": 6.776615466101696e-06, "loss": 1.198, "mean_token_accuracy": 0.7159296721220016, "num_tokens": 19585203.0, "step": 24344 }, { "epoch": 6.447563559322034, "grad_norm": 2.2762491703033447, "learning_rate": 6.776350635593221e-06, "loss": 0.9836, "mean_token_accuracy": 0.7590419054031372, "num_tokens": 19586844.0, "step": 24346 }, { "epoch": 6.448093220338983, "grad_norm": 2.7605087757110596, "learning_rate": 6.776085805084747e-06, "loss": 1.0692, "mean_token_accuracy": 0.7391158640384674, "num_tokens": 19588325.0, "step": 24348 }, { "epoch": 6.4486228813559325, "grad_norm": 2.663677453994751, "learning_rate": 6.775820974576272e-06, "loss": 1.3282, "mean_token_accuracy": 0.6870510876178741, "num_tokens": 19589809.0, "step": 24350 }, { "epoch": 6.4491525423728815, "grad_norm": 1.9183300733566284, "learning_rate": 6.7755561440677975e-06, "loss": 1.1671, "mean_token_accuracy": 0.7107123732566833, "num_tokens": 19591420.0, "step": 24352 }, { "epoch": 6.4496822033898304, "grad_norm": 2.5912961959838867, "learning_rate": 6.7752913135593224e-06, "loss": 1.2415, "mean_token_accuracy": 0.7180081605911255, "num_tokens": 19593316.0, "step": 24354 }, { "epoch": 6.450211864406779, "grad_norm": 2.423618793487549, "learning_rate": 6.775026483050848e-06, "loss": 1.2141, "mean_token_accuracy": 0.6995702534914017, "num_tokens": 19594856.0, "step": 24356 }, { "epoch": 6.450741525423728, "grad_norm": 2.754955291748047, "learning_rate": 6.774761652542373e-06, "loss": 1.2618, "mean_token_accuracy": 0.7159415856003761, "num_tokens": 19596404.0, "step": 24358 }, { "epoch": 6.451271186440678, "grad_norm": 3.18350887298584, "learning_rate": 6.774496822033899e-06, "loss": 1.0877, "mean_token_accuracy": 0.7321458384394646, "num_tokens": 19597883.0, "step": 24360 }, { "epoch": 6.451800847457627, "grad_norm": 2.719825267791748, "learning_rate": 6.774231991525424e-06, "loss": 1.4449, "mean_token_accuracy": 0.6802644208073616, "num_tokens": 19599520.0, "step": 24362 }, { "epoch": 6.452330508474576, "grad_norm": 1.8159213066101074, "learning_rate": 6.77396716101695e-06, "loss": 0.639, "mean_token_accuracy": 0.8215695694088936, "num_tokens": 19601302.0, "step": 24364 }, { "epoch": 6.452860169491525, "grad_norm": 2.531614303588867, "learning_rate": 6.7737023305084745e-06, "loss": 1.426, "mean_token_accuracy": 0.6913564056158066, "num_tokens": 19603081.0, "step": 24366 }, { "epoch": 6.453389830508475, "grad_norm": 2.2572619915008545, "learning_rate": 6.773437500000001e-06, "loss": 1.2248, "mean_token_accuracy": 0.7208856903016567, "num_tokens": 19604594.0, "step": 24368 }, { "epoch": 6.453919491525424, "grad_norm": 2.597452402114868, "learning_rate": 6.773172669491525e-06, "loss": 1.2833, "mean_token_accuracy": 0.7193204089999199, "num_tokens": 19606054.0, "step": 24370 }, { "epoch": 6.454449152542373, "grad_norm": 1.8203431367874146, "learning_rate": 6.772907838983052e-06, "loss": 1.0477, "mean_token_accuracy": 0.7211634367704391, "num_tokens": 19608000.0, "step": 24372 }, { "epoch": 6.454978813559322, "grad_norm": 2.0965070724487305, "learning_rate": 6.772643008474577e-06, "loss": 0.9716, "mean_token_accuracy": 0.7594625949859619, "num_tokens": 19609844.0, "step": 24374 }, { "epoch": 6.455508474576272, "grad_norm": 2.558288812637329, "learning_rate": 6.7723781779661026e-06, "loss": 0.9064, "mean_token_accuracy": 0.7655023857951164, "num_tokens": 19611485.0, "step": 24376 }, { "epoch": 6.456038135593221, "grad_norm": 3.2963554859161377, "learning_rate": 6.7721133474576275e-06, "loss": 1.0692, "mean_token_accuracy": 0.7471839636564255, "num_tokens": 19613221.0, "step": 24378 }, { "epoch": 6.4565677966101696, "grad_norm": 2.1140928268432617, "learning_rate": 6.771848516949153e-06, "loss": 1.0585, "mean_token_accuracy": 0.7153672054409981, "num_tokens": 19615428.0, "step": 24380 }, { "epoch": 6.4570974576271185, "grad_norm": 2.5776255130767822, "learning_rate": 6.771583686440678e-06, "loss": 1.1131, "mean_token_accuracy": 0.7114621326327324, "num_tokens": 19617274.0, "step": 24382 }, { "epoch": 6.4576271186440675, "grad_norm": 2.0446486473083496, "learning_rate": 6.771318855932204e-06, "loss": 1.1447, "mean_token_accuracy": 0.7315317392349243, "num_tokens": 19618913.0, "step": 24384 }, { "epoch": 6.458156779661017, "grad_norm": 1.9245463609695435, "learning_rate": 6.771054025423729e-06, "loss": 0.7153, "mean_token_accuracy": 0.822851374745369, "num_tokens": 19620461.0, "step": 24386 }, { "epoch": 6.458686440677966, "grad_norm": 2.1429221630096436, "learning_rate": 6.770789194915255e-06, "loss": 1.3929, "mean_token_accuracy": 0.6924305111169815, "num_tokens": 19621992.0, "step": 24388 }, { "epoch": 6.459216101694915, "grad_norm": 2.615056037902832, "learning_rate": 6.77052436440678e-06, "loss": 1.2424, "mean_token_accuracy": 0.7028718516230583, "num_tokens": 19623585.0, "step": 24390 }, { "epoch": 6.459745762711864, "grad_norm": 2.0585453510284424, "learning_rate": 6.770259533898305e-06, "loss": 0.9035, "mean_token_accuracy": 0.7593251541256905, "num_tokens": 19625314.0, "step": 24392 }, { "epoch": 6.460275423728813, "grad_norm": 2.6840972900390625, "learning_rate": 6.76999470338983e-06, "loss": 1.5999, "mean_token_accuracy": 0.6560112237930298, "num_tokens": 19627263.0, "step": 24394 }, { "epoch": 6.460805084745763, "grad_norm": 2.8923754692077637, "learning_rate": 6.769729872881356e-06, "loss": 1.0812, "mean_token_accuracy": 0.756261371076107, "num_tokens": 19628770.0, "step": 24396 }, { "epoch": 6.461334745762712, "grad_norm": 2.7965264320373535, "learning_rate": 6.769465042372881e-06, "loss": 1.1133, "mean_token_accuracy": 0.774876244366169, "num_tokens": 19630583.0, "step": 24398 }, { "epoch": 6.461864406779661, "grad_norm": 2.4920291900634766, "learning_rate": 6.769200211864408e-06, "loss": 1.6888, "mean_token_accuracy": 0.649124126881361, "num_tokens": 19632427.0, "step": 24400 }, { "epoch": 6.46239406779661, "grad_norm": 2.6985623836517334, "learning_rate": 6.7689353813559334e-06, "loss": 1.4976, "mean_token_accuracy": 0.6779482588171959, "num_tokens": 19633862.0, "step": 24402 }, { "epoch": 6.46292372881356, "grad_norm": 1.7161402702331543, "learning_rate": 6.768670550847458e-06, "loss": 1.4167, "mean_token_accuracy": 0.6698194220662117, "num_tokens": 19635882.0, "step": 24404 }, { "epoch": 6.463453389830509, "grad_norm": 3.030869245529175, "learning_rate": 6.768405720338984e-06, "loss": 0.9377, "mean_token_accuracy": 0.7712486609816551, "num_tokens": 19637453.0, "step": 24406 }, { "epoch": 6.463983050847458, "grad_norm": 3.2130069732666016, "learning_rate": 6.768140889830509e-06, "loss": 1.3289, "mean_token_accuracy": 0.7033737152814865, "num_tokens": 19638787.0, "step": 24408 }, { "epoch": 6.464512711864407, "grad_norm": 1.9571679830551147, "learning_rate": 6.767876059322035e-06, "loss": 0.7896, "mean_token_accuracy": 0.7903874516487122, "num_tokens": 19640541.0, "step": 24410 }, { "epoch": 6.4650423728813555, "grad_norm": 2.7246718406677246, "learning_rate": 6.76761122881356e-06, "loss": 1.1282, "mean_token_accuracy": 0.7274778038263321, "num_tokens": 19642993.0, "step": 24412 }, { "epoch": 6.465572033898305, "grad_norm": 2.029024839401245, "learning_rate": 6.7673463983050855e-06, "loss": 0.828, "mean_token_accuracy": 0.7797376960515976, "num_tokens": 19644408.0, "step": 24414 }, { "epoch": 6.466101694915254, "grad_norm": 2.78778076171875, "learning_rate": 6.7670815677966105e-06, "loss": 1.3768, "mean_token_accuracy": 0.7015954330563545, "num_tokens": 19645979.0, "step": 24416 }, { "epoch": 6.466631355932203, "grad_norm": 2.1627047061920166, "learning_rate": 6.766816737288136e-06, "loss": 0.7525, "mean_token_accuracy": 0.7984752282500267, "num_tokens": 19647844.0, "step": 24418 }, { "epoch": 6.467161016949152, "grad_norm": 2.642340898513794, "learning_rate": 6.766551906779661e-06, "loss": 1.5482, "mean_token_accuracy": 0.653648667037487, "num_tokens": 19649549.0, "step": 24420 }, { "epoch": 6.467690677966102, "grad_norm": 2.4806554317474365, "learning_rate": 6.766287076271188e-06, "loss": 1.2101, "mean_token_accuracy": 0.7166097685694695, "num_tokens": 19651072.0, "step": 24422 }, { "epoch": 6.468220338983051, "grad_norm": 2.562568187713623, "learning_rate": 6.766022245762712e-06, "loss": 1.4199, "mean_token_accuracy": 0.7072912305593491, "num_tokens": 19652638.0, "step": 24424 }, { "epoch": 6.46875, "grad_norm": 2.4284610748291016, "learning_rate": 6.7657574152542385e-06, "loss": 1.2482, "mean_token_accuracy": 0.7071619033813477, "num_tokens": 19654248.0, "step": 24426 }, { "epoch": 6.469279661016949, "grad_norm": 2.4823758602142334, "learning_rate": 6.7654925847457634e-06, "loss": 1.6139, "mean_token_accuracy": 0.6310145705938339, "num_tokens": 19655916.0, "step": 24428 }, { "epoch": 6.469809322033898, "grad_norm": 2.674844741821289, "learning_rate": 6.765227754237289e-06, "loss": 1.1477, "mean_token_accuracy": 0.7472007870674133, "num_tokens": 19657221.0, "step": 24430 }, { "epoch": 6.470338983050848, "grad_norm": 2.1382675170898438, "learning_rate": 6.764962923728814e-06, "loss": 0.8997, "mean_token_accuracy": 0.770720973610878, "num_tokens": 19658941.0, "step": 24432 }, { "epoch": 6.470868644067797, "grad_norm": 2.488955497741699, "learning_rate": 6.76469809322034e-06, "loss": 1.1141, "mean_token_accuracy": 0.7363522350788116, "num_tokens": 19660480.0, "step": 24434 }, { "epoch": 6.471398305084746, "grad_norm": 2.143798589706421, "learning_rate": 6.764433262711865e-06, "loss": 1.2505, "mean_token_accuracy": 0.6968978717923164, "num_tokens": 19662126.0, "step": 24436 }, { "epoch": 6.471927966101695, "grad_norm": 2.806725263595581, "learning_rate": 6.764168432203391e-06, "loss": 1.2434, "mean_token_accuracy": 0.7177361845970154, "num_tokens": 19663952.0, "step": 24438 }, { "epoch": 6.4724576271186445, "grad_norm": 2.0622005462646484, "learning_rate": 6.7639036016949156e-06, "loss": 1.1487, "mean_token_accuracy": 0.7198665738105774, "num_tokens": 19665704.0, "step": 24440 }, { "epoch": 6.472987288135593, "grad_norm": 2.3151590824127197, "learning_rate": 6.763638771186441e-06, "loss": 1.4202, "mean_token_accuracy": 0.66475909948349, "num_tokens": 19667393.0, "step": 24442 }, { "epoch": 6.473516949152542, "grad_norm": 2.9622795581817627, "learning_rate": 6.763373940677966e-06, "loss": 1.5223, "mean_token_accuracy": 0.655999481678009, "num_tokens": 19668912.0, "step": 24444 }, { "epoch": 6.474046610169491, "grad_norm": 2.486393690109253, "learning_rate": 6.763109110169492e-06, "loss": 1.1181, "mean_token_accuracy": 0.7297025620937347, "num_tokens": 19670228.0, "step": 24446 }, { "epoch": 6.47457627118644, "grad_norm": 1.7958003282546997, "learning_rate": 6.762844279661017e-06, "loss": 0.8115, "mean_token_accuracy": 0.797741562128067, "num_tokens": 19671807.0, "step": 24448 }, { "epoch": 6.47510593220339, "grad_norm": 2.753279447555542, "learning_rate": 6.762579449152543e-06, "loss": 1.3819, "mean_token_accuracy": 0.6849573105573654, "num_tokens": 19673396.0, "step": 24450 }, { "epoch": 6.475635593220339, "grad_norm": 2.120227575302124, "learning_rate": 6.762314618644068e-06, "loss": 1.368, "mean_token_accuracy": 0.6978294476866722, "num_tokens": 19675223.0, "step": 24452 }, { "epoch": 6.476165254237288, "grad_norm": 2.8239753246307373, "learning_rate": 6.762049788135594e-06, "loss": 1.18, "mean_token_accuracy": 0.7201395630836487, "num_tokens": 19676836.0, "step": 24454 }, { "epoch": 6.476694915254237, "grad_norm": 2.059617042541504, "learning_rate": 6.761784957627119e-06, "loss": 0.7805, "mean_token_accuracy": 0.7991960868239403, "num_tokens": 19678347.0, "step": 24456 }, { "epoch": 6.477224576271187, "grad_norm": 3.0174567699432373, "learning_rate": 6.761520127118645e-06, "loss": 1.2912, "mean_token_accuracy": 0.7229667901992798, "num_tokens": 19679815.0, "step": 24458 }, { "epoch": 6.477754237288136, "grad_norm": 2.285661458969116, "learning_rate": 6.76125529661017e-06, "loss": 0.6994, "mean_token_accuracy": 0.8201035931706429, "num_tokens": 19681267.0, "step": 24460 }, { "epoch": 6.478283898305085, "grad_norm": 2.439296245574951, "learning_rate": 6.760990466101696e-06, "loss": 1.2491, "mean_token_accuracy": 0.7271200753748417, "num_tokens": 19682793.0, "step": 24462 }, { "epoch": 6.478813559322034, "grad_norm": 2.4770920276641846, "learning_rate": 6.760725635593221e-06, "loss": 1.1294, "mean_token_accuracy": 0.7199801877140999, "num_tokens": 19684228.0, "step": 24464 }, { "epoch": 6.479343220338983, "grad_norm": 2.2287254333496094, "learning_rate": 6.760460805084746e-06, "loss": 1.101, "mean_token_accuracy": 0.7288252264261246, "num_tokens": 19685761.0, "step": 24466 }, { "epoch": 6.4798728813559325, "grad_norm": 2.350555419921875, "learning_rate": 6.760195974576271e-06, "loss": 1.2895, "mean_token_accuracy": 0.6862274333834648, "num_tokens": 19687346.0, "step": 24468 }, { "epoch": 6.4804025423728815, "grad_norm": 2.3707799911499023, "learning_rate": 6.759931144067797e-06, "loss": 1.5372, "mean_token_accuracy": 0.6463767141103745, "num_tokens": 19689128.0, "step": 24470 }, { "epoch": 6.4809322033898304, "grad_norm": 2.457754135131836, "learning_rate": 6.759666313559322e-06, "loss": 1.2547, "mean_token_accuracy": 0.6893185824155807, "num_tokens": 19690841.0, "step": 24472 }, { "epoch": 6.481461864406779, "grad_norm": 2.2819101810455322, "learning_rate": 6.759401483050848e-06, "loss": 1.4897, "mean_token_accuracy": 0.6774904131889343, "num_tokens": 19692416.0, "step": 24474 }, { "epoch": 6.481991525423728, "grad_norm": 2.1403615474700928, "learning_rate": 6.759136652542373e-06, "loss": 0.7573, "mean_token_accuracy": 0.7907117083668709, "num_tokens": 19694152.0, "step": 24476 }, { "epoch": 6.482521186440678, "grad_norm": 2.8213417530059814, "learning_rate": 6.7588718220338985e-06, "loss": 1.5683, "mean_token_accuracy": 0.674546230584383, "num_tokens": 19695768.0, "step": 24478 }, { "epoch": 6.483050847457627, "grad_norm": 1.840298056602478, "learning_rate": 6.7586069915254235e-06, "loss": 0.9772, "mean_token_accuracy": 0.7814854234457016, "num_tokens": 19697438.0, "step": 24480 }, { "epoch": 6.483580508474576, "grad_norm": 2.2630693912506104, "learning_rate": 6.75834216101695e-06, "loss": 1.2798, "mean_token_accuracy": 0.6979364156723022, "num_tokens": 19699109.0, "step": 24482 }, { "epoch": 6.484110169491525, "grad_norm": 2.696942090988159, "learning_rate": 6.758077330508475e-06, "loss": 1.0626, "mean_token_accuracy": 0.7356021329760551, "num_tokens": 19700608.0, "step": 24484 }, { "epoch": 6.484639830508475, "grad_norm": 2.5929758548736572, "learning_rate": 6.757812500000001e-06, "loss": 1.0599, "mean_token_accuracy": 0.7305682525038719, "num_tokens": 19702250.0, "step": 24486 }, { "epoch": 6.485169491525424, "grad_norm": 2.454146385192871, "learning_rate": 6.7575476694915266e-06, "loss": 1.2439, "mean_token_accuracy": 0.7100508436560631, "num_tokens": 19703556.0, "step": 24488 }, { "epoch": 6.485699152542373, "grad_norm": 2.377326250076294, "learning_rate": 6.7572828389830515e-06, "loss": 1.3578, "mean_token_accuracy": 0.719179667532444, "num_tokens": 19704865.0, "step": 24490 }, { "epoch": 6.486228813559322, "grad_norm": 2.7048227787017822, "learning_rate": 6.757018008474577e-06, "loss": 1.1273, "mean_token_accuracy": 0.7301125824451447, "num_tokens": 19706361.0, "step": 24492 }, { "epoch": 6.486758474576272, "grad_norm": 2.592236042022705, "learning_rate": 6.756753177966102e-06, "loss": 1.1856, "mean_token_accuracy": 0.723327748477459, "num_tokens": 19708090.0, "step": 24494 }, { "epoch": 6.487288135593221, "grad_norm": 2.749699831008911, "learning_rate": 6.756488347457628e-06, "loss": 1.2378, "mean_token_accuracy": 0.7007360495626926, "num_tokens": 19709690.0, "step": 24496 }, { "epoch": 6.4878177966101696, "grad_norm": 1.900003433227539, "learning_rate": 6.756223516949153e-06, "loss": 1.0683, "mean_token_accuracy": 0.7391645088791847, "num_tokens": 19711239.0, "step": 24498 }, { "epoch": 6.4883474576271185, "grad_norm": 2.6308207511901855, "learning_rate": 6.755958686440679e-06, "loss": 0.8868, "step": 24500 }, { "epoch": 6.4883474576271185, "eval_loss": 1.3283816576004028, "eval_mean_token_accuracy": 0.7005196978132446, "eval_num_tokens": 19712709.0, "eval_runtime": 48.8313, "eval_samples_per_second": 6.307, "eval_steps_per_second": 6.307, "step": 24500 }, { "epoch": 6.4888771186440675, "grad_norm": 2.198380470275879, "learning_rate": 6.755693855932204e-06, "loss": 1.1183, "mean_token_accuracy": 0.7596094943583012, "num_tokens": 19714373.0, "step": 24502 }, { "epoch": 6.489406779661017, "grad_norm": 2.838540554046631, "learning_rate": 6.755429025423729e-06, "loss": 1.3645, "mean_token_accuracy": 0.6700099185109138, "num_tokens": 19716178.0, "step": 24504 }, { "epoch": 6.489936440677966, "grad_norm": 2.7437191009521484, "learning_rate": 6.755164194915254e-06, "loss": 1.0454, "mean_token_accuracy": 0.7791746035218239, "num_tokens": 19717646.0, "step": 24506 }, { "epoch": 6.490466101694915, "grad_norm": 2.2095606327056885, "learning_rate": 6.754899364406781e-06, "loss": 1.381, "mean_token_accuracy": 0.686764732003212, "num_tokens": 19719292.0, "step": 24508 }, { "epoch": 6.490995762711864, "grad_norm": 2.613250970840454, "learning_rate": 6.754634533898306e-06, "loss": 0.9558, "mean_token_accuracy": 0.7663799002766609, "num_tokens": 19720900.0, "step": 24510 }, { "epoch": 6.491525423728813, "grad_norm": 2.755781412124634, "learning_rate": 6.754369703389832e-06, "loss": 1.7232, "mean_token_accuracy": 0.6222809329628944, "num_tokens": 19722378.0, "step": 24512 }, { "epoch": 6.492055084745763, "grad_norm": 2.413886308670044, "learning_rate": 6.7541048728813566e-06, "loss": 0.9423, "mean_token_accuracy": 0.7687739133834839, "num_tokens": 19724011.0, "step": 24514 }, { "epoch": 6.492584745762712, "grad_norm": 2.6313636302948, "learning_rate": 6.753840042372882e-06, "loss": 1.0235, "mean_token_accuracy": 0.7579780593514442, "num_tokens": 19725571.0, "step": 24516 }, { "epoch": 6.493114406779661, "grad_norm": 2.665642023086548, "learning_rate": 6.753575211864407e-06, "loss": 1.0075, "mean_token_accuracy": 0.752556823194027, "num_tokens": 19726853.0, "step": 24518 }, { "epoch": 6.49364406779661, "grad_norm": 2.1682608127593994, "learning_rate": 6.753310381355933e-06, "loss": 0.9136, "mean_token_accuracy": 0.785098634660244, "num_tokens": 19728507.0, "step": 24520 }, { "epoch": 6.49417372881356, "grad_norm": 2.8671324253082275, "learning_rate": 6.753045550847458e-06, "loss": 1.3545, "mean_token_accuracy": 0.6898795887827873, "num_tokens": 19730042.0, "step": 24522 }, { "epoch": 6.494703389830509, "grad_norm": 2.5803451538085938, "learning_rate": 6.752780720338984e-06, "loss": 1.0324, "mean_token_accuracy": 0.7508117482066154, "num_tokens": 19731729.0, "step": 24524 }, { "epoch": 6.495233050847458, "grad_norm": 2.5621299743652344, "learning_rate": 6.752515889830509e-06, "loss": 1.0233, "mean_token_accuracy": 0.7572798579931259, "num_tokens": 19733027.0, "step": 24526 }, { "epoch": 6.495762711864407, "grad_norm": 1.793230414390564, "learning_rate": 6.7522510593220345e-06, "loss": 0.8134, "mean_token_accuracy": 0.8026889711618423, "num_tokens": 19734802.0, "step": 24528 }, { "epoch": 6.4962923728813555, "grad_norm": 2.111769914627075, "learning_rate": 6.751986228813559e-06, "loss": 1.0075, "mean_token_accuracy": 0.7641246914863586, "num_tokens": 19736255.0, "step": 24530 }, { "epoch": 6.496822033898305, "grad_norm": 2.2983102798461914, "learning_rate": 6.751721398305085e-06, "loss": 1.261, "mean_token_accuracy": 0.731522798538208, "num_tokens": 19737705.0, "step": 24532 }, { "epoch": 6.497351694915254, "grad_norm": 2.5309383869171143, "learning_rate": 6.75145656779661e-06, "loss": 1.6208, "mean_token_accuracy": 0.6736794635653496, "num_tokens": 19739225.0, "step": 24534 }, { "epoch": 6.497881355932203, "grad_norm": 2.5448379516601562, "learning_rate": 6.751191737288137e-06, "loss": 1.3989, "mean_token_accuracy": 0.7099657505750656, "num_tokens": 19740851.0, "step": 24536 }, { "epoch": 6.498411016949152, "grad_norm": 2.6754581928253174, "learning_rate": 6.750926906779662e-06, "loss": 1.4767, "mean_token_accuracy": 0.7138269022107124, "num_tokens": 19742287.0, "step": 24538 }, { "epoch": 6.498940677966102, "grad_norm": 2.3287134170532227, "learning_rate": 6.7506620762711874e-06, "loss": 1.0777, "mean_token_accuracy": 0.7539490759372711, "num_tokens": 19743942.0, "step": 24540 }, { "epoch": 6.499470338983051, "grad_norm": 2.3325836658477783, "learning_rate": 6.750397245762712e-06, "loss": 1.1537, "mean_token_accuracy": 0.7159208729863167, "num_tokens": 19745486.0, "step": 24542 }, { "epoch": 6.5, "grad_norm": 2.8180606365203857, "learning_rate": 6.750132415254238e-06, "loss": 1.1905, "mean_token_accuracy": 0.7156564071774483, "num_tokens": 19747153.0, "step": 24544 }, { "epoch": 6.500529661016949, "grad_norm": 2.2250986099243164, "learning_rate": 6.749867584745763e-06, "loss": 1.0947, "mean_token_accuracy": 0.7432102113962173, "num_tokens": 19748755.0, "step": 24546 }, { "epoch": 6.501059322033898, "grad_norm": 2.7712349891662598, "learning_rate": 6.749602754237289e-06, "loss": 1.3132, "mean_token_accuracy": 0.6688382625579834, "num_tokens": 19750508.0, "step": 24548 }, { "epoch": 6.501588983050848, "grad_norm": 1.9960353374481201, "learning_rate": 6.749337923728814e-06, "loss": 1.0303, "mean_token_accuracy": 0.7735660821199417, "num_tokens": 19752123.0, "step": 24550 }, { "epoch": 6.502118644067797, "grad_norm": 2.393563747406006, "learning_rate": 6.7490730932203395e-06, "loss": 1.243, "mean_token_accuracy": 0.7148779332637787, "num_tokens": 19753614.0, "step": 24552 }, { "epoch": 6.502648305084746, "grad_norm": 2.3959808349609375, "learning_rate": 6.7488082627118645e-06, "loss": 1.3673, "mean_token_accuracy": 0.6793966963887215, "num_tokens": 19755575.0, "step": 24554 }, { "epoch": 6.503177966101695, "grad_norm": 2.591728448867798, "learning_rate": 6.74854343220339e-06, "loss": 1.4416, "mean_token_accuracy": 0.6892464011907578, "num_tokens": 19757107.0, "step": 24556 }, { "epoch": 6.503707627118644, "grad_norm": 2.5260393619537354, "learning_rate": 6.748278601694915e-06, "loss": 1.4846, "mean_token_accuracy": 0.6632859483361244, "num_tokens": 19758714.0, "step": 24558 }, { "epoch": 6.504237288135593, "grad_norm": 2.470209836959839, "learning_rate": 6.748013771186441e-06, "loss": 1.2959, "mean_token_accuracy": 0.7111987248063087, "num_tokens": 19760252.0, "step": 24560 }, { "epoch": 6.504766949152542, "grad_norm": 2.112999200820923, "learning_rate": 6.747748940677966e-06, "loss": 1.0316, "mean_token_accuracy": 0.7487302124500275, "num_tokens": 19761869.0, "step": 24562 }, { "epoch": 6.505296610169491, "grad_norm": 2.5946221351623535, "learning_rate": 6.7474841101694925e-06, "loss": 1.4488, "mean_token_accuracy": 0.6844548508524895, "num_tokens": 19763475.0, "step": 24564 }, { "epoch": 6.50582627118644, "grad_norm": 1.8194140195846558, "learning_rate": 6.747219279661017e-06, "loss": 1.1212, "mean_token_accuracy": 0.743334136903286, "num_tokens": 19765304.0, "step": 24566 }, { "epoch": 6.50635593220339, "grad_norm": 2.4373550415039062, "learning_rate": 6.746954449152543e-06, "loss": 1.0242, "mean_token_accuracy": 0.7559815496206284, "num_tokens": 19766832.0, "step": 24568 }, { "epoch": 6.506885593220339, "grad_norm": 2.909759283065796, "learning_rate": 6.746689618644068e-06, "loss": 1.567, "mean_token_accuracy": 0.6622926443815231, "num_tokens": 19768412.0, "step": 24570 }, { "epoch": 6.507415254237288, "grad_norm": 2.5143399238586426, "learning_rate": 6.746424788135594e-06, "loss": 1.1798, "mean_token_accuracy": 0.7311828881502151, "num_tokens": 19769957.0, "step": 24572 }, { "epoch": 6.507944915254237, "grad_norm": 2.598085641860962, "learning_rate": 6.74615995762712e-06, "loss": 1.4797, "mean_token_accuracy": 0.6529109105467796, "num_tokens": 19771691.0, "step": 24574 }, { "epoch": 6.508474576271187, "grad_norm": 2.7285971641540527, "learning_rate": 6.745895127118645e-06, "loss": 1.2864, "mean_token_accuracy": 0.7259767949581146, "num_tokens": 19773210.0, "step": 24576 }, { "epoch": 6.509004237288136, "grad_norm": 2.701974391937256, "learning_rate": 6.74563029661017e-06, "loss": 1.1051, "mean_token_accuracy": 0.7246604859828949, "num_tokens": 19774681.0, "step": 24578 }, { "epoch": 6.509533898305085, "grad_norm": 2.8521437644958496, "learning_rate": 6.745365466101695e-06, "loss": 1.3478, "mean_token_accuracy": 0.7116238996386528, "num_tokens": 19776154.0, "step": 24580 }, { "epoch": 6.510063559322034, "grad_norm": 2.1243467330932617, "learning_rate": 6.745100635593221e-06, "loss": 0.9915, "mean_token_accuracy": 0.7597714886069298, "num_tokens": 19777603.0, "step": 24582 }, { "epoch": 6.510593220338983, "grad_norm": 2.024491310119629, "learning_rate": 6.744835805084746e-06, "loss": 0.8368, "mean_token_accuracy": 0.7770893648266792, "num_tokens": 19779202.0, "step": 24584 }, { "epoch": 6.5111228813559325, "grad_norm": 2.416433334350586, "learning_rate": 6.744570974576272e-06, "loss": 1.5329, "mean_token_accuracy": 0.650929220020771, "num_tokens": 19780673.0, "step": 24586 }, { "epoch": 6.5116525423728815, "grad_norm": 2.2465085983276367, "learning_rate": 6.744306144067797e-06, "loss": 1.1767, "mean_token_accuracy": 0.7265536487102509, "num_tokens": 19782324.0, "step": 24588 }, { "epoch": 6.5121822033898304, "grad_norm": 3.1664230823516846, "learning_rate": 6.744041313559323e-06, "loss": 1.2957, "mean_token_accuracy": 0.7214479520916939, "num_tokens": 19783778.0, "step": 24590 }, { "epoch": 6.512711864406779, "grad_norm": 2.7159624099731445, "learning_rate": 6.743776483050848e-06, "loss": 1.5778, "mean_token_accuracy": 0.6772791296243668, "num_tokens": 19785175.0, "step": 24592 }, { "epoch": 6.513241525423728, "grad_norm": 2.522104263305664, "learning_rate": 6.743511652542374e-06, "loss": 1.3675, "mean_token_accuracy": 0.7047025859355927, "num_tokens": 19787051.0, "step": 24594 }, { "epoch": 6.513771186440678, "grad_norm": 2.5538437366485596, "learning_rate": 6.743246822033899e-06, "loss": 1.0706, "mean_token_accuracy": 0.73118045181036, "num_tokens": 19788602.0, "step": 24596 }, { "epoch": 6.514300847457627, "grad_norm": 2.677445411682129, "learning_rate": 6.742981991525425e-06, "loss": 1.293, "mean_token_accuracy": 0.7079331055283546, "num_tokens": 19790420.0, "step": 24598 }, { "epoch": 6.514830508474576, "grad_norm": 2.3177990913391113, "learning_rate": 6.74271716101695e-06, "loss": 1.5913, "mean_token_accuracy": 0.647923156619072, "num_tokens": 19792258.0, "step": 24600 }, { "epoch": 6.515360169491525, "grad_norm": 2.617781162261963, "learning_rate": 6.7424523305084755e-06, "loss": 1.1532, "mean_token_accuracy": 0.7288661226630211, "num_tokens": 19793434.0, "step": 24602 }, { "epoch": 6.515889830508475, "grad_norm": 2.0488882064819336, "learning_rate": 6.7421875e-06, "loss": 1.1233, "mean_token_accuracy": 0.7208994254469872, "num_tokens": 19795216.0, "step": 24604 }, { "epoch": 6.516419491525424, "grad_norm": 1.8757604360580444, "learning_rate": 6.741922669491526e-06, "loss": 1.1141, "mean_token_accuracy": 0.741468645632267, "num_tokens": 19797099.0, "step": 24606 }, { "epoch": 6.516949152542373, "grad_norm": 2.1493992805480957, "learning_rate": 6.741657838983051e-06, "loss": 0.9701, "mean_token_accuracy": 0.7748172506690025, "num_tokens": 19798626.0, "step": 24608 }, { "epoch": 6.517478813559322, "grad_norm": 2.7032124996185303, "learning_rate": 6.741393008474577e-06, "loss": 1.4703, "mean_token_accuracy": 0.6762109696865082, "num_tokens": 19800078.0, "step": 24610 }, { "epoch": 6.518008474576272, "grad_norm": 2.4260144233703613, "learning_rate": 6.741128177966102e-06, "loss": 1.4257, "mean_token_accuracy": 0.6784448176622391, "num_tokens": 19801698.0, "step": 24612 }, { "epoch": 6.518538135593221, "grad_norm": 2.8088793754577637, "learning_rate": 6.740863347457628e-06, "loss": 1.2655, "mean_token_accuracy": 0.7018880397081375, "num_tokens": 19804138.0, "step": 24614 }, { "epoch": 6.5190677966101696, "grad_norm": 2.3597419261932373, "learning_rate": 6.7405985169491525e-06, "loss": 1.0889, "mean_token_accuracy": 0.7180495634675026, "num_tokens": 19805658.0, "step": 24616 }, { "epoch": 6.5195974576271185, "grad_norm": 2.337186574935913, "learning_rate": 6.740333686440679e-06, "loss": 1.3386, "mean_token_accuracy": 0.6731394231319427, "num_tokens": 19807173.0, "step": 24618 }, { "epoch": 6.5201271186440675, "grad_norm": 2.2728326320648193, "learning_rate": 6.740068855932203e-06, "loss": 1.2222, "mean_token_accuracy": 0.7314516492187977, "num_tokens": 19808660.0, "step": 24620 }, { "epoch": 6.520656779661017, "grad_norm": 2.8100242614746094, "learning_rate": 6.73980402542373e-06, "loss": 1.4407, "mean_token_accuracy": 0.6848981454968452, "num_tokens": 19810234.0, "step": 24622 }, { "epoch": 6.521186440677966, "grad_norm": 2.8645598888397217, "learning_rate": 6.739539194915255e-06, "loss": 1.1214, "mean_token_accuracy": 0.741161897778511, "num_tokens": 19811651.0, "step": 24624 }, { "epoch": 6.521716101694915, "grad_norm": 2.4622433185577393, "learning_rate": 6.7392743644067806e-06, "loss": 1.3038, "mean_token_accuracy": 0.7059260383248329, "num_tokens": 19813388.0, "step": 24626 }, { "epoch": 6.522245762711864, "grad_norm": 2.775634765625, "learning_rate": 6.7390095338983055e-06, "loss": 1.4371, "mean_token_accuracy": 0.6684923321008682, "num_tokens": 19814898.0, "step": 24628 }, { "epoch": 6.522775423728813, "grad_norm": 2.8645691871643066, "learning_rate": 6.738744703389831e-06, "loss": 1.307, "mean_token_accuracy": 0.6982790902256966, "num_tokens": 19816436.0, "step": 24630 }, { "epoch": 6.523305084745763, "grad_norm": 2.3625776767730713, "learning_rate": 6.738479872881356e-06, "loss": 1.0641, "mean_token_accuracy": 0.7383341640233994, "num_tokens": 19817984.0, "step": 24632 }, { "epoch": 6.523834745762712, "grad_norm": 3.2353579998016357, "learning_rate": 6.738215042372882e-06, "loss": 1.1542, "mean_token_accuracy": 0.7023402750492096, "num_tokens": 19819322.0, "step": 24634 }, { "epoch": 6.524364406779661, "grad_norm": 2.213388204574585, "learning_rate": 6.737950211864407e-06, "loss": 0.9371, "mean_token_accuracy": 0.7724405005574226, "num_tokens": 19820956.0, "step": 24636 }, { "epoch": 6.52489406779661, "grad_norm": 2.116448402404785, "learning_rate": 6.737685381355933e-06, "loss": 1.093, "mean_token_accuracy": 0.7484240457415581, "num_tokens": 19822511.0, "step": 24638 }, { "epoch": 6.52542372881356, "grad_norm": 2.147841215133667, "learning_rate": 6.737420550847458e-06, "loss": 1.332, "mean_token_accuracy": 0.697290413081646, "num_tokens": 19824105.0, "step": 24640 }, { "epoch": 6.525953389830509, "grad_norm": 2.7666356563568115, "learning_rate": 6.737155720338983e-06, "loss": 1.1872, "mean_token_accuracy": 0.7070365250110626, "num_tokens": 19825520.0, "step": 24642 }, { "epoch": 6.526483050847458, "grad_norm": 2.5609076023101807, "learning_rate": 6.736890889830508e-06, "loss": 1.1971, "mean_token_accuracy": 0.7189816683530807, "num_tokens": 19827138.0, "step": 24644 }, { "epoch": 6.527012711864407, "grad_norm": 2.120166778564453, "learning_rate": 6.736626059322035e-06, "loss": 0.9555, "mean_token_accuracy": 0.7873008102178574, "num_tokens": 19828531.0, "step": 24646 }, { "epoch": 6.527542372881356, "grad_norm": 2.14848256111145, "learning_rate": 6.736361228813559e-06, "loss": 1.1427, "mean_token_accuracy": 0.7330814525485039, "num_tokens": 19830128.0, "step": 24648 }, { "epoch": 6.528072033898305, "grad_norm": 2.027043104171753, "learning_rate": 6.736096398305086e-06, "loss": 1.5384, "mean_token_accuracy": 0.643993191421032, "num_tokens": 19832060.0, "step": 24650 }, { "epoch": 6.528601694915254, "grad_norm": 2.7642831802368164, "learning_rate": 6.7358315677966106e-06, "loss": 1.5519, "mean_token_accuracy": 0.6622018031775951, "num_tokens": 19833433.0, "step": 24652 }, { "epoch": 6.529131355932203, "grad_norm": 2.2563223838806152, "learning_rate": 6.735566737288136e-06, "loss": 1.1034, "mean_token_accuracy": 0.7307763174176216, "num_tokens": 19834946.0, "step": 24654 }, { "epoch": 6.529661016949152, "grad_norm": 2.6202542781829834, "learning_rate": 6.735301906779662e-06, "loss": 0.9474, "mean_token_accuracy": 0.7660959661006927, "num_tokens": 19836430.0, "step": 24656 }, { "epoch": 6.530190677966102, "grad_norm": 2.3678319454193115, "learning_rate": 6.735037076271187e-06, "loss": 1.1181, "mean_token_accuracy": 0.7320964299142361, "num_tokens": 19837942.0, "step": 24658 }, { "epoch": 6.530720338983051, "grad_norm": 2.261176824569702, "learning_rate": 6.734772245762713e-06, "loss": 0.8811, "mean_token_accuracy": 0.7760128453373909, "num_tokens": 19839526.0, "step": 24660 }, { "epoch": 6.53125, "grad_norm": 2.2222096920013428, "learning_rate": 6.734507415254238e-06, "loss": 1.0055, "mean_token_accuracy": 0.73937027156353, "num_tokens": 19841121.0, "step": 24662 }, { "epoch": 6.531779661016949, "grad_norm": 2.7251389026641846, "learning_rate": 6.7342425847457635e-06, "loss": 1.1375, "mean_token_accuracy": 0.7384627759456635, "num_tokens": 19842830.0, "step": 24664 }, { "epoch": 6.532309322033898, "grad_norm": 2.454362154006958, "learning_rate": 6.7339777542372885e-06, "loss": 1.0892, "mean_token_accuracy": 0.7683468088507652, "num_tokens": 19844266.0, "step": 24666 }, { "epoch": 6.532838983050848, "grad_norm": 2.0779364109039307, "learning_rate": 6.733712923728814e-06, "loss": 0.9747, "mean_token_accuracy": 0.7435675859451294, "num_tokens": 19846151.0, "step": 24668 }, { "epoch": 6.533368644067797, "grad_norm": 2.464508056640625, "learning_rate": 6.733448093220339e-06, "loss": 1.546, "mean_token_accuracy": 0.6488096788525581, "num_tokens": 19847805.0, "step": 24670 }, { "epoch": 6.533898305084746, "grad_norm": 3.2791035175323486, "learning_rate": 6.733183262711866e-06, "loss": 1.336, "mean_token_accuracy": 0.6696737632155418, "num_tokens": 19849315.0, "step": 24672 }, { "epoch": 6.534427966101695, "grad_norm": 2.1879208087921143, "learning_rate": 6.73291843220339e-06, "loss": 1.0517, "mean_token_accuracy": 0.7378734648227692, "num_tokens": 19850813.0, "step": 24674 }, { "epoch": 6.534957627118644, "grad_norm": 2.6882317066192627, "learning_rate": 6.7326536016949165e-06, "loss": 1.4851, "mean_token_accuracy": 0.6610315516591072, "num_tokens": 19852612.0, "step": 24676 }, { "epoch": 6.535487288135593, "grad_norm": 2.530841827392578, "learning_rate": 6.732388771186441e-06, "loss": 1.3464, "mean_token_accuracy": 0.6954856663942337, "num_tokens": 19854095.0, "step": 24678 }, { "epoch": 6.536016949152542, "grad_norm": 2.813941240310669, "learning_rate": 6.732123940677967e-06, "loss": 1.2092, "mean_token_accuracy": 0.706106424331665, "num_tokens": 19855636.0, "step": 24680 }, { "epoch": 6.536546610169491, "grad_norm": 1.7125608921051025, "learning_rate": 6.731859110169492e-06, "loss": 0.891, "mean_token_accuracy": 0.7830609604716301, "num_tokens": 19858079.0, "step": 24682 }, { "epoch": 6.53707627118644, "grad_norm": 2.266094923019409, "learning_rate": 6.731594279661018e-06, "loss": 1.6664, "mean_token_accuracy": 0.633212573826313, "num_tokens": 19859873.0, "step": 24684 }, { "epoch": 6.53760593220339, "grad_norm": 2.2274515628814697, "learning_rate": 6.731329449152543e-06, "loss": 1.6043, "mean_token_accuracy": 0.6303247883915901, "num_tokens": 19861676.0, "step": 24686 }, { "epoch": 6.538135593220339, "grad_norm": 1.9896682500839233, "learning_rate": 6.731064618644069e-06, "loss": 0.8375, "mean_token_accuracy": 0.7889060229063034, "num_tokens": 19863170.0, "step": 24688 }, { "epoch": 6.538665254237288, "grad_norm": 2.287525177001953, "learning_rate": 6.7307997881355935e-06, "loss": 1.0726, "mean_token_accuracy": 0.7609840407967567, "num_tokens": 19864652.0, "step": 24690 }, { "epoch": 6.539194915254237, "grad_norm": 2.47802472114563, "learning_rate": 6.730534957627119e-06, "loss": 1.2443, "mean_token_accuracy": 0.6978608667850494, "num_tokens": 19866184.0, "step": 24692 }, { "epoch": 6.539724576271187, "grad_norm": 4.05985164642334, "learning_rate": 6.730270127118644e-06, "loss": 1.167, "mean_token_accuracy": 0.7445101886987686, "num_tokens": 19867652.0, "step": 24694 }, { "epoch": 6.540254237288136, "grad_norm": 2.299760103225708, "learning_rate": 6.73000529661017e-06, "loss": 1.2513, "mean_token_accuracy": 0.6994757130742073, "num_tokens": 19869337.0, "step": 24696 }, { "epoch": 6.540783898305085, "grad_norm": 2.3980555534362793, "learning_rate": 6.729740466101695e-06, "loss": 0.8627, "mean_token_accuracy": 0.769417405128479, "num_tokens": 19870884.0, "step": 24698 }, { "epoch": 6.541313559322034, "grad_norm": 2.398685932159424, "learning_rate": 6.7294756355932216e-06, "loss": 0.8093, "mean_token_accuracy": 0.7919460088014603, "num_tokens": 19872581.0, "step": 24700 }, { "epoch": 6.541843220338983, "grad_norm": 2.0919437408447266, "learning_rate": 6.729210805084746e-06, "loss": 1.0054, "mean_token_accuracy": 0.7617149725556374, "num_tokens": 19874117.0, "step": 24702 }, { "epoch": 6.5423728813559325, "grad_norm": 2.1795499324798584, "learning_rate": 6.728945974576272e-06, "loss": 0.8821, "mean_token_accuracy": 0.7999907210469246, "num_tokens": 19875796.0, "step": 24704 }, { "epoch": 6.5429025423728815, "grad_norm": 4.719942092895508, "learning_rate": 6.728681144067797e-06, "loss": 0.7056, "mean_token_accuracy": 0.8174771964550018, "num_tokens": 19877479.0, "step": 24706 }, { "epoch": 6.5434322033898304, "grad_norm": 2.064122438430786, "learning_rate": 6.728416313559323e-06, "loss": 1.2728, "mean_token_accuracy": 0.7207750901579857, "num_tokens": 19879403.0, "step": 24708 }, { "epoch": 6.543961864406779, "grad_norm": 2.380608320236206, "learning_rate": 6.728151483050848e-06, "loss": 0.978, "mean_token_accuracy": 0.766152948141098, "num_tokens": 19881039.0, "step": 24710 }, { "epoch": 6.544491525423728, "grad_norm": 2.5187160968780518, "learning_rate": 6.727886652542374e-06, "loss": 1.0846, "mean_token_accuracy": 0.7514514029026031, "num_tokens": 19882602.0, "step": 24712 }, { "epoch": 6.545021186440678, "grad_norm": 1.987714409828186, "learning_rate": 6.727621822033899e-06, "loss": 0.7762, "mean_token_accuracy": 0.8078045099973679, "num_tokens": 19884817.0, "step": 24714 }, { "epoch": 6.545550847457627, "grad_norm": 2.280526876449585, "learning_rate": 6.727356991525424e-06, "loss": 1.599, "mean_token_accuracy": 0.646091915667057, "num_tokens": 19886562.0, "step": 24716 }, { "epoch": 6.546080508474576, "grad_norm": 2.3358585834503174, "learning_rate": 6.727092161016949e-06, "loss": 1.1451, "mean_token_accuracy": 0.7585319429636002, "num_tokens": 19888194.0, "step": 24718 }, { "epoch": 6.546610169491525, "grad_norm": 2.2526180744171143, "learning_rate": 6.726827330508475e-06, "loss": 1.3056, "mean_token_accuracy": 0.7025155425071716, "num_tokens": 19889841.0, "step": 24720 }, { "epoch": 6.547139830508475, "grad_norm": 2.379183053970337, "learning_rate": 6.7265625e-06, "loss": 0.9192, "mean_token_accuracy": 0.7937642261385918, "num_tokens": 19891328.0, "step": 24722 }, { "epoch": 6.547669491525424, "grad_norm": 1.9139200448989868, "learning_rate": 6.726297669491526e-06, "loss": 1.1013, "mean_token_accuracy": 0.7372296750545502, "num_tokens": 19893734.0, "step": 24724 }, { "epoch": 6.548199152542373, "grad_norm": 2.677581310272217, "learning_rate": 6.726032838983051e-06, "loss": 1.1555, "mean_token_accuracy": 0.7269608378410339, "num_tokens": 19895294.0, "step": 24726 }, { "epoch": 6.548728813559322, "grad_norm": 2.6635119915008545, "learning_rate": 6.7257680084745765e-06, "loss": 1.4438, "mean_token_accuracy": 0.6670878529548645, "num_tokens": 19896741.0, "step": 24728 }, { "epoch": 6.549258474576272, "grad_norm": 1.9672259092330933, "learning_rate": 6.7255031779661014e-06, "loss": 1.304, "mean_token_accuracy": 0.6975960657000542, "num_tokens": 19898660.0, "step": 24730 }, { "epoch": 6.549788135593221, "grad_norm": 2.2782299518585205, "learning_rate": 6.725238347457628e-06, "loss": 0.9531, "mean_token_accuracy": 0.7723897695541382, "num_tokens": 19900113.0, "step": 24732 }, { "epoch": 6.5503177966101696, "grad_norm": 2.766787528991699, "learning_rate": 6.724973516949153e-06, "loss": 1.4615, "mean_token_accuracy": 0.6570843383669853, "num_tokens": 19901667.0, "step": 24734 }, { "epoch": 6.5508474576271185, "grad_norm": 2.6056666374206543, "learning_rate": 6.724708686440679e-06, "loss": 1.3836, "mean_token_accuracy": 0.7085075750946999, "num_tokens": 19903347.0, "step": 24736 }, { "epoch": 6.5513771186440675, "grad_norm": 2.1118125915527344, "learning_rate": 6.724443855932204e-06, "loss": 1.4235, "mean_token_accuracy": 0.6689592450857162, "num_tokens": 19905155.0, "step": 24738 }, { "epoch": 6.551906779661017, "grad_norm": 2.685331344604492, "learning_rate": 6.7241790254237295e-06, "loss": 1.1014, "mean_token_accuracy": 0.7179027274250984, "num_tokens": 19906650.0, "step": 24740 }, { "epoch": 6.552436440677966, "grad_norm": 1.7771635055541992, "learning_rate": 6.723914194915255e-06, "loss": 1.005, "mean_token_accuracy": 0.7740196138620377, "num_tokens": 19908149.0, "step": 24742 }, { "epoch": 6.552966101694915, "grad_norm": 2.4249746799468994, "learning_rate": 6.72364936440678e-06, "loss": 0.815, "mean_token_accuracy": 0.7854005321860313, "num_tokens": 19909917.0, "step": 24744 }, { "epoch": 6.553495762711864, "grad_norm": 2.7092833518981934, "learning_rate": 6.723384533898306e-06, "loss": 1.1785, "mean_token_accuracy": 0.7136930003762245, "num_tokens": 19911244.0, "step": 24746 }, { "epoch": 6.554025423728813, "grad_norm": 2.601145029067993, "learning_rate": 6.723119703389831e-06, "loss": 1.1765, "mean_token_accuracy": 0.7204293087124825, "num_tokens": 19912847.0, "step": 24748 }, { "epoch": 6.554555084745763, "grad_norm": 2.7226662635803223, "learning_rate": 6.722854872881357e-06, "loss": 1.2811, "step": 24750 }, { "epoch": 6.554555084745763, "eval_loss": 1.3257371187210083, "eval_mean_token_accuracy": 0.7003015564246611, "eval_num_tokens": 19914439.0, "eval_runtime": 48.8526, "eval_samples_per_second": 6.305, "eval_steps_per_second": 6.305, "step": 24750 }, { "epoch": 6.555084745762712, "grad_norm": 2.1817467212677, "learning_rate": 6.722590042372882e-06, "loss": 1.0349, "mean_token_accuracy": 0.7260087169706821, "num_tokens": 19916202.0, "step": 24752 }, { "epoch": 6.555614406779661, "grad_norm": 2.546257972717285, "learning_rate": 6.722325211864407e-06, "loss": 1.4847, "mean_token_accuracy": 0.6533511877059937, "num_tokens": 19917616.0, "step": 24754 }, { "epoch": 6.55614406779661, "grad_norm": 1.9972165822982788, "learning_rate": 6.722060381355932e-06, "loss": 1.278, "mean_token_accuracy": 0.7139533683657646, "num_tokens": 19919422.0, "step": 24756 }, { "epoch": 6.55667372881356, "grad_norm": 2.511579751968384, "learning_rate": 6.721795550847459e-06, "loss": 1.2138, "mean_token_accuracy": 0.709398053586483, "num_tokens": 19920902.0, "step": 24758 }, { "epoch": 6.557203389830509, "grad_norm": 2.735419273376465, "learning_rate": 6.721530720338984e-06, "loss": 1.197, "mean_token_accuracy": 0.7120319530367851, "num_tokens": 19922297.0, "step": 24760 }, { "epoch": 6.557733050847458, "grad_norm": 2.0448405742645264, "learning_rate": 6.72126588983051e-06, "loss": 1.3801, "mean_token_accuracy": 0.6839813813567162, "num_tokens": 19924017.0, "step": 24762 }, { "epoch": 6.558262711864407, "grad_norm": 2.869347095489502, "learning_rate": 6.7210010593220345e-06, "loss": 0.8109, "mean_token_accuracy": 0.7793792262673378, "num_tokens": 19925383.0, "step": 24764 }, { "epoch": 6.558792372881356, "grad_norm": 2.2924952507019043, "learning_rate": 6.72073622881356e-06, "loss": 1.27, "mean_token_accuracy": 0.7196859046816826, "num_tokens": 19926973.0, "step": 24766 }, { "epoch": 6.559322033898305, "grad_norm": 2.257131576538086, "learning_rate": 6.720471398305085e-06, "loss": 1.2077, "mean_token_accuracy": 0.7360324040055275, "num_tokens": 19928530.0, "step": 24768 }, { "epoch": 6.559851694915254, "grad_norm": 2.1315934658050537, "learning_rate": 6.720206567796611e-06, "loss": 1.0759, "mean_token_accuracy": 0.7414267435669899, "num_tokens": 19930027.0, "step": 24770 }, { "epoch": 6.560381355932203, "grad_norm": 2.924551486968994, "learning_rate": 6.719941737288136e-06, "loss": 1.4275, "mean_token_accuracy": 0.7211813852190971, "num_tokens": 19931545.0, "step": 24772 }, { "epoch": 6.560911016949152, "grad_norm": 2.347127676010132, "learning_rate": 6.719676906779662e-06, "loss": 1.2789, "mean_token_accuracy": 0.7038892135024071, "num_tokens": 19933031.0, "step": 24774 }, { "epoch": 6.561440677966102, "grad_norm": 2.505944013595581, "learning_rate": 6.719412076271187e-06, "loss": 1.513, "mean_token_accuracy": 0.6692039594054222, "num_tokens": 19934706.0, "step": 24776 }, { "epoch": 6.561970338983051, "grad_norm": 1.8185895681381226, "learning_rate": 6.7191472457627124e-06, "loss": 0.8739, "mean_token_accuracy": 0.7847214713692665, "num_tokens": 19936300.0, "step": 24778 }, { "epoch": 6.5625, "grad_norm": 2.4890546798706055, "learning_rate": 6.718882415254237e-06, "loss": 1.331, "mean_token_accuracy": 0.6941139549016953, "num_tokens": 19938670.0, "step": 24780 }, { "epoch": 6.563029661016949, "grad_norm": 2.304696798324585, "learning_rate": 6.718617584745763e-06, "loss": 1.4179, "mean_token_accuracy": 0.6575804613530636, "num_tokens": 19940710.0, "step": 24782 }, { "epoch": 6.563559322033898, "grad_norm": 2.5642192363739014, "learning_rate": 6.718352754237288e-06, "loss": 1.3981, "mean_token_accuracy": 0.6818923018872738, "num_tokens": 19942545.0, "step": 24784 }, { "epoch": 6.564088983050848, "grad_norm": 2.876680850982666, "learning_rate": 6.718087923728815e-06, "loss": 1.8426, "mean_token_accuracy": 0.6039439663290977, "num_tokens": 19944018.0, "step": 24786 }, { "epoch": 6.564618644067797, "grad_norm": 2.3370108604431152, "learning_rate": 6.71782309322034e-06, "loss": 1.0204, "mean_token_accuracy": 0.7673961669206619, "num_tokens": 19945224.0, "step": 24788 }, { "epoch": 6.565148305084746, "grad_norm": 2.5543196201324463, "learning_rate": 6.717558262711865e-06, "loss": 1.4757, "mean_token_accuracy": 0.650882251560688, "num_tokens": 19946827.0, "step": 24790 }, { "epoch": 6.565677966101695, "grad_norm": 2.564666509628296, "learning_rate": 6.71729343220339e-06, "loss": 1.1625, "mean_token_accuracy": 0.7274005338549614, "num_tokens": 19948293.0, "step": 24792 }, { "epoch": 6.566207627118644, "grad_norm": 2.32475209236145, "learning_rate": 6.717028601694916e-06, "loss": 0.9546, "mean_token_accuracy": 0.7613294422626495, "num_tokens": 19950233.0, "step": 24794 }, { "epoch": 6.566737288135593, "grad_norm": 2.2020461559295654, "learning_rate": 6.716763771186441e-06, "loss": 1.1381, "mean_token_accuracy": 0.7254687920212746, "num_tokens": 19952244.0, "step": 24796 }, { "epoch": 6.567266949152542, "grad_norm": 1.980218768119812, "learning_rate": 6.716498940677967e-06, "loss": 0.6864, "mean_token_accuracy": 0.810438759624958, "num_tokens": 19953765.0, "step": 24798 }, { "epoch": 6.567796610169491, "grad_norm": 2.950986385345459, "learning_rate": 6.716234110169492e-06, "loss": 1.4969, "mean_token_accuracy": 0.6760146990418434, "num_tokens": 19955405.0, "step": 24800 }, { "epoch": 6.56832627118644, "grad_norm": 2.8731961250305176, "learning_rate": 6.7159692796610175e-06, "loss": 0.856, "mean_token_accuracy": 0.7913490831851959, "num_tokens": 19956740.0, "step": 24802 }, { "epoch": 6.56885593220339, "grad_norm": 2.520035982131958, "learning_rate": 6.7157044491525424e-06, "loss": 1.272, "mean_token_accuracy": 0.6973564773797989, "num_tokens": 19958446.0, "step": 24804 }, { "epoch": 6.569385593220339, "grad_norm": 2.480956792831421, "learning_rate": 6.715439618644068e-06, "loss": 1.1282, "mean_token_accuracy": 0.7326112613081932, "num_tokens": 19960064.0, "step": 24806 }, { "epoch": 6.569915254237288, "grad_norm": 3.6762759685516357, "learning_rate": 6.715174788135593e-06, "loss": 1.4795, "mean_token_accuracy": 0.6638726443052292, "num_tokens": 19961512.0, "step": 24808 }, { "epoch": 6.570444915254237, "grad_norm": 2.419602155685425, "learning_rate": 6.714909957627119e-06, "loss": 0.8831, "mean_token_accuracy": 0.7742944061756134, "num_tokens": 19963142.0, "step": 24810 }, { "epoch": 6.570974576271187, "grad_norm": 2.3671681880950928, "learning_rate": 6.714645127118644e-06, "loss": 0.8675, "mean_token_accuracy": 0.7925632297992706, "num_tokens": 19964748.0, "step": 24812 }, { "epoch": 6.571504237288136, "grad_norm": 2.3585352897644043, "learning_rate": 6.7143802966101705e-06, "loss": 1.2296, "mean_token_accuracy": 0.7107962630689144, "num_tokens": 19966513.0, "step": 24814 }, { "epoch": 6.572033898305085, "grad_norm": 2.3151466846466064, "learning_rate": 6.7141154661016946e-06, "loss": 0.9646, "mean_token_accuracy": 0.7734267860651016, "num_tokens": 19967975.0, "step": 24816 }, { "epoch": 6.572563559322034, "grad_norm": 2.732221841812134, "learning_rate": 6.713850635593221e-06, "loss": 1.8095, "mean_token_accuracy": 0.6063771545886993, "num_tokens": 19969658.0, "step": 24818 }, { "epoch": 6.573093220338983, "grad_norm": 2.1695141792297363, "learning_rate": 6.713585805084746e-06, "loss": 1.2223, "mean_token_accuracy": 0.703895702958107, "num_tokens": 19971485.0, "step": 24820 }, { "epoch": 6.5736228813559325, "grad_norm": 2.1432361602783203, "learning_rate": 6.713320974576272e-06, "loss": 1.2149, "mean_token_accuracy": 0.7099840342998505, "num_tokens": 19972988.0, "step": 24822 }, { "epoch": 6.5741525423728815, "grad_norm": 2.5781307220458984, "learning_rate": 6.713056144067798e-06, "loss": 1.7305, "mean_token_accuracy": 0.6359815448522568, "num_tokens": 19974425.0, "step": 24824 }, { "epoch": 6.5746822033898304, "grad_norm": 2.6905858516693115, "learning_rate": 6.712791313559323e-06, "loss": 1.4479, "mean_token_accuracy": 0.7112211138010025, "num_tokens": 19976122.0, "step": 24826 }, { "epoch": 6.575211864406779, "grad_norm": 2.592650890350342, "learning_rate": 6.712526483050848e-06, "loss": 1.1741, "mean_token_accuracy": 0.6983543261885643, "num_tokens": 19977864.0, "step": 24828 }, { "epoch": 6.575741525423728, "grad_norm": 2.2672719955444336, "learning_rate": 6.712261652542373e-06, "loss": 1.7914, "mean_token_accuracy": 0.6174103394150734, "num_tokens": 19979601.0, "step": 24830 }, { "epoch": 6.576271186440678, "grad_norm": 2.523833751678467, "learning_rate": 6.711996822033899e-06, "loss": 1.1117, "mean_token_accuracy": 0.7467478811740875, "num_tokens": 19980977.0, "step": 24832 }, { "epoch": 6.576800847457627, "grad_norm": 2.1994879245758057, "learning_rate": 6.711731991525424e-06, "loss": 0.8736, "mean_token_accuracy": 0.766793929040432, "num_tokens": 19982734.0, "step": 24834 }, { "epoch": 6.577330508474576, "grad_norm": 2.656026840209961, "learning_rate": 6.71146716101695e-06, "loss": 1.4841, "mean_token_accuracy": 0.6817384138703346, "num_tokens": 19984226.0, "step": 24836 }, { "epoch": 6.577860169491525, "grad_norm": 2.0357580184936523, "learning_rate": 6.711202330508475e-06, "loss": 0.9087, "mean_token_accuracy": 0.7563949227333069, "num_tokens": 19986498.0, "step": 24838 }, { "epoch": 6.578389830508475, "grad_norm": 2.4129223823547363, "learning_rate": 6.710937500000001e-06, "loss": 1.0346, "mean_token_accuracy": 0.738824799656868, "num_tokens": 19988308.0, "step": 24840 }, { "epoch": 6.578919491525424, "grad_norm": 2.3484132289886475, "learning_rate": 6.710672669491526e-06, "loss": 1.3756, "mean_token_accuracy": 0.6969193182885647, "num_tokens": 19990917.0, "step": 24842 }, { "epoch": 6.579449152542373, "grad_norm": 1.9007481336593628, "learning_rate": 6.710407838983052e-06, "loss": 0.7991, "mean_token_accuracy": 0.8048346564173698, "num_tokens": 19992485.0, "step": 24844 }, { "epoch": 6.579978813559322, "grad_norm": 2.370497703552246, "learning_rate": 6.710143008474577e-06, "loss": 1.4392, "mean_token_accuracy": 0.678764134645462, "num_tokens": 19994339.0, "step": 24846 }, { "epoch": 6.580508474576272, "grad_norm": 2.4904959201812744, "learning_rate": 6.709878177966103e-06, "loss": 1.1371, "mean_token_accuracy": 0.7528118267655373, "num_tokens": 19995924.0, "step": 24848 }, { "epoch": 6.581038135593221, "grad_norm": 2.8787145614624023, "learning_rate": 6.709613347457628e-06, "loss": 1.4795, "mean_token_accuracy": 0.6620509326457977, "num_tokens": 19997544.0, "step": 24850 }, { "epoch": 6.5815677966101696, "grad_norm": 2.734208345413208, "learning_rate": 6.7093485169491535e-06, "loss": 1.2456, "mean_token_accuracy": 0.7107342705130577, "num_tokens": 19999117.0, "step": 24852 }, { "epoch": 6.5820974576271185, "grad_norm": 3.0726447105407715, "learning_rate": 6.709083686440678e-06, "loss": 1.1241, "mean_token_accuracy": 0.715149387717247, "num_tokens": 20000737.0, "step": 24854 }, { "epoch": 6.5826271186440675, "grad_norm": 2.5346591472625732, "learning_rate": 6.708818855932204e-06, "loss": 0.7447, "mean_token_accuracy": 0.8057663291692734, "num_tokens": 20002271.0, "step": 24856 }, { "epoch": 6.583156779661017, "grad_norm": 1.6452018022537231, "learning_rate": 6.708554025423729e-06, "loss": 1.0555, "mean_token_accuracy": 0.7066820487380028, "num_tokens": 20004513.0, "step": 24858 }, { "epoch": 6.583686440677966, "grad_norm": 2.5453426837921143, "learning_rate": 6.708289194915255e-06, "loss": 0.7985, "mean_token_accuracy": 0.799977034330368, "num_tokens": 20005957.0, "step": 24860 }, { "epoch": 6.584216101694915, "grad_norm": 2.525085687637329, "learning_rate": 6.70802436440678e-06, "loss": 1.2545, "mean_token_accuracy": 0.7177928388118744, "num_tokens": 20007496.0, "step": 24862 }, { "epoch": 6.584745762711864, "grad_norm": 1.9024879932403564, "learning_rate": 6.7077595338983056e-06, "loss": 1.0976, "mean_token_accuracy": 0.7488704174757004, "num_tokens": 20009157.0, "step": 24864 }, { "epoch": 6.585275423728813, "grad_norm": 2.3006598949432373, "learning_rate": 6.7074947033898305e-06, "loss": 1.1781, "mean_token_accuracy": 0.7162793204188347, "num_tokens": 20010992.0, "step": 24866 }, { "epoch": 6.585805084745763, "grad_norm": 2.439939260482788, "learning_rate": 6.707229872881357e-06, "loss": 1.0361, "mean_token_accuracy": 0.741305761039257, "num_tokens": 20012617.0, "step": 24868 }, { "epoch": 6.586334745762712, "grad_norm": 2.552082061767578, "learning_rate": 6.706965042372881e-06, "loss": 1.3733, "mean_token_accuracy": 0.662498701363802, "num_tokens": 20014708.0, "step": 24870 }, { "epoch": 6.586864406779661, "grad_norm": 2.3509249687194824, "learning_rate": 6.706700211864408e-06, "loss": 1.1022, "mean_token_accuracy": 0.7438951879739761, "num_tokens": 20016298.0, "step": 24872 }, { "epoch": 6.58739406779661, "grad_norm": 2.9059174060821533, "learning_rate": 6.706435381355933e-06, "loss": 1.3655, "mean_token_accuracy": 0.7257672846317291, "num_tokens": 20017822.0, "step": 24874 }, { "epoch": 6.58792372881356, "grad_norm": 2.532674551010132, "learning_rate": 6.7061705508474585e-06, "loss": 1.0019, "mean_token_accuracy": 0.7431192398071289, "num_tokens": 20019262.0, "step": 24876 }, { "epoch": 6.588453389830509, "grad_norm": 2.385599374771118, "learning_rate": 6.7059057203389835e-06, "loss": 0.9922, "mean_token_accuracy": 0.7537888884544373, "num_tokens": 20020841.0, "step": 24878 }, { "epoch": 6.588983050847458, "grad_norm": 2.4005603790283203, "learning_rate": 6.705640889830509e-06, "loss": 1.4155, "mean_token_accuracy": 0.6874145641922951, "num_tokens": 20022700.0, "step": 24880 }, { "epoch": 6.589512711864407, "grad_norm": 2.3725500106811523, "learning_rate": 6.705376059322034e-06, "loss": 1.2432, "mean_token_accuracy": 0.7249978706240654, "num_tokens": 20024613.0, "step": 24882 }, { "epoch": 6.590042372881356, "grad_norm": 2.0855417251586914, "learning_rate": 6.70511122881356e-06, "loss": 1.0217, "mean_token_accuracy": 0.7576055750250816, "num_tokens": 20026527.0, "step": 24884 }, { "epoch": 6.590572033898305, "grad_norm": 2.7032434940338135, "learning_rate": 6.704846398305085e-06, "loss": 1.8838, "mean_token_accuracy": 0.5850730016827583, "num_tokens": 20028206.0, "step": 24886 }, { "epoch": 6.591101694915254, "grad_norm": 2.6834001541137695, "learning_rate": 6.704581567796611e-06, "loss": 0.8227, "mean_token_accuracy": 0.8018590435385704, "num_tokens": 20029431.0, "step": 24888 }, { "epoch": 6.591631355932203, "grad_norm": 2.335848331451416, "learning_rate": 6.704316737288136e-06, "loss": 1.0394, "mean_token_accuracy": 0.7445923015475273, "num_tokens": 20031017.0, "step": 24890 }, { "epoch": 6.592161016949152, "grad_norm": 2.5970213413238525, "learning_rate": 6.704051906779661e-06, "loss": 1.3969, "mean_token_accuracy": 0.6970225125551224, "num_tokens": 20032428.0, "step": 24892 }, { "epoch": 6.592690677966102, "grad_norm": 2.8411948680877686, "learning_rate": 6.703787076271186e-06, "loss": 1.3204, "mean_token_accuracy": 0.7200952991843224, "num_tokens": 20033737.0, "step": 24894 }, { "epoch": 6.593220338983051, "grad_norm": 2.420088768005371, "learning_rate": 6.703522245762713e-06, "loss": 1.0536, "mean_token_accuracy": 0.7478614896535873, "num_tokens": 20035397.0, "step": 24896 }, { "epoch": 6.59375, "grad_norm": 2.1555705070495605, "learning_rate": 6.703257415254237e-06, "loss": 1.1445, "mean_token_accuracy": 0.726003885269165, "num_tokens": 20037217.0, "step": 24898 }, { "epoch": 6.594279661016949, "grad_norm": 2.233236074447632, "learning_rate": 6.702992584745764e-06, "loss": 1.4217, "mean_token_accuracy": 0.6757506430149078, "num_tokens": 20038663.0, "step": 24900 }, { "epoch": 6.594809322033898, "grad_norm": 2.70760178565979, "learning_rate": 6.7027277542372885e-06, "loss": 1.1805, "mean_token_accuracy": 0.7423253282904625, "num_tokens": 20040269.0, "step": 24902 }, { "epoch": 6.595338983050848, "grad_norm": 2.3458218574523926, "learning_rate": 6.702462923728814e-06, "loss": 1.1546, "mean_token_accuracy": 0.721253864467144, "num_tokens": 20041833.0, "step": 24904 }, { "epoch": 6.595868644067797, "grad_norm": 2.3494796752929688, "learning_rate": 6.702198093220339e-06, "loss": 0.8343, "mean_token_accuracy": 0.7890680059790611, "num_tokens": 20043444.0, "step": 24906 }, { "epoch": 6.596398305084746, "grad_norm": 2.1933484077453613, "learning_rate": 6.701933262711865e-06, "loss": 1.1813, "mean_token_accuracy": 0.7201206311583519, "num_tokens": 20045223.0, "step": 24908 }, { "epoch": 6.596927966101695, "grad_norm": 2.1114213466644287, "learning_rate": 6.701668432203391e-06, "loss": 0.9124, "mean_token_accuracy": 0.7932978421449661, "num_tokens": 20046558.0, "step": 24910 }, { "epoch": 6.597457627118644, "grad_norm": 2.050915002822876, "learning_rate": 6.701403601694916e-06, "loss": 1.2037, "mean_token_accuracy": 0.7216536030173302, "num_tokens": 20048190.0, "step": 24912 }, { "epoch": 6.597987288135593, "grad_norm": 2.5757052898406982, "learning_rate": 6.7011387711864415e-06, "loss": 1.3555, "mean_token_accuracy": 0.6898323446512222, "num_tokens": 20049592.0, "step": 24914 }, { "epoch": 6.598516949152542, "grad_norm": 2.1533796787261963, "learning_rate": 6.7008739406779664e-06, "loss": 1.3439, "mean_token_accuracy": 0.6971654668450356, "num_tokens": 20051416.0, "step": 24916 }, { "epoch": 6.599046610169491, "grad_norm": 2.335914134979248, "learning_rate": 6.700609110169492e-06, "loss": 1.4893, "mean_token_accuracy": 0.6484687179327011, "num_tokens": 20053091.0, "step": 24918 }, { "epoch": 6.59957627118644, "grad_norm": 1.9547909498214722, "learning_rate": 6.700344279661017e-06, "loss": 0.9034, "mean_token_accuracy": 0.8059495687484741, "num_tokens": 20054929.0, "step": 24920 }, { "epoch": 6.60010593220339, "grad_norm": 2.658858060836792, "learning_rate": 6.700079449152544e-06, "loss": 1.1988, "mean_token_accuracy": 0.7127829678356647, "num_tokens": 20056545.0, "step": 24922 }, { "epoch": 6.600635593220339, "grad_norm": 2.3275394439697266, "learning_rate": 6.699814618644068e-06, "loss": 1.5659, "mean_token_accuracy": 0.6671236231923103, "num_tokens": 20058354.0, "step": 24924 }, { "epoch": 6.601165254237288, "grad_norm": 2.520275592803955, "learning_rate": 6.6995497881355945e-06, "loss": 1.3968, "mean_token_accuracy": 0.6883165314793587, "num_tokens": 20060045.0, "step": 24926 }, { "epoch": 6.601694915254237, "grad_norm": 2.1517832279205322, "learning_rate": 6.699284957627119e-06, "loss": 1.1996, "mean_token_accuracy": 0.7253214940428734, "num_tokens": 20061836.0, "step": 24928 }, { "epoch": 6.602224576271187, "grad_norm": 3.1129395961761475, "learning_rate": 6.699020127118645e-06, "loss": 1.2601, "mean_token_accuracy": 0.7143731489777565, "num_tokens": 20063433.0, "step": 24930 }, { "epoch": 6.602754237288136, "grad_norm": 2.4001545906066895, "learning_rate": 6.69875529661017e-06, "loss": 1.2967, "mean_token_accuracy": 0.712724082171917, "num_tokens": 20065279.0, "step": 24932 }, { "epoch": 6.603283898305085, "grad_norm": 2.63670015335083, "learning_rate": 6.698490466101696e-06, "loss": 1.4559, "mean_token_accuracy": 0.6832670122385025, "num_tokens": 20066927.0, "step": 24934 }, { "epoch": 6.603813559322034, "grad_norm": 3.080981492996216, "learning_rate": 6.698225635593221e-06, "loss": 1.0928, "mean_token_accuracy": 0.7339693754911423, "num_tokens": 20068226.0, "step": 24936 }, { "epoch": 6.604343220338983, "grad_norm": 2.3948044776916504, "learning_rate": 6.697960805084747e-06, "loss": 1.1786, "mean_token_accuracy": 0.7184118181467056, "num_tokens": 20069610.0, "step": 24938 }, { "epoch": 6.6048728813559325, "grad_norm": 1.8568801879882812, "learning_rate": 6.6976959745762715e-06, "loss": 1.052, "mean_token_accuracy": 0.7076810076832771, "num_tokens": 20071840.0, "step": 24940 }, { "epoch": 6.6054025423728815, "grad_norm": 2.0752193927764893, "learning_rate": 6.697431144067797e-06, "loss": 0.865, "mean_token_accuracy": 0.7728783413767815, "num_tokens": 20073792.0, "step": 24942 }, { "epoch": 6.6059322033898304, "grad_norm": 2.68534255027771, "learning_rate": 6.697166313559322e-06, "loss": 1.0765, "mean_token_accuracy": 0.7432465478777885, "num_tokens": 20075278.0, "step": 24944 }, { "epoch": 6.606461864406779, "grad_norm": 2.6159861087799072, "learning_rate": 6.696901483050848e-06, "loss": 1.1071, "mean_token_accuracy": 0.7416097596287727, "num_tokens": 20077021.0, "step": 24946 }, { "epoch": 6.606991525423728, "grad_norm": 2.0624773502349854, "learning_rate": 6.696636652542373e-06, "loss": 0.931, "mean_token_accuracy": 0.785607673227787, "num_tokens": 20078678.0, "step": 24948 }, { "epoch": 6.607521186440678, "grad_norm": 2.2140369415283203, "learning_rate": 6.6963718220338995e-06, "loss": 1.0258, "mean_token_accuracy": 0.7505717426538467, "num_tokens": 20080315.0, "step": 24950 }, { "epoch": 6.608050847457627, "grad_norm": 2.086606740951538, "learning_rate": 6.696106991525424e-06, "loss": 1.0928, "mean_token_accuracy": 0.7487890347838402, "num_tokens": 20082205.0, "step": 24952 }, { "epoch": 6.608580508474576, "grad_norm": 1.9496527910232544, "learning_rate": 6.69584216101695e-06, "loss": 0.999, "mean_token_accuracy": 0.7480105310678482, "num_tokens": 20083941.0, "step": 24954 }, { "epoch": 6.609110169491525, "grad_norm": 2.2776591777801514, "learning_rate": 6.695577330508475e-06, "loss": 1.3214, "mean_token_accuracy": 0.7131308987736702, "num_tokens": 20085623.0, "step": 24956 }, { "epoch": 6.609639830508475, "grad_norm": 3.2060818672180176, "learning_rate": 6.695312500000001e-06, "loss": 1.4113, "mean_token_accuracy": 0.6834590658545494, "num_tokens": 20087166.0, "step": 24958 }, { "epoch": 6.610169491525424, "grad_norm": 2.226112127304077, "learning_rate": 6.695047669491526e-06, "loss": 0.9386, "mean_token_accuracy": 0.7804563790559769, "num_tokens": 20088794.0, "step": 24960 }, { "epoch": 6.610699152542373, "grad_norm": 2.2962751388549805, "learning_rate": 6.694782838983052e-06, "loss": 1.0103, "mean_token_accuracy": 0.7578595280647278, "num_tokens": 20090408.0, "step": 24962 }, { "epoch": 6.611228813559322, "grad_norm": 2.1656410694122314, "learning_rate": 6.694518008474577e-06, "loss": 0.777, "mean_token_accuracy": 0.7904954478144646, "num_tokens": 20092056.0, "step": 24964 }, { "epoch": 6.611758474576272, "grad_norm": 2.5193886756896973, "learning_rate": 6.694253177966102e-06, "loss": 1.1453, "mean_token_accuracy": 0.727564349770546, "num_tokens": 20093359.0, "step": 24966 }, { "epoch": 6.612288135593221, "grad_norm": 2.3110201358795166, "learning_rate": 6.693988347457627e-06, "loss": 1.1816, "mean_token_accuracy": 0.7360818833112717, "num_tokens": 20094848.0, "step": 24968 }, { "epoch": 6.6128177966101696, "grad_norm": 1.9607168436050415, "learning_rate": 6.693723516949153e-06, "loss": 0.9107, "mean_token_accuracy": 0.7766410112380981, "num_tokens": 20096672.0, "step": 24970 }, { "epoch": 6.6133474576271185, "grad_norm": 2.3499934673309326, "learning_rate": 6.693458686440678e-06, "loss": 0.8566, "mean_token_accuracy": 0.775872677564621, "num_tokens": 20098455.0, "step": 24972 }, { "epoch": 6.6138771186440675, "grad_norm": 2.7592129707336426, "learning_rate": 6.693193855932204e-06, "loss": 1.0868, "mean_token_accuracy": 0.7412998229265213, "num_tokens": 20099859.0, "step": 24974 }, { "epoch": 6.614406779661017, "grad_norm": 1.8652822971343994, "learning_rate": 6.692929025423729e-06, "loss": 0.9864, "mean_token_accuracy": 0.7509328275918961, "num_tokens": 20101519.0, "step": 24976 }, { "epoch": 6.614936440677966, "grad_norm": 2.3502159118652344, "learning_rate": 6.6926641949152545e-06, "loss": 1.1967, "mean_token_accuracy": 0.7335669472813606, "num_tokens": 20103131.0, "step": 24978 }, { "epoch": 6.615466101694915, "grad_norm": 2.5625548362731934, "learning_rate": 6.692399364406779e-06, "loss": 0.995, "mean_token_accuracy": 0.7564895004034042, "num_tokens": 20104792.0, "step": 24980 }, { "epoch": 6.615995762711864, "grad_norm": 2.1678214073181152, "learning_rate": 6.692134533898306e-06, "loss": 1.4317, "mean_token_accuracy": 0.6576035842299461, "num_tokens": 20106538.0, "step": 24982 }, { "epoch": 6.616525423728813, "grad_norm": 2.2460479736328125, "learning_rate": 6.691869703389831e-06, "loss": 0.911, "mean_token_accuracy": 0.7634245455265045, "num_tokens": 20108162.0, "step": 24984 }, { "epoch": 6.617055084745763, "grad_norm": 2.3507328033447266, "learning_rate": 6.691604872881357e-06, "loss": 1.3971, "mean_token_accuracy": 0.6747628524899483, "num_tokens": 20109812.0, "step": 24986 }, { "epoch": 6.617584745762712, "grad_norm": 2.4001142978668213, "learning_rate": 6.691340042372882e-06, "loss": 1.46, "mean_token_accuracy": 0.6823538020253181, "num_tokens": 20111484.0, "step": 24988 }, { "epoch": 6.618114406779661, "grad_norm": 2.4820919036865234, "learning_rate": 6.6910752118644074e-06, "loss": 1.4975, "mean_token_accuracy": 0.6725720837712288, "num_tokens": 20113123.0, "step": 24990 }, { "epoch": 6.61864406779661, "grad_norm": 2.954503297805786, "learning_rate": 6.690810381355933e-06, "loss": 1.3524, "mean_token_accuracy": 0.6699006110429764, "num_tokens": 20114498.0, "step": 24992 }, { "epoch": 6.61917372881356, "grad_norm": 2.4409992694854736, "learning_rate": 6.690545550847458e-06, "loss": 1.1117, "mean_token_accuracy": 0.7471556887030602, "num_tokens": 20116228.0, "step": 24994 }, { "epoch": 6.619703389830509, "grad_norm": 3.1050806045532227, "learning_rate": 6.690280720338984e-06, "loss": 1.2736, "mean_token_accuracy": 0.7130538001656532, "num_tokens": 20117656.0, "step": 24996 }, { "epoch": 6.620233050847458, "grad_norm": 1.8176956176757812, "learning_rate": 6.690015889830509e-06, "loss": 0.7615, "mean_token_accuracy": 0.7920890673995018, "num_tokens": 20119265.0, "step": 24998 }, { "epoch": 6.620762711864407, "grad_norm": 2.7548718452453613, "learning_rate": 6.689751059322035e-06, "loss": 1.1021, "step": 25000 }, { "epoch": 6.620762711864407, "eval_loss": 1.326951503753662, "eval_mean_token_accuracy": 0.6999275568243745, "eval_num_tokens": 20120645.0, "eval_runtime": 48.1077, "eval_samples_per_second": 6.402, "eval_steps_per_second": 6.402, "step": 25000 }, { "epoch": 6.621292372881356, "grad_norm": 2.3055641651153564, "learning_rate": 6.6894862288135596e-06, "loss": 1.2223, "mean_token_accuracy": 0.7121197581291199, "num_tokens": 20122196.0, "step": 25002 }, { "epoch": 6.621822033898305, "grad_norm": 2.336411237716675, "learning_rate": 6.689221398305086e-06, "loss": 0.9162, "mean_token_accuracy": 0.7657889500260353, "num_tokens": 20123610.0, "step": 25004 }, { "epoch": 6.622351694915254, "grad_norm": 2.3478236198425293, "learning_rate": 6.68895656779661e-06, "loss": 1.1312, "mean_token_accuracy": 0.7113728299736977, "num_tokens": 20125031.0, "step": 25006 }, { "epoch": 6.622881355932203, "grad_norm": 2.4545507431030273, "learning_rate": 6.688691737288137e-06, "loss": 1.1378, "mean_token_accuracy": 0.7525385096669197, "num_tokens": 20126620.0, "step": 25008 }, { "epoch": 6.623411016949152, "grad_norm": 2.5328738689422607, "learning_rate": 6.688426906779662e-06, "loss": 1.0386, "mean_token_accuracy": 0.7302205115556717, "num_tokens": 20128248.0, "step": 25010 }, { "epoch": 6.623940677966102, "grad_norm": 2.84554386138916, "learning_rate": 6.688162076271188e-06, "loss": 1.3053, "mean_token_accuracy": 0.710597574710846, "num_tokens": 20129822.0, "step": 25012 }, { "epoch": 6.624470338983051, "grad_norm": 2.794879674911499, "learning_rate": 6.6878972457627125e-06, "loss": 1.4927, "mean_token_accuracy": 0.6696497350931168, "num_tokens": 20131317.0, "step": 25014 }, { "epoch": 6.625, "grad_norm": 1.7037559747695923, "learning_rate": 6.687632415254238e-06, "loss": 0.9626, "mean_token_accuracy": 0.754986435174942, "num_tokens": 20133074.0, "step": 25016 }, { "epoch": 6.625529661016949, "grad_norm": 1.9061187505722046, "learning_rate": 6.687367584745763e-06, "loss": 0.737, "mean_token_accuracy": 0.8106087744235992, "num_tokens": 20134988.0, "step": 25018 }, { "epoch": 6.626059322033898, "grad_norm": 2.231879472732544, "learning_rate": 6.687102754237289e-06, "loss": 1.1611, "mean_token_accuracy": 0.7273093909025192, "num_tokens": 20136680.0, "step": 25020 }, { "epoch": 6.626588983050848, "grad_norm": 2.207960844039917, "learning_rate": 6.686837923728814e-06, "loss": 1.1516, "mean_token_accuracy": 0.7436711341142654, "num_tokens": 20138101.0, "step": 25022 }, { "epoch": 6.627118644067797, "grad_norm": 2.6016271114349365, "learning_rate": 6.68657309322034e-06, "loss": 1.3821, "mean_token_accuracy": 0.6825161948800087, "num_tokens": 20139808.0, "step": 25024 }, { "epoch": 6.627648305084746, "grad_norm": 2.412285566329956, "learning_rate": 6.686308262711865e-06, "loss": 1.1455, "mean_token_accuracy": 0.7272519171237946, "num_tokens": 20141330.0, "step": 25026 }, { "epoch": 6.628177966101695, "grad_norm": 2.479968547821045, "learning_rate": 6.68604343220339e-06, "loss": 1.1818, "mean_token_accuracy": 0.7388468086719513, "num_tokens": 20143171.0, "step": 25028 }, { "epoch": 6.628707627118644, "grad_norm": 2.5339102745056152, "learning_rate": 6.685778601694915e-06, "loss": 0.9741, "mean_token_accuracy": 0.7553707361221313, "num_tokens": 20144620.0, "step": 25030 }, { "epoch": 6.629237288135593, "grad_norm": 2.054238796234131, "learning_rate": 6.685513771186441e-06, "loss": 1.085, "mean_token_accuracy": 0.7442224249243736, "num_tokens": 20146622.0, "step": 25032 }, { "epoch": 6.629766949152542, "grad_norm": 2.524902582168579, "learning_rate": 6.685248940677966e-06, "loss": 1.1685, "mean_token_accuracy": 0.7414325699210167, "num_tokens": 20148035.0, "step": 25034 }, { "epoch": 6.630296610169491, "grad_norm": 2.495088815689087, "learning_rate": 6.684984110169493e-06, "loss": 1.3536, "mean_token_accuracy": 0.6968519911170006, "num_tokens": 20149669.0, "step": 25036 }, { "epoch": 6.63082627118644, "grad_norm": 2.3093111515045166, "learning_rate": 6.684719279661018e-06, "loss": 1.021, "mean_token_accuracy": 0.7573652639985085, "num_tokens": 20151286.0, "step": 25038 }, { "epoch": 6.63135593220339, "grad_norm": 1.8749316930770874, "learning_rate": 6.684454449152543e-06, "loss": 1.2489, "mean_token_accuracy": 0.7122698426246643, "num_tokens": 20153015.0, "step": 25040 }, { "epoch": 6.631885593220339, "grad_norm": 2.6123414039611816, "learning_rate": 6.684189618644068e-06, "loss": 1.1167, "mean_token_accuracy": 0.7427853643894196, "num_tokens": 20154616.0, "step": 25042 }, { "epoch": 6.632415254237288, "grad_norm": 2.645824670791626, "learning_rate": 6.683924788135594e-06, "loss": 1.5607, "mean_token_accuracy": 0.6474237218499184, "num_tokens": 20155959.0, "step": 25044 }, { "epoch": 6.632944915254237, "grad_norm": 2.5728564262390137, "learning_rate": 6.683659957627119e-06, "loss": 1.0108, "mean_token_accuracy": 0.7431161925196648, "num_tokens": 20157509.0, "step": 25046 }, { "epoch": 6.633474576271187, "grad_norm": 2.6828207969665527, "learning_rate": 6.683395127118645e-06, "loss": 1.7823, "mean_token_accuracy": 0.6093178316950798, "num_tokens": 20159053.0, "step": 25048 }, { "epoch": 6.634004237288136, "grad_norm": 2.2827014923095703, "learning_rate": 6.68313029661017e-06, "loss": 1.2061, "mean_token_accuracy": 0.6988677904009819, "num_tokens": 20160832.0, "step": 25050 }, { "epoch": 6.634533898305085, "grad_norm": 2.9789962768554688, "learning_rate": 6.6828654661016955e-06, "loss": 1.6864, "mean_token_accuracy": 0.6314464844763279, "num_tokens": 20162375.0, "step": 25052 }, { "epoch": 6.635063559322034, "grad_norm": 2.6545047760009766, "learning_rate": 6.6826006355932204e-06, "loss": 1.0924, "mean_token_accuracy": 0.7385692670941353, "num_tokens": 20163788.0, "step": 25054 }, { "epoch": 6.635593220338983, "grad_norm": 2.428070545196533, "learning_rate": 6.682335805084746e-06, "loss": 1.0888, "mean_token_accuracy": 0.749988280236721, "num_tokens": 20165263.0, "step": 25056 }, { "epoch": 6.6361228813559325, "grad_norm": 2.5372204780578613, "learning_rate": 6.682070974576271e-06, "loss": 1.48, "mean_token_accuracy": 0.6587218567728996, "num_tokens": 20166920.0, "step": 25058 }, { "epoch": 6.6366525423728815, "grad_norm": 2.3581225872039795, "learning_rate": 6.681806144067797e-06, "loss": 1.3127, "mean_token_accuracy": 0.6898650228977203, "num_tokens": 20168437.0, "step": 25060 }, { "epoch": 6.6371822033898304, "grad_norm": 2.1665475368499756, "learning_rate": 6.681541313559322e-06, "loss": 0.9994, "mean_token_accuracy": 0.7493437975645065, "num_tokens": 20170232.0, "step": 25062 }, { "epoch": 6.637711864406779, "grad_norm": 2.28584361076355, "learning_rate": 6.6812764830508485e-06, "loss": 1.1982, "mean_token_accuracy": 0.6971635520458221, "num_tokens": 20172047.0, "step": 25064 }, { "epoch": 6.638241525423728, "grad_norm": 2.3842999935150146, "learning_rate": 6.6810116525423725e-06, "loss": 0.788, "mean_token_accuracy": 0.7922475710511208, "num_tokens": 20173636.0, "step": 25066 }, { "epoch": 6.638771186440678, "grad_norm": 3.2480454444885254, "learning_rate": 6.680746822033899e-06, "loss": 1.1407, "mean_token_accuracy": 0.7185898274183273, "num_tokens": 20175239.0, "step": 25068 }, { "epoch": 6.639300847457627, "grad_norm": 3.094416618347168, "learning_rate": 6.680481991525424e-06, "loss": 1.2847, "mean_token_accuracy": 0.7009207680821419, "num_tokens": 20176640.0, "step": 25070 }, { "epoch": 6.639830508474576, "grad_norm": 2.746530294418335, "learning_rate": 6.68021716101695e-06, "loss": 1.5692, "mean_token_accuracy": 0.6315515413880348, "num_tokens": 20178253.0, "step": 25072 }, { "epoch": 6.640360169491525, "grad_norm": 3.407386302947998, "learning_rate": 6.679952330508475e-06, "loss": 0.7424, "mean_token_accuracy": 0.8095996379852295, "num_tokens": 20179698.0, "step": 25074 }, { "epoch": 6.640889830508475, "grad_norm": 2.1880240440368652, "learning_rate": 6.679687500000001e-06, "loss": 0.8654, "mean_token_accuracy": 0.7933344915509224, "num_tokens": 20181150.0, "step": 25076 }, { "epoch": 6.641419491525424, "grad_norm": 2.297302007675171, "learning_rate": 6.679422669491526e-06, "loss": 1.0525, "mean_token_accuracy": 0.7296255193650723, "num_tokens": 20183049.0, "step": 25078 }, { "epoch": 6.641949152542373, "grad_norm": 2.135948657989502, "learning_rate": 6.679157838983051e-06, "loss": 0.8962, "mean_token_accuracy": 0.775950163602829, "num_tokens": 20184350.0, "step": 25080 }, { "epoch": 6.642478813559322, "grad_norm": 2.4615180492401123, "learning_rate": 6.678893008474577e-06, "loss": 1.3655, "mean_token_accuracy": 0.6717891320586205, "num_tokens": 20185952.0, "step": 25082 }, { "epoch": 6.643008474576272, "grad_norm": 2.751420259475708, "learning_rate": 6.678628177966102e-06, "loss": 1.1084, "mean_token_accuracy": 0.730587974190712, "num_tokens": 20187475.0, "step": 25084 }, { "epoch": 6.643538135593221, "grad_norm": 1.9909741878509521, "learning_rate": 6.678363347457628e-06, "loss": 0.789, "mean_token_accuracy": 0.7774432525038719, "num_tokens": 20189213.0, "step": 25086 }, { "epoch": 6.6440677966101696, "grad_norm": 3.27071213722229, "learning_rate": 6.678098516949153e-06, "loss": 1.2292, "mean_token_accuracy": 0.6902708634734154, "num_tokens": 20191040.0, "step": 25088 }, { "epoch": 6.6445974576271185, "grad_norm": 2.4226715564727783, "learning_rate": 6.677833686440679e-06, "loss": 0.9211, "mean_token_accuracy": 0.7753208503127098, "num_tokens": 20192306.0, "step": 25090 }, { "epoch": 6.6451271186440675, "grad_norm": 2.0465445518493652, "learning_rate": 6.677568855932204e-06, "loss": 0.8261, "mean_token_accuracy": 0.7912043780088425, "num_tokens": 20193740.0, "step": 25092 }, { "epoch": 6.645656779661017, "grad_norm": 2.5966241359710693, "learning_rate": 6.67730402542373e-06, "loss": 0.7502, "mean_token_accuracy": 0.7962956801056862, "num_tokens": 20195190.0, "step": 25094 }, { "epoch": 6.646186440677966, "grad_norm": 2.506528377532959, "learning_rate": 6.677039194915255e-06, "loss": 1.4634, "mean_token_accuracy": 0.7060616314411163, "num_tokens": 20196670.0, "step": 25096 }, { "epoch": 6.646716101694915, "grad_norm": 2.0999131202697754, "learning_rate": 6.676774364406781e-06, "loss": 1.1105, "mean_token_accuracy": 0.7285720854997635, "num_tokens": 20198141.0, "step": 25098 }, { "epoch": 6.647245762711864, "grad_norm": 2.8742544651031494, "learning_rate": 6.676509533898306e-06, "loss": 1.2266, "mean_token_accuracy": 0.7065620049834251, "num_tokens": 20199930.0, "step": 25100 }, { "epoch": 6.647775423728813, "grad_norm": 2.463240385055542, "learning_rate": 6.6762447033898314e-06, "loss": 1.3812, "mean_token_accuracy": 0.6859535425901413, "num_tokens": 20201573.0, "step": 25102 }, { "epoch": 6.648305084745763, "grad_norm": 2.378323793411255, "learning_rate": 6.675979872881356e-06, "loss": 1.1353, "mean_token_accuracy": 0.7383778095245361, "num_tokens": 20203090.0, "step": 25104 }, { "epoch": 6.648834745762712, "grad_norm": 2.072554349899292, "learning_rate": 6.675715042372882e-06, "loss": 0.9551, "mean_token_accuracy": 0.7789704129099846, "num_tokens": 20204798.0, "step": 25106 }, { "epoch": 6.649364406779661, "grad_norm": 2.4115872383117676, "learning_rate": 6.675450211864407e-06, "loss": 0.8918, "mean_token_accuracy": 0.7772701755166054, "num_tokens": 20206174.0, "step": 25108 }, { "epoch": 6.64989406779661, "grad_norm": 2.2695651054382324, "learning_rate": 6.675185381355933e-06, "loss": 1.1669, "mean_token_accuracy": 0.7349135279655457, "num_tokens": 20207645.0, "step": 25110 }, { "epoch": 6.65042372881356, "grad_norm": 2.2512433528900146, "learning_rate": 6.674920550847458e-06, "loss": 1.2602, "mean_token_accuracy": 0.7053488418459892, "num_tokens": 20209424.0, "step": 25112 }, { "epoch": 6.650953389830509, "grad_norm": 2.332111358642578, "learning_rate": 6.6746557203389836e-06, "loss": 1.376, "mean_token_accuracy": 0.6891050562262535, "num_tokens": 20211254.0, "step": 25114 }, { "epoch": 6.651483050847458, "grad_norm": 2.125556230545044, "learning_rate": 6.6743908898305085e-06, "loss": 0.7951, "mean_token_accuracy": 0.7843423560261726, "num_tokens": 20213825.0, "step": 25116 }, { "epoch": 6.652012711864407, "grad_norm": 2.420950412750244, "learning_rate": 6.674126059322035e-06, "loss": 1.2914, "mean_token_accuracy": 0.7156871259212494, "num_tokens": 20215495.0, "step": 25118 }, { "epoch": 6.652542372881356, "grad_norm": 2.496335983276367, "learning_rate": 6.673861228813559e-06, "loss": 0.9497, "mean_token_accuracy": 0.7634919956326485, "num_tokens": 20217241.0, "step": 25120 }, { "epoch": 6.653072033898305, "grad_norm": 1.8148289918899536, "learning_rate": 6.673596398305086e-06, "loss": 0.6047, "mean_token_accuracy": 0.8350698351860046, "num_tokens": 20218775.0, "step": 25122 }, { "epoch": 6.653601694915254, "grad_norm": 2.108196973800659, "learning_rate": 6.673331567796611e-06, "loss": 1.1087, "mean_token_accuracy": 0.7325928211212158, "num_tokens": 20220502.0, "step": 25124 }, { "epoch": 6.654131355932203, "grad_norm": 2.2693347930908203, "learning_rate": 6.6730667372881365e-06, "loss": 1.2056, "mean_token_accuracy": 0.7295578941702843, "num_tokens": 20222077.0, "step": 25126 }, { "epoch": 6.654661016949152, "grad_norm": 2.3566744327545166, "learning_rate": 6.6728019067796614e-06, "loss": 1.3719, "mean_token_accuracy": 0.6923680081963539, "num_tokens": 20223612.0, "step": 25128 }, { "epoch": 6.655190677966102, "grad_norm": 2.8729748725891113, "learning_rate": 6.672537076271187e-06, "loss": 1.0137, "mean_token_accuracy": 0.7628692761063576, "num_tokens": 20225205.0, "step": 25130 }, { "epoch": 6.655720338983051, "grad_norm": 2.166679620742798, "learning_rate": 6.672272245762712e-06, "loss": 1.2275, "mean_token_accuracy": 0.6926219612360001, "num_tokens": 20226808.0, "step": 25132 }, { "epoch": 6.65625, "grad_norm": 2.5759329795837402, "learning_rate": 6.672007415254238e-06, "loss": 1.0665, "mean_token_accuracy": 0.7459549233317375, "num_tokens": 20228337.0, "step": 25134 }, { "epoch": 6.656779661016949, "grad_norm": 2.6555604934692383, "learning_rate": 6.671742584745763e-06, "loss": 1.2315, "mean_token_accuracy": 0.7485135123133659, "num_tokens": 20229874.0, "step": 25136 }, { "epoch": 6.657309322033898, "grad_norm": 2.0308797359466553, "learning_rate": 6.671477754237289e-06, "loss": 1.1918, "mean_token_accuracy": 0.7162477746605873, "num_tokens": 20231535.0, "step": 25138 }, { "epoch": 6.657838983050848, "grad_norm": 2.4811296463012695, "learning_rate": 6.6712129237288136e-06, "loss": 1.067, "mean_token_accuracy": 0.7345104664564133, "num_tokens": 20233144.0, "step": 25140 }, { "epoch": 6.658368644067797, "grad_norm": 2.1123385429382324, "learning_rate": 6.670948093220339e-06, "loss": 1.0144, "mean_token_accuracy": 0.7436314523220062, "num_tokens": 20234855.0, "step": 25142 }, { "epoch": 6.658898305084746, "grad_norm": 2.261763095855713, "learning_rate": 6.670683262711864e-06, "loss": 1.1126, "mean_token_accuracy": 0.7190440222620964, "num_tokens": 20236476.0, "step": 25144 }, { "epoch": 6.659427966101695, "grad_norm": 2.4210164546966553, "learning_rate": 6.670418432203391e-06, "loss": 1.1379, "mean_token_accuracy": 0.7188932597637177, "num_tokens": 20238028.0, "step": 25146 }, { "epoch": 6.659957627118644, "grad_norm": 2.819929838180542, "learning_rate": 6.670153601694915e-06, "loss": 1.1502, "mean_token_accuracy": 0.7394929453730583, "num_tokens": 20239593.0, "step": 25148 }, { "epoch": 6.660487288135593, "grad_norm": 4.054717063903809, "learning_rate": 6.669888771186442e-06, "loss": 0.9617, "mean_token_accuracy": 0.7698690071702003, "num_tokens": 20241318.0, "step": 25150 }, { "epoch": 6.661016949152542, "grad_norm": 2.5264732837677, "learning_rate": 6.6696239406779665e-06, "loss": 1.5684, "mean_token_accuracy": 0.6656658798456192, "num_tokens": 20242994.0, "step": 25152 }, { "epoch": 6.661546610169491, "grad_norm": 2.432246685028076, "learning_rate": 6.669359110169492e-06, "loss": 1.2729, "mean_token_accuracy": 0.6964835822582245, "num_tokens": 20244533.0, "step": 25154 }, { "epoch": 6.66207627118644, "grad_norm": 1.8745989799499512, "learning_rate": 6.669094279661017e-06, "loss": 0.9397, "mean_token_accuracy": 0.7572367414832115, "num_tokens": 20246638.0, "step": 25156 }, { "epoch": 6.66260593220339, "grad_norm": 2.482177257537842, "learning_rate": 6.668829449152543e-06, "loss": 1.1625, "mean_token_accuracy": 0.7018554583191872, "num_tokens": 20248207.0, "step": 25158 }, { "epoch": 6.663135593220339, "grad_norm": 2.600149393081665, "learning_rate": 6.668564618644068e-06, "loss": 1.0612, "mean_token_accuracy": 0.7303919941186905, "num_tokens": 20250397.0, "step": 25160 }, { "epoch": 6.663665254237288, "grad_norm": 2.3809239864349365, "learning_rate": 6.668299788135594e-06, "loss": 0.762, "mean_token_accuracy": 0.817899763584137, "num_tokens": 20251923.0, "step": 25162 }, { "epoch": 6.664194915254237, "grad_norm": 2.648198366165161, "learning_rate": 6.6680349576271195e-06, "loss": 1.2463, "mean_token_accuracy": 0.709157757461071, "num_tokens": 20253710.0, "step": 25164 }, { "epoch": 6.664724576271187, "grad_norm": 2.518266439437866, "learning_rate": 6.667770127118644e-06, "loss": 1.1551, "mean_token_accuracy": 0.7294452041387558, "num_tokens": 20255425.0, "step": 25166 }, { "epoch": 6.665254237288136, "grad_norm": 2.377305746078491, "learning_rate": 6.66750529661017e-06, "loss": 0.7669, "mean_token_accuracy": 0.799475222826004, "num_tokens": 20256714.0, "step": 25168 }, { "epoch": 6.665783898305085, "grad_norm": 2.1522557735443115, "learning_rate": 6.667240466101695e-06, "loss": 0.8984, "mean_token_accuracy": 0.7791457176208496, "num_tokens": 20258094.0, "step": 25170 }, { "epoch": 6.666313559322034, "grad_norm": 3.167454957962036, "learning_rate": 6.666975635593222e-06, "loss": 1.3467, "mean_token_accuracy": 0.6873510703444481, "num_tokens": 20259492.0, "step": 25172 }, { "epoch": 6.666843220338983, "grad_norm": 1.9996293783187866, "learning_rate": 6.666710805084746e-06, "loss": 0.6721, "mean_token_accuracy": 0.821507140994072, "num_tokens": 20261272.0, "step": 25174 }, { "epoch": 6.6673728813559325, "grad_norm": 2.2759358882904053, "learning_rate": 6.6664459745762724e-06, "loss": 0.9762, "mean_token_accuracy": 0.7628803923726082, "num_tokens": 20263044.0, "step": 25176 }, { "epoch": 6.6679025423728815, "grad_norm": 2.5980494022369385, "learning_rate": 6.666181144067797e-06, "loss": 1.3456, "mean_token_accuracy": 0.67739687114954, "num_tokens": 20264470.0, "step": 25178 }, { "epoch": 6.6684322033898304, "grad_norm": 1.7672011852264404, "learning_rate": 6.665916313559323e-06, "loss": 0.7015, "mean_token_accuracy": 0.8187683969736099, "num_tokens": 20265882.0, "step": 25180 }, { "epoch": 6.668961864406779, "grad_norm": 2.867828607559204, "learning_rate": 6.665651483050848e-06, "loss": 1.3261, "mean_token_accuracy": 0.6973785758018494, "num_tokens": 20267235.0, "step": 25182 }, { "epoch": 6.669491525423728, "grad_norm": 1.7250127792358398, "learning_rate": 6.665386652542374e-06, "loss": 0.9537, "mean_token_accuracy": 0.7603605017066002, "num_tokens": 20268743.0, "step": 25184 }, { "epoch": 6.670021186440678, "grad_norm": 2.698150873184204, "learning_rate": 6.665121822033899e-06, "loss": 1.4637, "mean_token_accuracy": 0.6862296089529991, "num_tokens": 20270334.0, "step": 25186 }, { "epoch": 6.670550847457627, "grad_norm": 2.1914963722229004, "learning_rate": 6.6648569915254246e-06, "loss": 1.6171, "mean_token_accuracy": 0.6670128926634789, "num_tokens": 20272091.0, "step": 25188 }, { "epoch": 6.671080508474576, "grad_norm": 2.5338220596313477, "learning_rate": 6.6645921610169495e-06, "loss": 1.1936, "mean_token_accuracy": 0.7238213121891022, "num_tokens": 20273522.0, "step": 25190 }, { "epoch": 6.671610169491525, "grad_norm": 2.597465991973877, "learning_rate": 6.664327330508475e-06, "loss": 1.0984, "mean_token_accuracy": 0.7449016198515892, "num_tokens": 20275078.0, "step": 25192 }, { "epoch": 6.672139830508475, "grad_norm": 1.6717596054077148, "learning_rate": 6.6640625e-06, "loss": 0.8123, "mean_token_accuracy": 0.7735012993216515, "num_tokens": 20277153.0, "step": 25194 }, { "epoch": 6.672669491525424, "grad_norm": 2.2674736976623535, "learning_rate": 6.663797669491526e-06, "loss": 0.9101, "mean_token_accuracy": 0.7649215832352638, "num_tokens": 20278659.0, "step": 25196 }, { "epoch": 6.673199152542373, "grad_norm": 2.4548449516296387, "learning_rate": 6.663532838983051e-06, "loss": 1.4466, "mean_token_accuracy": 0.6773492023348808, "num_tokens": 20280190.0, "step": 25198 }, { "epoch": 6.673728813559322, "grad_norm": 2.453737735748291, "learning_rate": 6.6632680084745775e-06, "loss": 1.9006, "mean_token_accuracy": 0.583955142647028, "num_tokens": 20282088.0, "step": 25200 }, { "epoch": 6.674258474576272, "grad_norm": 2.7563185691833496, "learning_rate": 6.663003177966102e-06, "loss": 1.1111, "mean_token_accuracy": 0.7413839548826218, "num_tokens": 20283394.0, "step": 25202 }, { "epoch": 6.674788135593221, "grad_norm": 2.6821837425231934, "learning_rate": 6.662738347457628e-06, "loss": 1.0554, "mean_token_accuracy": 0.7505791485309601, "num_tokens": 20284781.0, "step": 25204 }, { "epoch": 6.6753177966101696, "grad_norm": 2.481147527694702, "learning_rate": 6.662473516949153e-06, "loss": 1.1796, "mean_token_accuracy": 0.7081696465611458, "num_tokens": 20286503.0, "step": 25206 }, { "epoch": 6.6758474576271185, "grad_norm": 2.5911006927490234, "learning_rate": 6.662208686440679e-06, "loss": 1.199, "mean_token_accuracy": 0.7393034845590591, "num_tokens": 20287860.0, "step": 25208 }, { "epoch": 6.6763771186440675, "grad_norm": 1.8713884353637695, "learning_rate": 6.661943855932204e-06, "loss": 0.8259, "mean_token_accuracy": 0.7829661518335342, "num_tokens": 20289808.0, "step": 25210 }, { "epoch": 6.676906779661017, "grad_norm": 2.2458109855651855, "learning_rate": 6.66167902542373e-06, "loss": 1.3888, "mean_token_accuracy": 0.6770461797714233, "num_tokens": 20291627.0, "step": 25212 }, { "epoch": 6.677436440677966, "grad_norm": 1.8109936714172363, "learning_rate": 6.6614141949152546e-06, "loss": 1.0068, "mean_token_accuracy": 0.7587633952498436, "num_tokens": 20293304.0, "step": 25214 }, { "epoch": 6.677966101694915, "grad_norm": 2.3086843490600586, "learning_rate": 6.66114936440678e-06, "loss": 1.6157, "mean_token_accuracy": 0.6596266180276871, "num_tokens": 20295128.0, "step": 25216 }, { "epoch": 6.678495762711864, "grad_norm": 3.3364944458007812, "learning_rate": 6.660884533898305e-06, "loss": 1.1521, "mean_token_accuracy": 0.70658940076828, "num_tokens": 20296694.0, "step": 25218 }, { "epoch": 6.679025423728813, "grad_norm": 2.3029744625091553, "learning_rate": 6.660619703389831e-06, "loss": 1.2845, "mean_token_accuracy": 0.7188038155436516, "num_tokens": 20298312.0, "step": 25220 }, { "epoch": 6.679555084745763, "grad_norm": 1.741690993309021, "learning_rate": 6.660354872881356e-06, "loss": 1.0334, "mean_token_accuracy": 0.7521708458662033, "num_tokens": 20300062.0, "step": 25222 }, { "epoch": 6.680084745762712, "grad_norm": 2.5645246505737305, "learning_rate": 6.660090042372882e-06, "loss": 1.4416, "mean_token_accuracy": 0.6721466705203056, "num_tokens": 20301706.0, "step": 25224 }, { "epoch": 6.680614406779661, "grad_norm": 2.2527618408203125, "learning_rate": 6.659825211864407e-06, "loss": 1.3328, "mean_token_accuracy": 0.6964998692274094, "num_tokens": 20303073.0, "step": 25226 }, { "epoch": 6.68114406779661, "grad_norm": 2.035292625427246, "learning_rate": 6.6595603813559325e-06, "loss": 1.0189, "mean_token_accuracy": 0.7503356039524078, "num_tokens": 20304706.0, "step": 25228 }, { "epoch": 6.68167372881356, "grad_norm": 2.1449460983276367, "learning_rate": 6.659295550847457e-06, "loss": 1.0975, "mean_token_accuracy": 0.744875580072403, "num_tokens": 20306306.0, "step": 25230 }, { "epoch": 6.682203389830509, "grad_norm": 2.9454610347747803, "learning_rate": 6.659030720338984e-06, "loss": 1.3708, "mean_token_accuracy": 0.7207162752747536, "num_tokens": 20307813.0, "step": 25232 }, { "epoch": 6.682733050847458, "grad_norm": 1.9688103199005127, "learning_rate": 6.658765889830509e-06, "loss": 1.4665, "mean_token_accuracy": 0.6597564965486526, "num_tokens": 20309444.0, "step": 25234 }, { "epoch": 6.683262711864407, "grad_norm": 2.184541940689087, "learning_rate": 6.658501059322035e-06, "loss": 1.0733, "mean_token_accuracy": 0.7332844361662865, "num_tokens": 20310952.0, "step": 25236 }, { "epoch": 6.683792372881356, "grad_norm": 2.598005771636963, "learning_rate": 6.65823622881356e-06, "loss": 1.2669, "mean_token_accuracy": 0.7068774476647377, "num_tokens": 20312526.0, "step": 25238 }, { "epoch": 6.684322033898305, "grad_norm": 2.2432751655578613, "learning_rate": 6.6579713983050854e-06, "loss": 1.0308, "mean_token_accuracy": 0.7572487592697144, "num_tokens": 20313910.0, "step": 25240 }, { "epoch": 6.684851694915254, "grad_norm": 2.569952964782715, "learning_rate": 6.65770656779661e-06, "loss": 1.3448, "mean_token_accuracy": 0.6964167766273022, "num_tokens": 20315585.0, "step": 25242 }, { "epoch": 6.685381355932203, "grad_norm": 2.1598544120788574, "learning_rate": 6.657441737288136e-06, "loss": 1.3236, "mean_token_accuracy": 0.7093333527445793, "num_tokens": 20317462.0, "step": 25244 }, { "epoch": 6.685911016949152, "grad_norm": 2.7580161094665527, "learning_rate": 6.657176906779662e-06, "loss": 1.0783, "mean_token_accuracy": 0.7232767343521118, "num_tokens": 20319071.0, "step": 25246 }, { "epoch": 6.686440677966102, "grad_norm": 2.5825018882751465, "learning_rate": 6.656912076271187e-06, "loss": 1.3843, "mean_token_accuracy": 0.6763079985976219, "num_tokens": 20320647.0, "step": 25248 }, { "epoch": 6.686970338983051, "grad_norm": 2.2183048725128174, "learning_rate": 6.656647245762713e-06, "loss": 1.0055, "step": 25250 }, { "epoch": 6.686970338983051, "eval_loss": 1.3265224695205688, "eval_mean_token_accuracy": 0.7002045693142074, "eval_num_tokens": 20322166.0, "eval_runtime": 48.0543, "eval_samples_per_second": 6.409, "eval_steps_per_second": 6.409, "step": 25250 }, { "epoch": 6.6875, "grad_norm": 2.5642147064208984, "learning_rate": 6.6563824152542375e-06, "loss": 1.1168, "mean_token_accuracy": 0.7508364729583263, "num_tokens": 20323679.0, "step": 25252 }, { "epoch": 6.688029661016949, "grad_norm": 2.3120152950286865, "learning_rate": 6.656117584745764e-06, "loss": 1.2818, "mean_token_accuracy": 0.6947255805134773, "num_tokens": 20325215.0, "step": 25254 }, { "epoch": 6.688559322033898, "grad_norm": 2.197469711303711, "learning_rate": 6.655852754237288e-06, "loss": 1.0485, "mean_token_accuracy": 0.7387223206460476, "num_tokens": 20327041.0, "step": 25256 }, { "epoch": 6.689088983050848, "grad_norm": 2.302272319793701, "learning_rate": 6.655587923728815e-06, "loss": 1.0524, "mean_token_accuracy": 0.7419946938753128, "num_tokens": 20328578.0, "step": 25258 }, { "epoch": 6.689618644067797, "grad_norm": 2.364971876144409, "learning_rate": 6.65532309322034e-06, "loss": 1.0247, "mean_token_accuracy": 0.7779611721634865, "num_tokens": 20330087.0, "step": 25260 }, { "epoch": 6.690148305084746, "grad_norm": 2.3557043075561523, "learning_rate": 6.655058262711866e-06, "loss": 1.0978, "mean_token_accuracy": 0.7348163165152073, "num_tokens": 20331885.0, "step": 25262 }, { "epoch": 6.690677966101695, "grad_norm": 2.0074338912963867, "learning_rate": 6.6547934322033905e-06, "loss": 1.07, "mean_token_accuracy": 0.7687452062964439, "num_tokens": 20333300.0, "step": 25264 }, { "epoch": 6.691207627118644, "grad_norm": 2.3822927474975586, "learning_rate": 6.654528601694916e-06, "loss": 1.1376, "mean_token_accuracy": 0.7133775278925896, "num_tokens": 20335061.0, "step": 25266 }, { "epoch": 6.691737288135593, "grad_norm": 3.0848610401153564, "learning_rate": 6.654263771186441e-06, "loss": 1.2056, "mean_token_accuracy": 0.7203311696648598, "num_tokens": 20336560.0, "step": 25268 }, { "epoch": 6.692266949152542, "grad_norm": 2.5785558223724365, "learning_rate": 6.653998940677967e-06, "loss": 1.6742, "mean_token_accuracy": 0.6517981365323067, "num_tokens": 20338133.0, "step": 25270 }, { "epoch": 6.692796610169491, "grad_norm": 2.4485244750976562, "learning_rate": 6.653734110169492e-06, "loss": 1.3798, "mean_token_accuracy": 0.6623533740639687, "num_tokens": 20339598.0, "step": 25272 }, { "epoch": 6.69332627118644, "grad_norm": 2.4704360961914062, "learning_rate": 6.653469279661018e-06, "loss": 1.4591, "mean_token_accuracy": 0.672743484377861, "num_tokens": 20341338.0, "step": 25274 }, { "epoch": 6.69385593220339, "grad_norm": 2.3753721714019775, "learning_rate": 6.653204449152543e-06, "loss": 1.272, "mean_token_accuracy": 0.7072197273373604, "num_tokens": 20343195.0, "step": 25276 }, { "epoch": 6.694385593220339, "grad_norm": 2.2250356674194336, "learning_rate": 6.652939618644068e-06, "loss": 1.0032, "mean_token_accuracy": 0.7585632801055908, "num_tokens": 20345267.0, "step": 25278 }, { "epoch": 6.694915254237288, "grad_norm": 2.672438383102417, "learning_rate": 6.652674788135593e-06, "loss": 0.999, "mean_token_accuracy": 0.7744898945093155, "num_tokens": 20346608.0, "step": 25280 }, { "epoch": 6.695444915254237, "grad_norm": 1.942859172821045, "learning_rate": 6.652409957627119e-06, "loss": 1.094, "mean_token_accuracy": 0.7126864492893219, "num_tokens": 20348508.0, "step": 25282 }, { "epoch": 6.695974576271187, "grad_norm": 2.127007484436035, "learning_rate": 6.652145127118644e-06, "loss": 1.0788, "mean_token_accuracy": 0.7365507110953331, "num_tokens": 20349894.0, "step": 25284 }, { "epoch": 6.696504237288136, "grad_norm": 2.265573263168335, "learning_rate": 6.651880296610171e-06, "loss": 0.9048, "mean_token_accuracy": 0.7939005345106125, "num_tokens": 20351474.0, "step": 25286 }, { "epoch": 6.697033898305085, "grad_norm": 2.4213039875030518, "learning_rate": 6.651615466101696e-06, "loss": 1.2247, "mean_token_accuracy": 0.6991591230034828, "num_tokens": 20353322.0, "step": 25288 }, { "epoch": 6.697563559322034, "grad_norm": 1.635029911994934, "learning_rate": 6.651350635593221e-06, "loss": 1.0078, "mean_token_accuracy": 0.7793897613883018, "num_tokens": 20354687.0, "step": 25290 }, { "epoch": 6.698093220338983, "grad_norm": 2.070927858352661, "learning_rate": 6.651085805084746e-06, "loss": 0.9916, "mean_token_accuracy": 0.7537438049912453, "num_tokens": 20356445.0, "step": 25292 }, { "epoch": 6.6986228813559325, "grad_norm": 3.5370934009552, "learning_rate": 6.650820974576272e-06, "loss": 1.1396, "mean_token_accuracy": 0.7283645421266556, "num_tokens": 20357795.0, "step": 25294 }, { "epoch": 6.6991525423728815, "grad_norm": 2.5312976837158203, "learning_rate": 6.650556144067797e-06, "loss": 1.3209, "mean_token_accuracy": 0.6972159296274185, "num_tokens": 20359238.0, "step": 25296 }, { "epoch": 6.6996822033898304, "grad_norm": 2.596449136734009, "learning_rate": 6.650291313559323e-06, "loss": 1.3872, "mean_token_accuracy": 0.6906437873840332, "num_tokens": 20360791.0, "step": 25298 }, { "epoch": 6.700211864406779, "grad_norm": 2.784416437149048, "learning_rate": 6.650026483050848e-06, "loss": 1.4595, "mean_token_accuracy": 0.6781752109527588, "num_tokens": 20362336.0, "step": 25300 }, { "epoch": 6.700741525423728, "grad_norm": 2.1141469478607178, "learning_rate": 6.6497616525423735e-06, "loss": 1.2247, "mean_token_accuracy": 0.7304807230830193, "num_tokens": 20363855.0, "step": 25302 }, { "epoch": 6.701271186440678, "grad_norm": 2.6450252532958984, "learning_rate": 6.649496822033898e-06, "loss": 0.9585, "mean_token_accuracy": 0.7713887318968773, "num_tokens": 20365273.0, "step": 25304 }, { "epoch": 6.701800847457627, "grad_norm": 2.2864487171173096, "learning_rate": 6.649231991525424e-06, "loss": 1.0012, "mean_token_accuracy": 0.7452522069215775, "num_tokens": 20366921.0, "step": 25306 }, { "epoch": 6.702330508474576, "grad_norm": 2.163267135620117, "learning_rate": 6.648967161016949e-06, "loss": 1.2237, "mean_token_accuracy": 0.74074886739254, "num_tokens": 20368298.0, "step": 25308 }, { "epoch": 6.702860169491525, "grad_norm": 2.1420044898986816, "learning_rate": 6.648702330508475e-06, "loss": 1.3417, "mean_token_accuracy": 0.6920481845736504, "num_tokens": 20370024.0, "step": 25310 }, { "epoch": 6.703389830508475, "grad_norm": 2.6371607780456543, "learning_rate": 6.6484375e-06, "loss": 1.2096, "mean_token_accuracy": 0.7198687642812729, "num_tokens": 20371543.0, "step": 25312 }, { "epoch": 6.703919491525424, "grad_norm": 3.1372933387756348, "learning_rate": 6.6481726694915264e-06, "loss": 1.0917, "mean_token_accuracy": 0.7370012253522873, "num_tokens": 20372778.0, "step": 25314 }, { "epoch": 6.704449152542373, "grad_norm": 2.434939384460449, "learning_rate": 6.6479078389830505e-06, "loss": 1.3609, "mean_token_accuracy": 0.6815841384232044, "num_tokens": 20374146.0, "step": 25316 }, { "epoch": 6.704978813559322, "grad_norm": 2.758941173553467, "learning_rate": 6.647643008474577e-06, "loss": 0.926, "mean_token_accuracy": 0.772024117410183, "num_tokens": 20375686.0, "step": 25318 }, { "epoch": 6.705508474576272, "grad_norm": 2.128394365310669, "learning_rate": 6.647378177966102e-06, "loss": 1.1254, "mean_token_accuracy": 0.7088991478085518, "num_tokens": 20377411.0, "step": 25320 }, { "epoch": 6.706038135593221, "grad_norm": 2.314162015914917, "learning_rate": 6.647113347457628e-06, "loss": 1.2311, "mean_token_accuracy": 0.7268120422959328, "num_tokens": 20379262.0, "step": 25322 }, { "epoch": 6.7065677966101696, "grad_norm": 2.619518756866455, "learning_rate": 6.646848516949153e-06, "loss": 1.108, "mean_token_accuracy": 0.7556376904249191, "num_tokens": 20380860.0, "step": 25324 }, { "epoch": 6.7070974576271185, "grad_norm": 2.4683167934417725, "learning_rate": 6.6465836864406786e-06, "loss": 1.5552, "mean_token_accuracy": 0.6840674355626106, "num_tokens": 20382262.0, "step": 25326 }, { "epoch": 6.7076271186440675, "grad_norm": 2.1654796600341797, "learning_rate": 6.6463188559322035e-06, "loss": 1.0101, "mean_token_accuracy": 0.7593341246247292, "num_tokens": 20383614.0, "step": 25328 }, { "epoch": 6.708156779661017, "grad_norm": 2.0600545406341553, "learning_rate": 6.646054025423729e-06, "loss": 1.1327, "mean_token_accuracy": 0.7472162246704102, "num_tokens": 20385125.0, "step": 25330 }, { "epoch": 6.708686440677966, "grad_norm": 2.2457997798919678, "learning_rate": 6.645789194915255e-06, "loss": 1.1877, "mean_token_accuracy": 0.7057015672326088, "num_tokens": 20386986.0, "step": 25332 }, { "epoch": 6.709216101694915, "grad_norm": 4.703945159912109, "learning_rate": 6.64552436440678e-06, "loss": 0.8495, "mean_token_accuracy": 0.7863000631332397, "num_tokens": 20388275.0, "step": 25334 }, { "epoch": 6.709745762711864, "grad_norm": 2.071777820587158, "learning_rate": 6.645259533898306e-06, "loss": 1.138, "mean_token_accuracy": 0.7454603239893913, "num_tokens": 20389845.0, "step": 25336 }, { "epoch": 6.710275423728813, "grad_norm": 2.364177703857422, "learning_rate": 6.644994703389831e-06, "loss": 1.1441, "mean_token_accuracy": 0.7303239405155182, "num_tokens": 20391508.0, "step": 25338 }, { "epoch": 6.710805084745763, "grad_norm": 2.458956718444824, "learning_rate": 6.644729872881357e-06, "loss": 1.4697, "mean_token_accuracy": 0.6557767540216446, "num_tokens": 20393145.0, "step": 25340 }, { "epoch": 6.711334745762712, "grad_norm": 2.751455307006836, "learning_rate": 6.644465042372882e-06, "loss": 1.7429, "mean_token_accuracy": 0.6092916503548622, "num_tokens": 20395050.0, "step": 25342 }, { "epoch": 6.711864406779661, "grad_norm": 1.7212026119232178, "learning_rate": 6.644200211864408e-06, "loss": 1.1344, "mean_token_accuracy": 0.7612355127930641, "num_tokens": 20397119.0, "step": 25344 }, { "epoch": 6.71239406779661, "grad_norm": 2.690664529800415, "learning_rate": 6.643935381355933e-06, "loss": 1.2794, "mean_token_accuracy": 0.7270764112472534, "num_tokens": 20398450.0, "step": 25346 }, { "epoch": 6.71292372881356, "grad_norm": 2.6043710708618164, "learning_rate": 6.643670550847459e-06, "loss": 1.1918, "mean_token_accuracy": 0.7291689366102219, "num_tokens": 20399903.0, "step": 25348 }, { "epoch": 6.713453389830509, "grad_norm": 2.1290555000305176, "learning_rate": 6.643405720338984e-06, "loss": 1.2101, "mean_token_accuracy": 0.7191232070326805, "num_tokens": 20401418.0, "step": 25350 }, { "epoch": 6.713983050847458, "grad_norm": 2.281041145324707, "learning_rate": 6.643140889830509e-06, "loss": 1.0555, "mean_token_accuracy": 0.7489507645368576, "num_tokens": 20402958.0, "step": 25352 }, { "epoch": 6.714512711864407, "grad_norm": 2.488719940185547, "learning_rate": 6.642876059322034e-06, "loss": 1.055, "mean_token_accuracy": 0.7329522296786308, "num_tokens": 20404621.0, "step": 25354 }, { "epoch": 6.715042372881356, "grad_norm": 2.510838031768799, "learning_rate": 6.64261122881356e-06, "loss": 1.2094, "mean_token_accuracy": 0.6947591155767441, "num_tokens": 20406237.0, "step": 25356 }, { "epoch": 6.715572033898305, "grad_norm": 2.7198591232299805, "learning_rate": 6.642346398305085e-06, "loss": 1.3797, "mean_token_accuracy": 0.6860145926475525, "num_tokens": 20407763.0, "step": 25358 }, { "epoch": 6.716101694915254, "grad_norm": 2.290196180343628, "learning_rate": 6.642081567796611e-06, "loss": 1.3507, "mean_token_accuracy": 0.6759529039263725, "num_tokens": 20409423.0, "step": 25360 }, { "epoch": 6.716631355932203, "grad_norm": 2.13700532913208, "learning_rate": 6.641816737288136e-06, "loss": 1.1679, "mean_token_accuracy": 0.7384539023041725, "num_tokens": 20411183.0, "step": 25362 }, { "epoch": 6.717161016949152, "grad_norm": 2.5861194133758545, "learning_rate": 6.6415519067796615e-06, "loss": 1.5383, "mean_token_accuracy": 0.6660653129220009, "num_tokens": 20412910.0, "step": 25364 }, { "epoch": 6.717690677966102, "grad_norm": 2.7907803058624268, "learning_rate": 6.6412870762711865e-06, "loss": 1.1833, "mean_token_accuracy": 0.7185683771967888, "num_tokens": 20414418.0, "step": 25366 }, { "epoch": 6.718220338983051, "grad_norm": 2.767487049102783, "learning_rate": 6.641022245762713e-06, "loss": 1.3886, "mean_token_accuracy": 0.683927983045578, "num_tokens": 20416043.0, "step": 25368 }, { "epoch": 6.71875, "grad_norm": 2.7829246520996094, "learning_rate": 6.640757415254237e-06, "loss": 1.4693, "mean_token_accuracy": 0.6590543612837791, "num_tokens": 20417713.0, "step": 25370 }, { "epoch": 6.719279661016949, "grad_norm": 2.4369373321533203, "learning_rate": 6.640492584745764e-06, "loss": 1.156, "mean_token_accuracy": 0.7175240069627762, "num_tokens": 20419016.0, "step": 25372 }, { "epoch": 6.719809322033898, "grad_norm": 1.587783932685852, "learning_rate": 6.640227754237289e-06, "loss": 0.822, "mean_token_accuracy": 0.7877293303608894, "num_tokens": 20420634.0, "step": 25374 }, { "epoch": 6.720338983050848, "grad_norm": 2.1732497215270996, "learning_rate": 6.6399629237288145e-06, "loss": 1.2087, "mean_token_accuracy": 0.7074915841221809, "num_tokens": 20422494.0, "step": 25376 }, { "epoch": 6.720868644067797, "grad_norm": 3.025836944580078, "learning_rate": 6.639698093220339e-06, "loss": 1.3938, "mean_token_accuracy": 0.6729350350797176, "num_tokens": 20424017.0, "step": 25378 }, { "epoch": 6.721398305084746, "grad_norm": 2.4406397342681885, "learning_rate": 6.639433262711865e-06, "loss": 0.9178, "mean_token_accuracy": 0.7666091732680798, "num_tokens": 20425589.0, "step": 25380 }, { "epoch": 6.721927966101695, "grad_norm": 2.029304265975952, "learning_rate": 6.63916843220339e-06, "loss": 1.2433, "mean_token_accuracy": 0.7042739167809486, "num_tokens": 20427305.0, "step": 25382 }, { "epoch": 6.722457627118644, "grad_norm": 2.29463791847229, "learning_rate": 6.638903601694916e-06, "loss": 1.2805, "mean_token_accuracy": 0.7234309539198875, "num_tokens": 20428964.0, "step": 25384 }, { "epoch": 6.722987288135593, "grad_norm": 2.0763049125671387, "learning_rate": 6.638638771186441e-06, "loss": 1.1266, "mean_token_accuracy": 0.7445250377058983, "num_tokens": 20430439.0, "step": 25386 }, { "epoch": 6.723516949152542, "grad_norm": 3.037125825881958, "learning_rate": 6.638373940677967e-06, "loss": 1.1489, "mean_token_accuracy": 0.719290241599083, "num_tokens": 20432094.0, "step": 25388 }, { "epoch": 6.724046610169491, "grad_norm": 2.5500612258911133, "learning_rate": 6.6381091101694915e-06, "loss": 1.5035, "mean_token_accuracy": 0.675228551030159, "num_tokens": 20433780.0, "step": 25390 }, { "epoch": 6.72457627118644, "grad_norm": 2.7000157833099365, "learning_rate": 6.637844279661017e-06, "loss": 1.4999, "mean_token_accuracy": 0.6737220957875252, "num_tokens": 20435124.0, "step": 25392 }, { "epoch": 6.72510593220339, "grad_norm": 2.1042981147766113, "learning_rate": 6.637579449152542e-06, "loss": 1.2287, "mean_token_accuracy": 0.7390217110514641, "num_tokens": 20436579.0, "step": 25394 }, { "epoch": 6.725635593220339, "grad_norm": 2.3441572189331055, "learning_rate": 6.637314618644069e-06, "loss": 1.2822, "mean_token_accuracy": 0.7311404123902321, "num_tokens": 20438059.0, "step": 25396 }, { "epoch": 6.726165254237288, "grad_norm": 2.7251927852630615, "learning_rate": 6.637049788135593e-06, "loss": 1.168, "mean_token_accuracy": 0.7365649566054344, "num_tokens": 20439541.0, "step": 25398 }, { "epoch": 6.726694915254237, "grad_norm": 2.2673017978668213, "learning_rate": 6.6367849576271196e-06, "loss": 1.2566, "mean_token_accuracy": 0.7258448973298073, "num_tokens": 20441031.0, "step": 25400 }, { "epoch": 6.727224576271187, "grad_norm": 2.4553005695343018, "learning_rate": 6.6365201271186445e-06, "loss": 1.6272, "mean_token_accuracy": 0.6456471383571625, "num_tokens": 20442727.0, "step": 25402 }, { "epoch": 6.727754237288136, "grad_norm": 2.6369545459747314, "learning_rate": 6.63625529661017e-06, "loss": 1.2364, "mean_token_accuracy": 0.7185664996504784, "num_tokens": 20444362.0, "step": 25404 }, { "epoch": 6.728283898305085, "grad_norm": 2.481020212173462, "learning_rate": 6.635990466101695e-06, "loss": 1.2828, "mean_token_accuracy": 0.7108915820717812, "num_tokens": 20446015.0, "step": 25406 }, { "epoch": 6.728813559322034, "grad_norm": 2.952558755874634, "learning_rate": 6.635725635593221e-06, "loss": 1.3765, "mean_token_accuracy": 0.6994150802493095, "num_tokens": 20447502.0, "step": 25408 }, { "epoch": 6.729343220338983, "grad_norm": 1.9189374446868896, "learning_rate": 6.635460805084746e-06, "loss": 0.7112, "mean_token_accuracy": 0.8111720606684685, "num_tokens": 20449263.0, "step": 25410 }, { "epoch": 6.7298728813559325, "grad_norm": 2.189300060272217, "learning_rate": 6.635195974576272e-06, "loss": 1.0413, "mean_token_accuracy": 0.7659633308649063, "num_tokens": 20451083.0, "step": 25412 }, { "epoch": 6.7304025423728815, "grad_norm": 2.141326665878296, "learning_rate": 6.6349311440677975e-06, "loss": 1.5406, "mean_token_accuracy": 0.6677036285400391, "num_tokens": 20452934.0, "step": 25414 }, { "epoch": 6.7309322033898304, "grad_norm": 2.113217353820801, "learning_rate": 6.634666313559322e-06, "loss": 1.1396, "mean_token_accuracy": 0.7449585720896721, "num_tokens": 20454370.0, "step": 25416 }, { "epoch": 6.731461864406779, "grad_norm": 3.0159194469451904, "learning_rate": 6.634401483050848e-06, "loss": 1.1718, "mean_token_accuracy": 0.7534715868532658, "num_tokens": 20455692.0, "step": 25418 }, { "epoch": 6.731991525423728, "grad_norm": 2.376906394958496, "learning_rate": 6.634136652542373e-06, "loss": 1.0343, "mean_token_accuracy": 0.7528996244072914, "num_tokens": 20457130.0, "step": 25420 }, { "epoch": 6.732521186440678, "grad_norm": 2.82830810546875, "learning_rate": 6.6338718220339e-06, "loss": 1.6781, "mean_token_accuracy": 0.6384935975074768, "num_tokens": 20458836.0, "step": 25422 }, { "epoch": 6.733050847457627, "grad_norm": 2.3280482292175293, "learning_rate": 6.633606991525424e-06, "loss": 0.9096, "mean_token_accuracy": 0.7777373343706131, "num_tokens": 20460325.0, "step": 25424 }, { "epoch": 6.733580508474576, "grad_norm": 2.429792642593384, "learning_rate": 6.6333421610169504e-06, "loss": 0.9771, "mean_token_accuracy": 0.7615672126412392, "num_tokens": 20461598.0, "step": 25426 }, { "epoch": 6.734110169491525, "grad_norm": 2.2810537815093994, "learning_rate": 6.633077330508475e-06, "loss": 1.434, "mean_token_accuracy": 0.66722122579813, "num_tokens": 20463242.0, "step": 25428 }, { "epoch": 6.734639830508475, "grad_norm": 2.1946938037872314, "learning_rate": 6.632812500000001e-06, "loss": 1.0571, "mean_token_accuracy": 0.7371275275945663, "num_tokens": 20464886.0, "step": 25430 }, { "epoch": 6.735169491525424, "grad_norm": 2.630967855453491, "learning_rate": 6.632547669491526e-06, "loss": 1.1958, "mean_token_accuracy": 0.7251451313495636, "num_tokens": 20466664.0, "step": 25432 }, { "epoch": 6.735699152542373, "grad_norm": 2.3750159740448, "learning_rate": 6.632282838983052e-06, "loss": 0.9753, "mean_token_accuracy": 0.7762608006596565, "num_tokens": 20467993.0, "step": 25434 }, { "epoch": 6.736228813559322, "grad_norm": 2.468508005142212, "learning_rate": 6.632018008474577e-06, "loss": 1.514, "mean_token_accuracy": 0.6467808783054352, "num_tokens": 20469583.0, "step": 25436 }, { "epoch": 6.736758474576272, "grad_norm": 2.357048273086548, "learning_rate": 6.6317531779661025e-06, "loss": 1.2074, "mean_token_accuracy": 0.7434896677732468, "num_tokens": 20471281.0, "step": 25438 }, { "epoch": 6.737288135593221, "grad_norm": 2.0041303634643555, "learning_rate": 6.6314883474576275e-06, "loss": 1.2166, "mean_token_accuracy": 0.7271591275930405, "num_tokens": 20472852.0, "step": 25440 }, { "epoch": 6.7378177966101696, "grad_norm": 2.585796356201172, "learning_rate": 6.631223516949153e-06, "loss": 1.4132, "mean_token_accuracy": 0.7039844021201134, "num_tokens": 20474257.0, "step": 25442 }, { "epoch": 6.7383474576271185, "grad_norm": 2.677060127258301, "learning_rate": 6.630958686440678e-06, "loss": 1.1377, "mean_token_accuracy": 0.7103182226419449, "num_tokens": 20475669.0, "step": 25444 }, { "epoch": 6.7388771186440675, "grad_norm": 2.1162633895874023, "learning_rate": 6.630693855932204e-06, "loss": 1.602, "mean_token_accuracy": 0.6684530004858971, "num_tokens": 20477681.0, "step": 25446 }, { "epoch": 6.739406779661017, "grad_norm": 2.319532871246338, "learning_rate": 6.630429025423729e-06, "loss": 1.2655, "mean_token_accuracy": 0.7125120982527733, "num_tokens": 20479131.0, "step": 25448 }, { "epoch": 6.739936440677966, "grad_norm": 2.923790693283081, "learning_rate": 6.6301641949152555e-06, "loss": 1.8334, "mean_token_accuracy": 0.6175791397690773, "num_tokens": 20480671.0, "step": 25450 }, { "epoch": 6.740466101694915, "grad_norm": 1.8441293239593506, "learning_rate": 6.62989936440678e-06, "loss": 1.0285, "mean_token_accuracy": 0.7438281029462814, "num_tokens": 20482357.0, "step": 25452 }, { "epoch": 6.740995762711864, "grad_norm": 2.2814910411834717, "learning_rate": 6.629634533898306e-06, "loss": 1.48, "mean_token_accuracy": 0.6764509454369545, "num_tokens": 20484077.0, "step": 25454 }, { "epoch": 6.741525423728813, "grad_norm": 2.213653326034546, "learning_rate": 6.629369703389831e-06, "loss": 1.4271, "mean_token_accuracy": 0.6704527214169502, "num_tokens": 20485727.0, "step": 25456 }, { "epoch": 6.742055084745763, "grad_norm": 2.9924421310424805, "learning_rate": 6.629104872881357e-06, "loss": 0.9352, "mean_token_accuracy": 0.7740241661667824, "num_tokens": 20487010.0, "step": 25458 }, { "epoch": 6.742584745762712, "grad_norm": 1.8885167837142944, "learning_rate": 6.628840042372882e-06, "loss": 0.8159, "mean_token_accuracy": 0.7839467227458954, "num_tokens": 20488834.0, "step": 25460 }, { "epoch": 6.743114406779661, "grad_norm": 2.010286808013916, "learning_rate": 6.628575211864408e-06, "loss": 0.7922, "mean_token_accuracy": 0.7874132618308067, "num_tokens": 20490510.0, "step": 25462 }, { "epoch": 6.74364406779661, "grad_norm": 2.2360522747039795, "learning_rate": 6.6283103813559326e-06, "loss": 1.0032, "mean_token_accuracy": 0.7639823108911514, "num_tokens": 20492231.0, "step": 25464 }, { "epoch": 6.74417372881356, "grad_norm": 2.4407477378845215, "learning_rate": 6.628045550847458e-06, "loss": 1.139, "mean_token_accuracy": 0.730994202196598, "num_tokens": 20493661.0, "step": 25466 }, { "epoch": 6.744703389830509, "grad_norm": 2.1459200382232666, "learning_rate": 6.627780720338983e-06, "loss": 1.2261, "mean_token_accuracy": 0.709658108651638, "num_tokens": 20495142.0, "step": 25468 }, { "epoch": 6.745233050847458, "grad_norm": 2.21482515335083, "learning_rate": 6.627515889830509e-06, "loss": 1.0392, "mean_token_accuracy": 0.7436268478631973, "num_tokens": 20496922.0, "step": 25470 }, { "epoch": 6.745762711864407, "grad_norm": 2.7602503299713135, "learning_rate": 6.627251059322034e-06, "loss": 0.8191, "mean_token_accuracy": 0.8002415895462036, "num_tokens": 20498432.0, "step": 25472 }, { "epoch": 6.746292372881356, "grad_norm": 1.8811047077178955, "learning_rate": 6.62698622881356e-06, "loss": 1.0823, "mean_token_accuracy": 0.7594999372959137, "num_tokens": 20500038.0, "step": 25474 }, { "epoch": 6.746822033898305, "grad_norm": 2.554722547531128, "learning_rate": 6.626721398305085e-06, "loss": 1.3451, "mean_token_accuracy": 0.7080669403076172, "num_tokens": 20501538.0, "step": 25476 }, { "epoch": 6.747351694915254, "grad_norm": 2.164693832397461, "learning_rate": 6.6264565677966104e-06, "loss": 1.0029, "mean_token_accuracy": 0.7546393349766731, "num_tokens": 20503164.0, "step": 25478 }, { "epoch": 6.747881355932203, "grad_norm": 2.5502092838287354, "learning_rate": 6.626191737288135e-06, "loss": 1.3449, "mean_token_accuracy": 0.6887032762169838, "num_tokens": 20504617.0, "step": 25480 }, { "epoch": 6.748411016949152, "grad_norm": 2.2360970973968506, "learning_rate": 6.625926906779662e-06, "loss": 1.4867, "mean_token_accuracy": 0.6737256348133087, "num_tokens": 20506559.0, "step": 25482 }, { "epoch": 6.748940677966102, "grad_norm": 2.1800289154052734, "learning_rate": 6.625662076271187e-06, "loss": 1.1061, "mean_token_accuracy": 0.7340335622429848, "num_tokens": 20508302.0, "step": 25484 }, { "epoch": 6.749470338983051, "grad_norm": 2.781369924545288, "learning_rate": 6.625397245762713e-06, "loss": 1.3858, "mean_token_accuracy": 0.6901730671525002, "num_tokens": 20509806.0, "step": 25486 }, { "epoch": 6.75, "grad_norm": 2.4509637355804443, "learning_rate": 6.625132415254238e-06, "loss": 1.1838, "mean_token_accuracy": 0.7504912540316582, "num_tokens": 20511313.0, "step": 25488 }, { "epoch": 6.750529661016949, "grad_norm": 2.1187376976013184, "learning_rate": 6.624867584745763e-06, "loss": 1.1299, "mean_token_accuracy": 0.7396794334053993, "num_tokens": 20512895.0, "step": 25490 }, { "epoch": 6.751059322033898, "grad_norm": 2.41064190864563, "learning_rate": 6.624602754237288e-06, "loss": 1.5982, "mean_token_accuracy": 0.6355586126446724, "num_tokens": 20514706.0, "step": 25492 }, { "epoch": 6.751588983050848, "grad_norm": 2.521164894104004, "learning_rate": 6.624337923728814e-06, "loss": 1.5223, "mean_token_accuracy": 0.6719275712966919, "num_tokens": 20516280.0, "step": 25494 }, { "epoch": 6.752118644067797, "grad_norm": 1.9791216850280762, "learning_rate": 6.624073093220339e-06, "loss": 0.8926, "mean_token_accuracy": 0.7504082545638084, "num_tokens": 20517865.0, "step": 25496 }, { "epoch": 6.752648305084746, "grad_norm": 2.6804873943328857, "learning_rate": 6.623808262711865e-06, "loss": 1.3611, "mean_token_accuracy": 0.6859398484230042, "num_tokens": 20519414.0, "step": 25498 }, { "epoch": 6.753177966101695, "grad_norm": 2.7065157890319824, "learning_rate": 6.623543432203391e-06, "loss": 1.2382, "step": 25500 }, { "epoch": 6.753177966101695, "eval_loss": 1.3268238306045532, "eval_mean_token_accuracy": 0.6993654794120169, "eval_num_tokens": 20520719.0, "eval_runtime": 48.0577, "eval_samples_per_second": 6.409, "eval_steps_per_second": 6.409, "step": 25500 }, { "epoch": 6.753707627118644, "grad_norm": 3.1544086933135986, "learning_rate": 6.6232786016949155e-06, "loss": 1.435, "mean_token_accuracy": 0.7152144461870193, "num_tokens": 20522018.0, "step": 25502 }, { "epoch": 6.754237288135593, "grad_norm": 1.8934619426727295, "learning_rate": 6.623013771186442e-06, "loss": 0.95, "mean_token_accuracy": 0.7640068382024765, "num_tokens": 20523924.0, "step": 25504 }, { "epoch": 6.754766949152542, "grad_norm": 2.912555456161499, "learning_rate": 6.622748940677966e-06, "loss": 0.9557, "mean_token_accuracy": 0.7616939917206764, "num_tokens": 20525389.0, "step": 25506 }, { "epoch": 6.755296610169491, "grad_norm": 2.4574625492095947, "learning_rate": 6.622484110169493e-06, "loss": 1.4075, "mean_token_accuracy": 0.7030619010329247, "num_tokens": 20526927.0, "step": 25508 }, { "epoch": 6.75582627118644, "grad_norm": 3.0860817432403564, "learning_rate": 6.622219279661018e-06, "loss": 1.2588, "mean_token_accuracy": 0.7140330895781517, "num_tokens": 20528520.0, "step": 25510 }, { "epoch": 6.75635593220339, "grad_norm": 2.496752977371216, "learning_rate": 6.6219544491525436e-06, "loss": 1.0338, "mean_token_accuracy": 0.747893400490284, "num_tokens": 20530042.0, "step": 25512 }, { "epoch": 6.756885593220339, "grad_norm": 2.437377691268921, "learning_rate": 6.6216896186440685e-06, "loss": 1.5216, "mean_token_accuracy": 0.6471163257956505, "num_tokens": 20531928.0, "step": 25514 }, { "epoch": 6.757415254237288, "grad_norm": 2.5560410022735596, "learning_rate": 6.621424788135594e-06, "loss": 1.2956, "mean_token_accuracy": 0.693905308842659, "num_tokens": 20533471.0, "step": 25516 }, { "epoch": 6.757944915254237, "grad_norm": 2.539400100708008, "learning_rate": 6.621159957627119e-06, "loss": 1.3946, "mean_token_accuracy": 0.7004430592060089, "num_tokens": 20534967.0, "step": 25518 }, { "epoch": 6.758474576271187, "grad_norm": 2.8281266689300537, "learning_rate": 6.620895127118645e-06, "loss": 1.5224, "mean_token_accuracy": 0.665069580078125, "num_tokens": 20536435.0, "step": 25520 }, { "epoch": 6.759004237288136, "grad_norm": 2.527198553085327, "learning_rate": 6.62063029661017e-06, "loss": 1.2873, "mean_token_accuracy": 0.7191975340247154, "num_tokens": 20537761.0, "step": 25522 }, { "epoch": 6.759533898305085, "grad_norm": 2.6473336219787598, "learning_rate": 6.620365466101696e-06, "loss": 0.9142, "mean_token_accuracy": 0.7614691630005836, "num_tokens": 20539276.0, "step": 25524 }, { "epoch": 6.760063559322034, "grad_norm": 2.3168764114379883, "learning_rate": 6.620100635593221e-06, "loss": 1.1542, "mean_token_accuracy": 0.7133587449789047, "num_tokens": 20540757.0, "step": 25526 }, { "epoch": 6.760593220338983, "grad_norm": 2.625260591506958, "learning_rate": 6.619835805084746e-06, "loss": 1.447, "mean_token_accuracy": 0.6727380603551865, "num_tokens": 20542526.0, "step": 25528 }, { "epoch": 6.7611228813559325, "grad_norm": 1.7446383237838745, "learning_rate": 6.619570974576271e-06, "loss": 1.0058, "mean_token_accuracy": 0.7637836337089539, "num_tokens": 20544086.0, "step": 25530 }, { "epoch": 6.7616525423728815, "grad_norm": 2.3530116081237793, "learning_rate": 6.619306144067797e-06, "loss": 1.5682, "mean_token_accuracy": 0.6396607086062431, "num_tokens": 20545932.0, "step": 25532 }, { "epoch": 6.7621822033898304, "grad_norm": 2.197578191757202, "learning_rate": 6.619041313559322e-06, "loss": 1.1653, "mean_token_accuracy": 0.7441490218043327, "num_tokens": 20547951.0, "step": 25534 }, { "epoch": 6.762711864406779, "grad_norm": 2.8290603160858154, "learning_rate": 6.618776483050849e-06, "loss": 0.9781, "mean_token_accuracy": 0.7569847032427788, "num_tokens": 20549366.0, "step": 25536 }, { "epoch": 6.763241525423728, "grad_norm": 2.3693196773529053, "learning_rate": 6.6185116525423736e-06, "loss": 1.1227, "mean_token_accuracy": 0.7298580184578896, "num_tokens": 20550793.0, "step": 25538 }, { "epoch": 6.763771186440678, "grad_norm": 2.2338151931762695, "learning_rate": 6.618246822033899e-06, "loss": 1.0181, "mean_token_accuracy": 0.7451231926679611, "num_tokens": 20552533.0, "step": 25540 }, { "epoch": 6.764300847457627, "grad_norm": 3.0906453132629395, "learning_rate": 6.617981991525424e-06, "loss": 0.8471, "mean_token_accuracy": 0.7944796830415726, "num_tokens": 20553900.0, "step": 25542 }, { "epoch": 6.764830508474576, "grad_norm": 3.598450183868408, "learning_rate": 6.61771716101695e-06, "loss": 1.2525, "mean_token_accuracy": 0.7047576382756233, "num_tokens": 20555751.0, "step": 25544 }, { "epoch": 6.765360169491525, "grad_norm": 2.3801114559173584, "learning_rate": 6.617452330508475e-06, "loss": 0.9987, "mean_token_accuracy": 0.7413132190704346, "num_tokens": 20557217.0, "step": 25546 }, { "epoch": 6.765889830508475, "grad_norm": 2.990793466567993, "learning_rate": 6.617187500000001e-06, "loss": 1.3538, "mean_token_accuracy": 0.7227713987231255, "num_tokens": 20558889.0, "step": 25548 }, { "epoch": 6.766419491525424, "grad_norm": 2.447543144226074, "learning_rate": 6.616922669491526e-06, "loss": 1.2831, "mean_token_accuracy": 0.7001888528466225, "num_tokens": 20560366.0, "step": 25550 }, { "epoch": 6.766949152542373, "grad_norm": 2.845798969268799, "learning_rate": 6.6166578389830515e-06, "loss": 1.3269, "mean_token_accuracy": 0.6856959760189056, "num_tokens": 20562136.0, "step": 25552 }, { "epoch": 6.767478813559322, "grad_norm": 2.464242458343506, "learning_rate": 6.616393008474576e-06, "loss": 1.3287, "mean_token_accuracy": 0.6961916014552116, "num_tokens": 20564060.0, "step": 25554 }, { "epoch": 6.768008474576272, "grad_norm": 2.3026046752929688, "learning_rate": 6.616128177966102e-06, "loss": 1.5197, "mean_token_accuracy": 0.6623593159019947, "num_tokens": 20565761.0, "step": 25556 }, { "epoch": 6.768538135593221, "grad_norm": 2.337981700897217, "learning_rate": 6.615863347457627e-06, "loss": 1.1905, "mean_token_accuracy": 0.7157800942659378, "num_tokens": 20567398.0, "step": 25558 }, { "epoch": 6.7690677966101696, "grad_norm": 3.186068058013916, "learning_rate": 6.615598516949153e-06, "loss": 1.2965, "mean_token_accuracy": 0.7070994824171066, "num_tokens": 20568932.0, "step": 25560 }, { "epoch": 6.7695974576271185, "grad_norm": 2.272219181060791, "learning_rate": 6.615333686440678e-06, "loss": 1.3919, "mean_token_accuracy": 0.685451976954937, "num_tokens": 20570597.0, "step": 25562 }, { "epoch": 6.7701271186440675, "grad_norm": 2.843555212020874, "learning_rate": 6.615068855932204e-06, "loss": 1.6243, "mean_token_accuracy": 0.6223902702331543, "num_tokens": 20572333.0, "step": 25564 }, { "epoch": 6.770656779661017, "grad_norm": 2.106084108352661, "learning_rate": 6.6148040254237285e-06, "loss": 1.1161, "mean_token_accuracy": 0.7501640021800995, "num_tokens": 20574011.0, "step": 25566 }, { "epoch": 6.771186440677966, "grad_norm": 2.7434334754943848, "learning_rate": 6.614539194915255e-06, "loss": 1.2763, "mean_token_accuracy": 0.7179818116128445, "num_tokens": 20575307.0, "step": 25568 }, { "epoch": 6.771716101694915, "grad_norm": 1.5016717910766602, "learning_rate": 6.61427436440678e-06, "loss": 1.0787, "mean_token_accuracy": 0.704172395169735, "num_tokens": 20578042.0, "step": 25570 }, { "epoch": 6.772245762711864, "grad_norm": 2.0447940826416016, "learning_rate": 6.614009533898306e-06, "loss": 1.1592, "mean_token_accuracy": 0.7419310510158539, "num_tokens": 20579587.0, "step": 25572 }, { "epoch": 6.772775423728813, "grad_norm": 2.685195207595825, "learning_rate": 6.613744703389831e-06, "loss": 1.2666, "mean_token_accuracy": 0.723302997648716, "num_tokens": 20581074.0, "step": 25574 }, { "epoch": 6.773305084745763, "grad_norm": 2.451789617538452, "learning_rate": 6.6134798728813565e-06, "loss": 1.5284, "mean_token_accuracy": 0.657016359269619, "num_tokens": 20582834.0, "step": 25576 }, { "epoch": 6.773834745762712, "grad_norm": 1.898140788078308, "learning_rate": 6.6132150423728815e-06, "loss": 1.2877, "mean_token_accuracy": 0.6910211741924286, "num_tokens": 20584817.0, "step": 25578 }, { "epoch": 6.774364406779661, "grad_norm": 2.686009645462036, "learning_rate": 6.612950211864407e-06, "loss": 1.4156, "mean_token_accuracy": 0.6925224512815475, "num_tokens": 20586230.0, "step": 25580 }, { "epoch": 6.77489406779661, "grad_norm": 2.694037914276123, "learning_rate": 6.612685381355933e-06, "loss": 0.8283, "mean_token_accuracy": 0.7848619222640991, "num_tokens": 20587456.0, "step": 25582 }, { "epoch": 6.77542372881356, "grad_norm": 2.825385808944702, "learning_rate": 6.612420550847458e-06, "loss": 0.9248, "mean_token_accuracy": 0.7645494267344475, "num_tokens": 20588916.0, "step": 25584 }, { "epoch": 6.775953389830509, "grad_norm": 2.2445192337036133, "learning_rate": 6.612155720338984e-06, "loss": 1.3056, "mean_token_accuracy": 0.7088408395648003, "num_tokens": 20590539.0, "step": 25586 }, { "epoch": 6.776483050847458, "grad_norm": 2.3806941509246826, "learning_rate": 6.611890889830509e-06, "loss": 1.2578, "mean_token_accuracy": 0.7091693580150604, "num_tokens": 20592243.0, "step": 25588 }, { "epoch": 6.777012711864407, "grad_norm": 2.2929837703704834, "learning_rate": 6.611626059322035e-06, "loss": 1.2891, "mean_token_accuracy": 0.7163209021091461, "num_tokens": 20593751.0, "step": 25590 }, { "epoch": 6.777542372881356, "grad_norm": 2.2057747840881348, "learning_rate": 6.61136122881356e-06, "loss": 0.9403, "mean_token_accuracy": 0.7838641405105591, "num_tokens": 20594947.0, "step": 25592 }, { "epoch": 6.778072033898305, "grad_norm": 2.8781275749206543, "learning_rate": 6.611096398305086e-06, "loss": 1.1692, "mean_token_accuracy": 0.7192763686180115, "num_tokens": 20596448.0, "step": 25594 }, { "epoch": 6.778601694915254, "grad_norm": 2.352597713470459, "learning_rate": 6.610831567796611e-06, "loss": 1.0115, "mean_token_accuracy": 0.7631355598568916, "num_tokens": 20597780.0, "step": 25596 }, { "epoch": 6.779131355932203, "grad_norm": 2.5241334438323975, "learning_rate": 6.610566737288137e-06, "loss": 1.1141, "mean_token_accuracy": 0.7551899030804634, "num_tokens": 20599208.0, "step": 25598 }, { "epoch": 6.779661016949152, "grad_norm": 2.5588741302490234, "learning_rate": 6.610301906779662e-06, "loss": 1.2608, "mean_token_accuracy": 0.7038571611046791, "num_tokens": 20600862.0, "step": 25600 }, { "epoch": 6.780190677966102, "grad_norm": 2.2662606239318848, "learning_rate": 6.610037076271187e-06, "loss": 0.8332, "mean_token_accuracy": 0.7723320871591568, "num_tokens": 20602502.0, "step": 25602 }, { "epoch": 6.780720338983051, "grad_norm": 2.6559252738952637, "learning_rate": 6.609772245762712e-06, "loss": 1.2065, "mean_token_accuracy": 0.7382273077964783, "num_tokens": 20603959.0, "step": 25604 }, { "epoch": 6.78125, "grad_norm": 2.572185754776001, "learning_rate": 6.609507415254238e-06, "loss": 1.2351, "mean_token_accuracy": 0.7076466977596283, "num_tokens": 20605482.0, "step": 25606 }, { "epoch": 6.781779661016949, "grad_norm": 2.613929510116577, "learning_rate": 6.609242584745763e-06, "loss": 1.2605, "mean_token_accuracy": 0.7085578218102455, "num_tokens": 20606919.0, "step": 25608 }, { "epoch": 6.782309322033898, "grad_norm": 2.481637954711914, "learning_rate": 6.608977754237289e-06, "loss": 1.1379, "mean_token_accuracy": 0.7298336774110794, "num_tokens": 20608402.0, "step": 25610 }, { "epoch": 6.782838983050848, "grad_norm": 2.5360159873962402, "learning_rate": 6.608712923728814e-06, "loss": 1.2114, "mean_token_accuracy": 0.7218405678868294, "num_tokens": 20610106.0, "step": 25612 }, { "epoch": 6.783368644067797, "grad_norm": 2.7858211994171143, "learning_rate": 6.6084480932203395e-06, "loss": 1.7075, "mean_token_accuracy": 0.621273297816515, "num_tokens": 20611737.0, "step": 25614 }, { "epoch": 6.783898305084746, "grad_norm": 2.3332924842834473, "learning_rate": 6.6081832627118644e-06, "loss": 1.4431, "mean_token_accuracy": 0.702309250831604, "num_tokens": 20613101.0, "step": 25616 }, { "epoch": 6.784427966101695, "grad_norm": 1.9396352767944336, "learning_rate": 6.607918432203391e-06, "loss": 1.0499, "mean_token_accuracy": 0.7608431205153465, "num_tokens": 20614715.0, "step": 25618 }, { "epoch": 6.784957627118644, "grad_norm": 2.043423891067505, "learning_rate": 6.607653601694915e-06, "loss": 1.0627, "mean_token_accuracy": 0.756373718380928, "num_tokens": 20616355.0, "step": 25620 }, { "epoch": 6.785487288135593, "grad_norm": 2.849680185317993, "learning_rate": 6.607388771186442e-06, "loss": 1.6704, "mean_token_accuracy": 0.6564717069268227, "num_tokens": 20617740.0, "step": 25622 }, { "epoch": 6.786016949152542, "grad_norm": 1.7525663375854492, "learning_rate": 6.607123940677967e-06, "loss": 1.1582, "mean_token_accuracy": 0.7663157731294632, "num_tokens": 20619249.0, "step": 25624 }, { "epoch": 6.786546610169491, "grad_norm": 2.5727953910827637, "learning_rate": 6.6068591101694925e-06, "loss": 1.3429, "mean_token_accuracy": 0.7020289748907089, "num_tokens": 20620971.0, "step": 25626 }, { "epoch": 6.78707627118644, "grad_norm": 2.4008827209472656, "learning_rate": 6.606594279661017e-06, "loss": 1.1077, "mean_token_accuracy": 0.7129426896572113, "num_tokens": 20622517.0, "step": 25628 }, { "epoch": 6.78760593220339, "grad_norm": 2.2840864658355713, "learning_rate": 6.606329449152543e-06, "loss": 1.0286, "mean_token_accuracy": 0.7306738048791885, "num_tokens": 20624260.0, "step": 25630 }, { "epoch": 6.788135593220339, "grad_norm": 2.3085598945617676, "learning_rate": 6.606064618644068e-06, "loss": 1.3007, "mean_token_accuracy": 0.7230819165706635, "num_tokens": 20625913.0, "step": 25632 }, { "epoch": 6.788665254237288, "grad_norm": 2.799393892288208, "learning_rate": 6.605799788135594e-06, "loss": 1.4764, "mean_token_accuracy": 0.6690010949969292, "num_tokens": 20627450.0, "step": 25634 }, { "epoch": 6.789194915254237, "grad_norm": 2.444736957550049, "learning_rate": 6.605534957627119e-06, "loss": 1.3205, "mean_token_accuracy": 0.6860583946108818, "num_tokens": 20629103.0, "step": 25636 }, { "epoch": 6.789724576271187, "grad_norm": 2.2402632236480713, "learning_rate": 6.605270127118645e-06, "loss": 0.941, "mean_token_accuracy": 0.7446254268288612, "num_tokens": 20630816.0, "step": 25638 }, { "epoch": 6.790254237288136, "grad_norm": 2.046278953552246, "learning_rate": 6.6050052966101695e-06, "loss": 1.2601, "mean_token_accuracy": 0.709313377737999, "num_tokens": 20632524.0, "step": 25640 }, { "epoch": 6.790783898305085, "grad_norm": 2.1243205070495605, "learning_rate": 6.604740466101695e-06, "loss": 1.0855, "mean_token_accuracy": 0.7366807833313942, "num_tokens": 20634160.0, "step": 25642 }, { "epoch": 6.791313559322034, "grad_norm": 2.978760242462158, "learning_rate": 6.60447563559322e-06, "loss": 1.4356, "mean_token_accuracy": 0.6834585964679718, "num_tokens": 20635586.0, "step": 25644 }, { "epoch": 6.791843220338983, "grad_norm": 2.3381619453430176, "learning_rate": 6.604210805084747e-06, "loss": 1.4211, "mean_token_accuracy": 0.6768958419561386, "num_tokens": 20637170.0, "step": 25646 }, { "epoch": 6.7923728813559325, "grad_norm": 2.178550958633423, "learning_rate": 6.603945974576271e-06, "loss": 0.9078, "mean_token_accuracy": 0.7687244713306427, "num_tokens": 20638679.0, "step": 25648 }, { "epoch": 6.7929025423728815, "grad_norm": 2.231110095977783, "learning_rate": 6.6036811440677976e-06, "loss": 1.1029, "mean_token_accuracy": 0.7269011661410332, "num_tokens": 20640479.0, "step": 25650 }, { "epoch": 6.7934322033898304, "grad_norm": 2.199995517730713, "learning_rate": 6.6034163135593225e-06, "loss": 1.0969, "mean_token_accuracy": 0.7092127427458763, "num_tokens": 20642149.0, "step": 25652 }, { "epoch": 6.793961864406779, "grad_norm": 2.1100940704345703, "learning_rate": 6.603151483050848e-06, "loss": 0.9232, "mean_token_accuracy": 0.7720966264605522, "num_tokens": 20643721.0, "step": 25654 }, { "epoch": 6.794491525423728, "grad_norm": 2.7248551845550537, "learning_rate": 6.602886652542373e-06, "loss": 1.3868, "mean_token_accuracy": 0.6985538303852081, "num_tokens": 20645442.0, "step": 25656 }, { "epoch": 6.795021186440678, "grad_norm": 2.3784396648406982, "learning_rate": 6.602621822033899e-06, "loss": 0.9031, "mean_token_accuracy": 0.7753820940852165, "num_tokens": 20646990.0, "step": 25658 }, { "epoch": 6.795550847457627, "grad_norm": 2.3917694091796875, "learning_rate": 6.602356991525424e-06, "loss": 0.7856, "mean_token_accuracy": 0.7885515242815018, "num_tokens": 20648418.0, "step": 25660 }, { "epoch": 6.796080508474576, "grad_norm": 2.485381603240967, "learning_rate": 6.60209216101695e-06, "loss": 1.2988, "mean_token_accuracy": 0.6991351768374443, "num_tokens": 20650054.0, "step": 25662 }, { "epoch": 6.796610169491525, "grad_norm": 2.105158567428589, "learning_rate": 6.601827330508475e-06, "loss": 1.1646, "mean_token_accuracy": 0.6936929076910019, "num_tokens": 20652338.0, "step": 25664 }, { "epoch": 6.797139830508475, "grad_norm": 2.34425950050354, "learning_rate": 6.6015625e-06, "loss": 1.3775, "mean_token_accuracy": 0.7195102386176586, "num_tokens": 20653683.0, "step": 25666 }, { "epoch": 6.797669491525424, "grad_norm": 2.107454776763916, "learning_rate": 6.601297669491526e-06, "loss": 1.4447, "mean_token_accuracy": 0.6992665454745293, "num_tokens": 20655292.0, "step": 25668 }, { "epoch": 6.798199152542373, "grad_norm": 2.6070151329040527, "learning_rate": 6.601032838983051e-06, "loss": 1.2768, "mean_token_accuracy": 0.7009619772434235, "num_tokens": 20656712.0, "step": 25670 }, { "epoch": 6.798728813559322, "grad_norm": 2.659597158432007, "learning_rate": 6.600768008474578e-06, "loss": 0.9905, "mean_token_accuracy": 0.7517279982566833, "num_tokens": 20658181.0, "step": 25672 }, { "epoch": 6.799258474576272, "grad_norm": 2.5303797721862793, "learning_rate": 6.600503177966102e-06, "loss": 1.2907, "mean_token_accuracy": 0.7345425263047218, "num_tokens": 20659752.0, "step": 25674 }, { "epoch": 6.799788135593221, "grad_norm": 1.9959297180175781, "learning_rate": 6.600238347457628e-06, "loss": 1.1484, "mean_token_accuracy": 0.7203503474593163, "num_tokens": 20661168.0, "step": 25676 }, { "epoch": 6.8003177966101696, "grad_norm": 2.2533209323883057, "learning_rate": 6.599973516949153e-06, "loss": 0.8727, "mean_token_accuracy": 0.8030659034848213, "num_tokens": 20662825.0, "step": 25678 }, { "epoch": 6.8008474576271185, "grad_norm": 2.2111599445343018, "learning_rate": 6.599708686440679e-06, "loss": 0.9396, "mean_token_accuracy": 0.7648819610476494, "num_tokens": 20664490.0, "step": 25680 }, { "epoch": 6.8013771186440675, "grad_norm": 2.1045899391174316, "learning_rate": 6.599443855932204e-06, "loss": 1.3378, "mean_token_accuracy": 0.7318943217396736, "num_tokens": 20666112.0, "step": 25682 }, { "epoch": 6.801906779661017, "grad_norm": 2.8250303268432617, "learning_rate": 6.59917902542373e-06, "loss": 1.4745, "mean_token_accuracy": 0.6935129538178444, "num_tokens": 20667504.0, "step": 25684 }, { "epoch": 6.802436440677966, "grad_norm": 1.8507765531539917, "learning_rate": 6.598914194915255e-06, "loss": 0.8369, "mean_token_accuracy": 0.8074102625250816, "num_tokens": 20669054.0, "step": 25686 }, { "epoch": 6.802966101694915, "grad_norm": 2.591765880584717, "learning_rate": 6.5986493644067805e-06, "loss": 1.3429, "mean_token_accuracy": 0.6931831240653992, "num_tokens": 20670573.0, "step": 25688 }, { "epoch": 6.803495762711864, "grad_norm": 2.5209624767303467, "learning_rate": 6.5983845338983055e-06, "loss": 1.6087, "mean_token_accuracy": 0.6749193966388702, "num_tokens": 20672063.0, "step": 25690 }, { "epoch": 6.804025423728813, "grad_norm": 2.7736847400665283, "learning_rate": 6.598119703389831e-06, "loss": 1.2441, "mean_token_accuracy": 0.6972873210906982, "num_tokens": 20673925.0, "step": 25692 }, { "epoch": 6.804555084745763, "grad_norm": 2.5177621841430664, "learning_rate": 6.597854872881356e-06, "loss": 1.3563, "mean_token_accuracy": 0.6996370106935501, "num_tokens": 20675341.0, "step": 25694 }, { "epoch": 6.805084745762712, "grad_norm": 2.4271883964538574, "learning_rate": 6.597590042372882e-06, "loss": 1.2784, "mean_token_accuracy": 0.722664438188076, "num_tokens": 20676670.0, "step": 25696 }, { "epoch": 6.805614406779661, "grad_norm": 2.4193222522735596, "learning_rate": 6.597325211864407e-06, "loss": 1.4275, "mean_token_accuracy": 0.6653253361582756, "num_tokens": 20678409.0, "step": 25698 }, { "epoch": 6.80614406779661, "grad_norm": 3.1074814796447754, "learning_rate": 6.5970603813559335e-06, "loss": 1.1621, "mean_token_accuracy": 0.7272377610206604, "num_tokens": 20679817.0, "step": 25700 }, { "epoch": 6.80667372881356, "grad_norm": 2.725177049636841, "learning_rate": 6.5967955508474576e-06, "loss": 1.0343, "mean_token_accuracy": 0.7466734945774078, "num_tokens": 20681201.0, "step": 25702 }, { "epoch": 6.807203389830509, "grad_norm": 2.00172758102417, "learning_rate": 6.596530720338984e-06, "loss": 1.2418, "mean_token_accuracy": 0.701410599052906, "num_tokens": 20682902.0, "step": 25704 }, { "epoch": 6.807733050847458, "grad_norm": 2.7839510440826416, "learning_rate": 6.596265889830509e-06, "loss": 1.2611, "mean_token_accuracy": 0.7276530340313911, "num_tokens": 20684553.0, "step": 25706 }, { "epoch": 6.808262711864407, "grad_norm": 2.8629794120788574, "learning_rate": 6.596001059322035e-06, "loss": 0.9923, "mean_token_accuracy": 0.7405627518892288, "num_tokens": 20685960.0, "step": 25708 }, { "epoch": 6.808792372881356, "grad_norm": 2.5787882804870605, "learning_rate": 6.59573622881356e-06, "loss": 1.6432, "mean_token_accuracy": 0.6466041281819344, "num_tokens": 20687532.0, "step": 25710 }, { "epoch": 6.809322033898305, "grad_norm": 2.063823699951172, "learning_rate": 6.595471398305086e-06, "loss": 1.1196, "mean_token_accuracy": 0.7369163185358047, "num_tokens": 20689138.0, "step": 25712 }, { "epoch": 6.809851694915254, "grad_norm": 2.242685079574585, "learning_rate": 6.5952065677966105e-06, "loss": 0.858, "mean_token_accuracy": 0.7676454186439514, "num_tokens": 20690585.0, "step": 25714 }, { "epoch": 6.810381355932203, "grad_norm": 2.37923526763916, "learning_rate": 6.594941737288136e-06, "loss": 1.5026, "mean_token_accuracy": 0.6555998623371124, "num_tokens": 20692355.0, "step": 25716 }, { "epoch": 6.810911016949152, "grad_norm": 2.3060178756713867, "learning_rate": 6.594676906779661e-06, "loss": 0.9477, "mean_token_accuracy": 0.7713489234447479, "num_tokens": 20693655.0, "step": 25718 }, { "epoch": 6.811440677966102, "grad_norm": 2.308396816253662, "learning_rate": 6.594412076271187e-06, "loss": 1.4161, "mean_token_accuracy": 0.6836560145020485, "num_tokens": 20695102.0, "step": 25720 }, { "epoch": 6.811970338983051, "grad_norm": 2.387643575668335, "learning_rate": 6.594147245762712e-06, "loss": 0.9144, "mean_token_accuracy": 0.7796351388096809, "num_tokens": 20696510.0, "step": 25722 }, { "epoch": 6.8125, "grad_norm": 2.3492019176483154, "learning_rate": 6.593882415254238e-06, "loss": 1.3864, "mean_token_accuracy": 0.7140684574842453, "num_tokens": 20698312.0, "step": 25724 }, { "epoch": 6.813029661016949, "grad_norm": 2.2515628337860107, "learning_rate": 6.593617584745763e-06, "loss": 1.1289, "mean_token_accuracy": 0.7184673026204109, "num_tokens": 20699953.0, "step": 25726 }, { "epoch": 6.813559322033898, "grad_norm": 2.397209405899048, "learning_rate": 6.5933527542372884e-06, "loss": 0.9685, "mean_token_accuracy": 0.7581987306475639, "num_tokens": 20701401.0, "step": 25728 }, { "epoch": 6.814088983050848, "grad_norm": 2.670531749725342, "learning_rate": 6.593087923728813e-06, "loss": 1.412, "mean_token_accuracy": 0.7054605595767498, "num_tokens": 20702925.0, "step": 25730 }, { "epoch": 6.814618644067797, "grad_norm": 2.561676025390625, "learning_rate": 6.59282309322034e-06, "loss": 1.138, "mean_token_accuracy": 0.7000536248087883, "num_tokens": 20704575.0, "step": 25732 }, { "epoch": 6.815148305084746, "grad_norm": 2.880018472671509, "learning_rate": 6.592558262711865e-06, "loss": 1.0849, "mean_token_accuracy": 0.7334819659590721, "num_tokens": 20706093.0, "step": 25734 }, { "epoch": 6.815677966101695, "grad_norm": 2.684586524963379, "learning_rate": 6.592293432203391e-06, "loss": 1.7817, "mean_token_accuracy": 0.6163738518953323, "num_tokens": 20707754.0, "step": 25736 }, { "epoch": 6.816207627118644, "grad_norm": 2.222097158432007, "learning_rate": 6.592028601694916e-06, "loss": 0.7333, "mean_token_accuracy": 0.7974580004811287, "num_tokens": 20709222.0, "step": 25738 }, { "epoch": 6.816737288135593, "grad_norm": 2.9265661239624023, "learning_rate": 6.591763771186441e-06, "loss": 1.2441, "mean_token_accuracy": 0.7098879814147949, "num_tokens": 20710740.0, "step": 25740 }, { "epoch": 6.817266949152542, "grad_norm": 2.189908504486084, "learning_rate": 6.591498940677966e-06, "loss": 1.1832, "mean_token_accuracy": 0.7224886789917946, "num_tokens": 20712330.0, "step": 25742 }, { "epoch": 6.817796610169491, "grad_norm": 2.3467025756835938, "learning_rate": 6.591234110169492e-06, "loss": 1.3159, "mean_token_accuracy": 0.6716778576374054, "num_tokens": 20714402.0, "step": 25744 }, { "epoch": 6.81832627118644, "grad_norm": 2.193507432937622, "learning_rate": 6.590969279661017e-06, "loss": 1.1034, "mean_token_accuracy": 0.7394284158945084, "num_tokens": 20716175.0, "step": 25746 }, { "epoch": 6.81885593220339, "grad_norm": 2.8075673580169678, "learning_rate": 6.590704449152543e-06, "loss": 1.457, "mean_token_accuracy": 0.6610424891114235, "num_tokens": 20717818.0, "step": 25748 }, { "epoch": 6.819385593220339, "grad_norm": 2.521265745162964, "learning_rate": 6.590439618644068e-06, "loss": 1.1024, "step": 25750 }, { "epoch": 6.819385593220339, "eval_loss": 1.3263167142868042, "eval_mean_token_accuracy": 0.7003000366029801, "eval_num_tokens": 20719365.0, "eval_runtime": 48.5951, "eval_samples_per_second": 6.338, "eval_steps_per_second": 6.338, "step": 25750 }, { "epoch": 6.819915254237288, "grad_norm": 2.6728758811950684, "learning_rate": 6.5901747881355935e-06, "loss": 0.9707, "mean_token_accuracy": 0.7546042539179325, "num_tokens": 20720739.0, "step": 25752 }, { "epoch": 6.820444915254237, "grad_norm": 2.4430770874023438, "learning_rate": 6.58990995762712e-06, "loss": 1.285, "mean_token_accuracy": 0.6972101032733917, "num_tokens": 20722664.0, "step": 25754 }, { "epoch": 6.820974576271187, "grad_norm": 2.2789626121520996, "learning_rate": 6.589645127118644e-06, "loss": 0.8999, "mean_token_accuracy": 0.782946027815342, "num_tokens": 20724315.0, "step": 25756 }, { "epoch": 6.821504237288136, "grad_norm": 2.5731897354125977, "learning_rate": 6.589380296610171e-06, "loss": 1.7775, "mean_token_accuracy": 0.6212973818182945, "num_tokens": 20725958.0, "step": 25758 }, { "epoch": 6.822033898305085, "grad_norm": 2.1198272705078125, "learning_rate": 6.589115466101696e-06, "loss": 1.2393, "mean_token_accuracy": 0.7327980250120163, "num_tokens": 20727492.0, "step": 25760 }, { "epoch": 6.822563559322034, "grad_norm": 2.294088125228882, "learning_rate": 6.5888506355932215e-06, "loss": 1.3218, "mean_token_accuracy": 0.6872286908328533, "num_tokens": 20729492.0, "step": 25762 }, { "epoch": 6.823093220338983, "grad_norm": 2.6963751316070557, "learning_rate": 6.5885858050847465e-06, "loss": 1.1876, "mean_token_accuracy": 0.7084666416049004, "num_tokens": 20730828.0, "step": 25764 }, { "epoch": 6.8236228813559325, "grad_norm": 2.5747711658477783, "learning_rate": 6.588320974576272e-06, "loss": 1.2527, "mean_token_accuracy": 0.7041991055011749, "num_tokens": 20732487.0, "step": 25766 }, { "epoch": 6.8241525423728815, "grad_norm": 2.103891611099243, "learning_rate": 6.588056144067797e-06, "loss": 1.621, "mean_token_accuracy": 0.6434330195188522, "num_tokens": 20734187.0, "step": 25768 }, { "epoch": 6.8246822033898304, "grad_norm": 2.2472705841064453, "learning_rate": 6.587791313559323e-06, "loss": 1.5078, "mean_token_accuracy": 0.6711083091795444, "num_tokens": 20735890.0, "step": 25770 }, { "epoch": 6.825211864406779, "grad_norm": 2.4711601734161377, "learning_rate": 6.587526483050848e-06, "loss": 1.4832, "mean_token_accuracy": 0.664294920861721, "num_tokens": 20737561.0, "step": 25772 }, { "epoch": 6.825741525423728, "grad_norm": 2.602545738220215, "learning_rate": 6.587261652542374e-06, "loss": 1.2169, "mean_token_accuracy": 0.7359240278601646, "num_tokens": 20739158.0, "step": 25774 }, { "epoch": 6.826271186440678, "grad_norm": 2.1663734912872314, "learning_rate": 6.586996822033899e-06, "loss": 0.8676, "mean_token_accuracy": 0.7893359661102295, "num_tokens": 20740731.0, "step": 25776 }, { "epoch": 6.826800847457627, "grad_norm": 1.971517562866211, "learning_rate": 6.586731991525424e-06, "loss": 1.269, "mean_token_accuracy": 0.7013654336333275, "num_tokens": 20742276.0, "step": 25778 }, { "epoch": 6.827330508474576, "grad_norm": 3.0761401653289795, "learning_rate": 6.586467161016949e-06, "loss": 1.551, "mean_token_accuracy": 0.6618285998702049, "num_tokens": 20744038.0, "step": 25780 }, { "epoch": 6.827860169491525, "grad_norm": 2.152573585510254, "learning_rate": 6.586202330508475e-06, "loss": 1.0727, "mean_token_accuracy": 0.745725579559803, "num_tokens": 20745397.0, "step": 25782 }, { "epoch": 6.828389830508475, "grad_norm": 2.2353525161743164, "learning_rate": 6.5859375e-06, "loss": 0.9559, "mean_token_accuracy": 0.761418916285038, "num_tokens": 20746871.0, "step": 25784 }, { "epoch": 6.828919491525424, "grad_norm": 2.531332492828369, "learning_rate": 6.585672669491527e-06, "loss": 1.4591, "mean_token_accuracy": 0.670767642557621, "num_tokens": 20748510.0, "step": 25786 }, { "epoch": 6.829449152542373, "grad_norm": 2.13051176071167, "learning_rate": 6.5854078389830515e-06, "loss": 1.2105, "mean_token_accuracy": 0.7123627811670303, "num_tokens": 20750256.0, "step": 25788 }, { "epoch": 6.829978813559322, "grad_norm": 2.689286470413208, "learning_rate": 6.585143008474577e-06, "loss": 1.1026, "mean_token_accuracy": 0.7507492825388908, "num_tokens": 20751733.0, "step": 25790 }, { "epoch": 6.830508474576272, "grad_norm": 2.619255542755127, "learning_rate": 6.584878177966102e-06, "loss": 1.1875, "mean_token_accuracy": 0.7330455407500267, "num_tokens": 20753163.0, "step": 25792 }, { "epoch": 6.831038135593221, "grad_norm": 2.390842914581299, "learning_rate": 6.584613347457628e-06, "loss": 1.5014, "mean_token_accuracy": 0.6607448011636734, "num_tokens": 20754689.0, "step": 25794 }, { "epoch": 6.8315677966101696, "grad_norm": 1.9313602447509766, "learning_rate": 6.584348516949153e-06, "loss": 1.2997, "mean_token_accuracy": 0.6985055580735207, "num_tokens": 20756281.0, "step": 25796 }, { "epoch": 6.8320974576271185, "grad_norm": 2.4751369953155518, "learning_rate": 6.584083686440679e-06, "loss": 1.2209, "mean_token_accuracy": 0.7123468294739723, "num_tokens": 20758022.0, "step": 25798 }, { "epoch": 6.8326271186440675, "grad_norm": 2.4100821018218994, "learning_rate": 6.583818855932204e-06, "loss": 1.4912, "mean_token_accuracy": 0.6838851347565651, "num_tokens": 20759670.0, "step": 25800 }, { "epoch": 6.833156779661017, "grad_norm": 2.067479372024536, "learning_rate": 6.5835540254237294e-06, "loss": 1.1189, "mean_token_accuracy": 0.7198058515787125, "num_tokens": 20761654.0, "step": 25802 }, { "epoch": 6.833686440677966, "grad_norm": 2.6020870208740234, "learning_rate": 6.583289194915254e-06, "loss": 1.2813, "mean_token_accuracy": 0.6982117742300034, "num_tokens": 20763256.0, "step": 25804 }, { "epoch": 6.834216101694915, "grad_norm": 2.557709217071533, "learning_rate": 6.58302436440678e-06, "loss": 1.3766, "mean_token_accuracy": 0.6775215938687325, "num_tokens": 20764993.0, "step": 25806 }, { "epoch": 6.834745762711864, "grad_norm": 2.2281837463378906, "learning_rate": 6.582759533898305e-06, "loss": 1.3352, "mean_token_accuracy": 0.7027477398514748, "num_tokens": 20766910.0, "step": 25808 }, { "epoch": 6.835275423728813, "grad_norm": 1.9987818002700806, "learning_rate": 6.582494703389831e-06, "loss": 0.7313, "mean_token_accuracy": 0.8090715929865837, "num_tokens": 20768490.0, "step": 25810 }, { "epoch": 6.835805084745763, "grad_norm": 2.403620719909668, "learning_rate": 6.582229872881356e-06, "loss": 1.2689, "mean_token_accuracy": 0.7175905331969261, "num_tokens": 20770255.0, "step": 25812 }, { "epoch": 6.836334745762712, "grad_norm": 1.7687815427780151, "learning_rate": 6.581965042372882e-06, "loss": 1.235, "mean_token_accuracy": 0.7153368964791298, "num_tokens": 20771849.0, "step": 25814 }, { "epoch": 6.836864406779661, "grad_norm": 2.078329563140869, "learning_rate": 6.5817002118644065e-06, "loss": 1.1233, "mean_token_accuracy": 0.7403949052095413, "num_tokens": 20773427.0, "step": 25816 }, { "epoch": 6.83739406779661, "grad_norm": 1.815402626991272, "learning_rate": 6.581435381355933e-06, "loss": 0.9003, "mean_token_accuracy": 0.7820657268166542, "num_tokens": 20774819.0, "step": 25818 }, { "epoch": 6.83792372881356, "grad_norm": 1.8312443494796753, "learning_rate": 6.581170550847458e-06, "loss": 0.781, "mean_token_accuracy": 0.7975683659315109, "num_tokens": 20776450.0, "step": 25820 }, { "epoch": 6.838453389830509, "grad_norm": 2.441422462463379, "learning_rate": 6.580905720338984e-06, "loss": 1.8267, "mean_token_accuracy": 0.5946985483169556, "num_tokens": 20778091.0, "step": 25822 }, { "epoch": 6.838983050847458, "grad_norm": 1.924560546875, "learning_rate": 6.580640889830509e-06, "loss": 0.771, "mean_token_accuracy": 0.8044344633817673, "num_tokens": 20780569.0, "step": 25824 }, { "epoch": 6.839512711864407, "grad_norm": 2.599508762359619, "learning_rate": 6.5803760593220345e-06, "loss": 1.3484, "mean_token_accuracy": 0.6863463371992111, "num_tokens": 20782199.0, "step": 25826 }, { "epoch": 6.840042372881356, "grad_norm": 2.388153553009033, "learning_rate": 6.5801112288135594e-06, "loss": 1.0079, "mean_token_accuracy": 0.7535788640379906, "num_tokens": 20783919.0, "step": 25828 }, { "epoch": 6.840572033898305, "grad_norm": 2.737715482711792, "learning_rate": 6.579846398305085e-06, "loss": 1.2172, "mean_token_accuracy": 0.7187396362423897, "num_tokens": 20785587.0, "step": 25830 }, { "epoch": 6.841101694915254, "grad_norm": 2.2970969676971436, "learning_rate": 6.57958156779661e-06, "loss": 1.4291, "mean_token_accuracy": 0.6858013048768044, "num_tokens": 20787360.0, "step": 25832 }, { "epoch": 6.841631355932203, "grad_norm": 2.5052919387817383, "learning_rate": 6.579316737288136e-06, "loss": 0.9499, "mean_token_accuracy": 0.7692401632666588, "num_tokens": 20788760.0, "step": 25834 }, { "epoch": 6.842161016949152, "grad_norm": 2.8581576347351074, "learning_rate": 6.579051906779662e-06, "loss": 1.5219, "mean_token_accuracy": 0.6562951058149338, "num_tokens": 20790235.0, "step": 25836 }, { "epoch": 6.842690677966102, "grad_norm": 2.4757111072540283, "learning_rate": 6.578787076271187e-06, "loss": 1.0636, "mean_token_accuracy": 0.7540056854486465, "num_tokens": 20791611.0, "step": 25838 }, { "epoch": 6.843220338983051, "grad_norm": 2.491485834121704, "learning_rate": 6.578522245762713e-06, "loss": 1.1243, "mean_token_accuracy": 0.7409603297710419, "num_tokens": 20792923.0, "step": 25840 }, { "epoch": 6.84375, "grad_norm": 2.4621574878692627, "learning_rate": 6.578257415254238e-06, "loss": 1.2781, "mean_token_accuracy": 0.7128861322999, "num_tokens": 20794632.0, "step": 25842 }, { "epoch": 6.844279661016949, "grad_norm": 1.929281234741211, "learning_rate": 6.577992584745764e-06, "loss": 1.1098, "mean_token_accuracy": 0.735523447394371, "num_tokens": 20796401.0, "step": 25844 }, { "epoch": 6.844809322033898, "grad_norm": 1.322744607925415, "learning_rate": 6.577727754237289e-06, "loss": 1.7571, "mean_token_accuracy": 0.6576253045350313, "num_tokens": 20798531.0, "step": 25846 }, { "epoch": 6.845338983050848, "grad_norm": 2.1411309242248535, "learning_rate": 6.577462923728815e-06, "loss": 1.1485, "mean_token_accuracy": 0.7369119375944138, "num_tokens": 20800048.0, "step": 25848 }, { "epoch": 6.845868644067797, "grad_norm": 2.1019608974456787, "learning_rate": 6.57719809322034e-06, "loss": 1.1084, "mean_token_accuracy": 0.7380213066935539, "num_tokens": 20801601.0, "step": 25850 }, { "epoch": 6.846398305084746, "grad_norm": 2.8401718139648438, "learning_rate": 6.576933262711865e-06, "loss": 0.9159, "mean_token_accuracy": 0.7624134980142117, "num_tokens": 20802979.0, "step": 25852 }, { "epoch": 6.846927966101695, "grad_norm": 2.159724712371826, "learning_rate": 6.57666843220339e-06, "loss": 1.1896, "mean_token_accuracy": 0.727731503546238, "num_tokens": 20804474.0, "step": 25854 }, { "epoch": 6.847457627118644, "grad_norm": 2.373114824295044, "learning_rate": 6.576403601694916e-06, "loss": 1.4693, "mean_token_accuracy": 0.6629615351557732, "num_tokens": 20806118.0, "step": 25856 }, { "epoch": 6.847987288135593, "grad_norm": 2.6031582355499268, "learning_rate": 6.576138771186441e-06, "loss": 0.9887, "mean_token_accuracy": 0.7551006078720093, "num_tokens": 20807806.0, "step": 25858 }, { "epoch": 6.848516949152542, "grad_norm": 2.952336549758911, "learning_rate": 6.575873940677967e-06, "loss": 1.5746, "mean_token_accuracy": 0.6723706796765327, "num_tokens": 20809273.0, "step": 25860 }, { "epoch": 6.849046610169491, "grad_norm": 2.350917100906372, "learning_rate": 6.575609110169492e-06, "loss": 1.5784, "mean_token_accuracy": 0.6623199284076691, "num_tokens": 20810927.0, "step": 25862 }, { "epoch": 6.84957627118644, "grad_norm": 2.149376630783081, "learning_rate": 6.5753442796610175e-06, "loss": 1.1128, "mean_token_accuracy": 0.7364462167024612, "num_tokens": 20812559.0, "step": 25864 }, { "epoch": 6.85010593220339, "grad_norm": 2.2752909660339355, "learning_rate": 6.575079449152542e-06, "loss": 1.6483, "mean_token_accuracy": 0.6312622278928757, "num_tokens": 20814315.0, "step": 25866 }, { "epoch": 6.850635593220339, "grad_norm": 2.448749303817749, "learning_rate": 6.574814618644069e-06, "loss": 1.286, "mean_token_accuracy": 0.7000144869089127, "num_tokens": 20815843.0, "step": 25868 }, { "epoch": 6.851165254237288, "grad_norm": 2.799811840057373, "learning_rate": 6.574549788135593e-06, "loss": 1.1483, "mean_token_accuracy": 0.7501963824033737, "num_tokens": 20817307.0, "step": 25870 }, { "epoch": 6.851694915254237, "grad_norm": 2.3583619594573975, "learning_rate": 6.57428495762712e-06, "loss": 1.1382, "mean_token_accuracy": 0.7074445113539696, "num_tokens": 20818991.0, "step": 25872 }, { "epoch": 6.852224576271187, "grad_norm": 2.2331528663635254, "learning_rate": 6.574020127118645e-06, "loss": 1.087, "mean_token_accuracy": 0.7630971819162369, "num_tokens": 20820633.0, "step": 25874 }, { "epoch": 6.852754237288136, "grad_norm": 2.7923800945281982, "learning_rate": 6.5737552966101705e-06, "loss": 1.3212, "mean_token_accuracy": 0.7119579017162323, "num_tokens": 20821994.0, "step": 25876 }, { "epoch": 6.853283898305085, "grad_norm": 2.5148260593414307, "learning_rate": 6.573490466101695e-06, "loss": 0.7875, "mean_token_accuracy": 0.8013965338468552, "num_tokens": 20823479.0, "step": 25878 }, { "epoch": 6.853813559322034, "grad_norm": 2.6322081089019775, "learning_rate": 6.573225635593221e-06, "loss": 1.0093, "mean_token_accuracy": 0.7674286887049675, "num_tokens": 20825261.0, "step": 25880 }, { "epoch": 6.854343220338983, "grad_norm": 2.3517544269561768, "learning_rate": 6.572960805084746e-06, "loss": 1.0555, "mean_token_accuracy": 0.7513964101672173, "num_tokens": 20826701.0, "step": 25882 }, { "epoch": 6.8548728813559325, "grad_norm": 2.3155882358551025, "learning_rate": 6.572695974576272e-06, "loss": 1.4528, "mean_token_accuracy": 0.6720531061291695, "num_tokens": 20828431.0, "step": 25884 }, { "epoch": 6.8554025423728815, "grad_norm": 1.7423291206359863, "learning_rate": 6.572431144067797e-06, "loss": 0.844, "mean_token_accuracy": 0.7986004874110222, "num_tokens": 20830068.0, "step": 25886 }, { "epoch": 6.8559322033898304, "grad_norm": 2.386329174041748, "learning_rate": 6.5721663135593226e-06, "loss": 1.1787, "mean_token_accuracy": 0.7356855347752571, "num_tokens": 20831812.0, "step": 25888 }, { "epoch": 6.856461864406779, "grad_norm": 2.5031979084014893, "learning_rate": 6.5719014830508475e-06, "loss": 0.9779, "mean_token_accuracy": 0.7577934190630913, "num_tokens": 20833348.0, "step": 25890 }, { "epoch": 6.856991525423728, "grad_norm": 2.889026165008545, "learning_rate": 6.571636652542373e-06, "loss": 1.6278, "mean_token_accuracy": 0.6595025807619095, "num_tokens": 20834986.0, "step": 25892 }, { "epoch": 6.857521186440678, "grad_norm": 2.701930046081543, "learning_rate": 6.571371822033898e-06, "loss": 1.057, "mean_token_accuracy": 0.7448481097817421, "num_tokens": 20836392.0, "step": 25894 }, { "epoch": 6.858050847457627, "grad_norm": 2.003838300704956, "learning_rate": 6.571106991525425e-06, "loss": 1.1329, "mean_token_accuracy": 0.7169581986963749, "num_tokens": 20838252.0, "step": 25896 }, { "epoch": 6.858580508474576, "grad_norm": 2.4851462841033936, "learning_rate": 6.570842161016949e-06, "loss": 1.303, "mean_token_accuracy": 0.705996185541153, "num_tokens": 20839787.0, "step": 25898 }, { "epoch": 6.859110169491525, "grad_norm": 2.801152467727661, "learning_rate": 6.5705773305084755e-06, "loss": 1.1095, "mean_token_accuracy": 0.747429147362709, "num_tokens": 20841169.0, "step": 25900 }, { "epoch": 6.859639830508475, "grad_norm": 2.3220245838165283, "learning_rate": 6.5703125000000005e-06, "loss": 1.439, "mean_token_accuracy": 0.6729903295636177, "num_tokens": 20842875.0, "step": 25902 }, { "epoch": 6.860169491525424, "grad_norm": 2.854861259460449, "learning_rate": 6.570047669491526e-06, "loss": 1.0562, "mean_token_accuracy": 0.7405531778931618, "num_tokens": 20844203.0, "step": 25904 }, { "epoch": 6.860699152542373, "grad_norm": 2.9958088397979736, "learning_rate": 6.569782838983051e-06, "loss": 1.1524, "mean_token_accuracy": 0.7302247956395149, "num_tokens": 20845673.0, "step": 25906 }, { "epoch": 6.861228813559322, "grad_norm": 2.228710889816284, "learning_rate": 6.569518008474577e-06, "loss": 0.9712, "mean_token_accuracy": 0.7490220293402672, "num_tokens": 20847362.0, "step": 25908 }, { "epoch": 6.861758474576272, "grad_norm": 2.213569402694702, "learning_rate": 6.569253177966102e-06, "loss": 0.9498, "mean_token_accuracy": 0.7778190597891808, "num_tokens": 20848744.0, "step": 25910 }, { "epoch": 6.862288135593221, "grad_norm": 2.9150326251983643, "learning_rate": 6.568988347457628e-06, "loss": 1.1692, "mean_token_accuracy": 0.7070317938923836, "num_tokens": 20850525.0, "step": 25912 }, { "epoch": 6.8628177966101696, "grad_norm": 1.9420905113220215, "learning_rate": 6.568723516949153e-06, "loss": 1.1616, "mean_token_accuracy": 0.7267202287912369, "num_tokens": 20852508.0, "step": 25914 }, { "epoch": 6.8633474576271185, "grad_norm": 2.74442720413208, "learning_rate": 6.568458686440678e-06, "loss": 1.1943, "mean_token_accuracy": 0.7036984339356422, "num_tokens": 20854138.0, "step": 25916 }, { "epoch": 6.8638771186440675, "grad_norm": 2.8981616497039795, "learning_rate": 6.568193855932203e-06, "loss": 1.2482, "mean_token_accuracy": 0.6946230381727219, "num_tokens": 20856056.0, "step": 25918 }, { "epoch": 6.864406779661017, "grad_norm": 2.282137393951416, "learning_rate": 6.567929025423729e-06, "loss": 1.0568, "mean_token_accuracy": 0.7259570807218552, "num_tokens": 20857670.0, "step": 25920 }, { "epoch": 6.864936440677966, "grad_norm": 2.505934238433838, "learning_rate": 6.567664194915256e-06, "loss": 0.6974, "mean_token_accuracy": 0.8085156157612801, "num_tokens": 20859342.0, "step": 25922 }, { "epoch": 6.865466101694915, "grad_norm": 2.537224769592285, "learning_rate": 6.56739936440678e-06, "loss": 1.1061, "mean_token_accuracy": 0.7181462571024895, "num_tokens": 20861017.0, "step": 25924 }, { "epoch": 6.865995762711864, "grad_norm": 2.5564069747924805, "learning_rate": 6.567134533898306e-06, "loss": 1.2991, "mean_token_accuracy": 0.6885215193033218, "num_tokens": 20862628.0, "step": 25926 }, { "epoch": 6.866525423728813, "grad_norm": 2.7506275177001953, "learning_rate": 6.566869703389831e-06, "loss": 1.2623, "mean_token_accuracy": 0.7051409408450127, "num_tokens": 20864216.0, "step": 25928 }, { "epoch": 6.867055084745763, "grad_norm": 1.8835711479187012, "learning_rate": 6.566604872881357e-06, "loss": 1.3344, "mean_token_accuracy": 0.6822439953684807, "num_tokens": 20866030.0, "step": 25930 }, { "epoch": 6.867584745762712, "grad_norm": 2.7553606033325195, "learning_rate": 6.566340042372882e-06, "loss": 1.4788, "mean_token_accuracy": 0.6700554192066193, "num_tokens": 20867708.0, "step": 25932 }, { "epoch": 6.868114406779661, "grad_norm": 2.681051254272461, "learning_rate": 6.566075211864408e-06, "loss": 1.5659, "mean_token_accuracy": 0.6561955511569977, "num_tokens": 20869313.0, "step": 25934 }, { "epoch": 6.86864406779661, "grad_norm": 2.706402063369751, "learning_rate": 6.565810381355933e-06, "loss": 1.199, "mean_token_accuracy": 0.7330344766378403, "num_tokens": 20870983.0, "step": 25936 }, { "epoch": 6.86917372881356, "grad_norm": 2.3382699489593506, "learning_rate": 6.5655455508474585e-06, "loss": 1.4728, "mean_token_accuracy": 0.6560614928603172, "num_tokens": 20872963.0, "step": 25938 }, { "epoch": 6.869703389830509, "grad_norm": 2.3258886337280273, "learning_rate": 6.5652807203389834e-06, "loss": 1.0018, "mean_token_accuracy": 0.7704475745558739, "num_tokens": 20874458.0, "step": 25940 }, { "epoch": 6.870233050847458, "grad_norm": 2.8854475021362305, "learning_rate": 6.565015889830509e-06, "loss": 1.6296, "mean_token_accuracy": 0.6550365909934044, "num_tokens": 20875886.0, "step": 25942 }, { "epoch": 6.870762711864407, "grad_norm": 2.629300355911255, "learning_rate": 6.564751059322034e-06, "loss": 1.0323, "mean_token_accuracy": 0.7482230886816978, "num_tokens": 20877399.0, "step": 25944 }, { "epoch": 6.871292372881356, "grad_norm": 2.4724535942077637, "learning_rate": 6.56448622881356e-06, "loss": 1.49, "mean_token_accuracy": 0.6828088536858559, "num_tokens": 20879222.0, "step": 25946 }, { "epoch": 6.871822033898305, "grad_norm": 2.5595924854278564, "learning_rate": 6.564221398305085e-06, "loss": 1.1984, "mean_token_accuracy": 0.7330522164702415, "num_tokens": 20880654.0, "step": 25948 }, { "epoch": 6.872351694915254, "grad_norm": 1.82111656665802, "learning_rate": 6.5639565677966115e-06, "loss": 0.7972, "mean_token_accuracy": 0.8116909042000771, "num_tokens": 20882196.0, "step": 25950 }, { "epoch": 6.872881355932203, "grad_norm": 1.8984320163726807, "learning_rate": 6.5636917372881355e-06, "loss": 1.3513, "mean_token_accuracy": 0.6712106987833977, "num_tokens": 20883933.0, "step": 25952 }, { "epoch": 6.873411016949152, "grad_norm": 3.0331552028656006, "learning_rate": 6.563426906779662e-06, "loss": 0.9815, "mean_token_accuracy": 0.7383319437503815, "num_tokens": 20885269.0, "step": 25954 }, { "epoch": 6.873940677966102, "grad_norm": 2.3880369663238525, "learning_rate": 6.563162076271187e-06, "loss": 1.4698, "mean_token_accuracy": 0.6586394160985947, "num_tokens": 20886911.0, "step": 25956 }, { "epoch": 6.874470338983051, "grad_norm": 2.728830099105835, "learning_rate": 6.562897245762713e-06, "loss": 1.4698, "mean_token_accuracy": 0.6664052307605743, "num_tokens": 20888314.0, "step": 25958 }, { "epoch": 6.875, "grad_norm": 2.539440631866455, "learning_rate": 6.562632415254238e-06, "loss": 1.08, "mean_token_accuracy": 0.7337735369801521, "num_tokens": 20889813.0, "step": 25960 }, { "epoch": 6.875529661016949, "grad_norm": 3.056105136871338, "learning_rate": 6.562367584745764e-06, "loss": 1.5735, "mean_token_accuracy": 0.6670868545770645, "num_tokens": 20891383.0, "step": 25962 }, { "epoch": 6.876059322033898, "grad_norm": 2.131606101989746, "learning_rate": 6.5621027542372885e-06, "loss": 1.1059, "mean_token_accuracy": 0.7495492696762085, "num_tokens": 20892901.0, "step": 25964 }, { "epoch": 6.876588983050848, "grad_norm": 2.273171901702881, "learning_rate": 6.561837923728814e-06, "loss": 0.8704, "mean_token_accuracy": 0.785738617181778, "num_tokens": 20894449.0, "step": 25966 }, { "epoch": 6.877118644067797, "grad_norm": 2.5359020233154297, "learning_rate": 6.561573093220339e-06, "loss": 1.7951, "mean_token_accuracy": 0.5881130248308182, "num_tokens": 20896152.0, "step": 25968 }, { "epoch": 6.877648305084746, "grad_norm": 2.419598340988159, "learning_rate": 6.561308262711865e-06, "loss": 1.2889, "mean_token_accuracy": 0.718726396560669, "num_tokens": 20897499.0, "step": 25970 }, { "epoch": 6.878177966101695, "grad_norm": 2.582326889038086, "learning_rate": 6.56104343220339e-06, "loss": 1.0113, "mean_token_accuracy": 0.7419433370232582, "num_tokens": 20898965.0, "step": 25972 }, { "epoch": 6.878707627118644, "grad_norm": 2.299563407897949, "learning_rate": 6.560778601694916e-06, "loss": 1.0878, "mean_token_accuracy": 0.7444350495934486, "num_tokens": 20900628.0, "step": 25974 }, { "epoch": 6.879237288135593, "grad_norm": 2.3718008995056152, "learning_rate": 6.560513771186441e-06, "loss": 1.1146, "mean_token_accuracy": 0.7410975024104118, "num_tokens": 20901931.0, "step": 25976 }, { "epoch": 6.879766949152542, "grad_norm": 2.233574628829956, "learning_rate": 6.560248940677966e-06, "loss": 0.9826, "mean_token_accuracy": 0.7756782323122025, "num_tokens": 20903354.0, "step": 25978 }, { "epoch": 6.880296610169491, "grad_norm": 2.3990259170532227, "learning_rate": 6.559984110169491e-06, "loss": 1.3676, "mean_token_accuracy": 0.7004647329449654, "num_tokens": 20904846.0, "step": 25980 }, { "epoch": 6.88082627118644, "grad_norm": 3.1316373348236084, "learning_rate": 6.559719279661018e-06, "loss": 1.3957, "mean_token_accuracy": 0.6700866743922234, "num_tokens": 20906223.0, "step": 25982 }, { "epoch": 6.88135593220339, "grad_norm": 2.1007542610168457, "learning_rate": 6.559454449152543e-06, "loss": 0.9306, "mean_token_accuracy": 0.7956593781709671, "num_tokens": 20907890.0, "step": 25984 }, { "epoch": 6.881885593220339, "grad_norm": 2.620372772216797, "learning_rate": 6.559189618644069e-06, "loss": 1.134, "mean_token_accuracy": 0.737406499683857, "num_tokens": 20909147.0, "step": 25986 }, { "epoch": 6.882415254237288, "grad_norm": 2.135850191116333, "learning_rate": 6.558924788135594e-06, "loss": 0.9474, "mean_token_accuracy": 0.7593676745891571, "num_tokens": 20910707.0, "step": 25988 }, { "epoch": 6.882944915254237, "grad_norm": 2.3168699741363525, "learning_rate": 6.558659957627119e-06, "loss": 1.1755, "mean_token_accuracy": 0.7203261777758598, "num_tokens": 20912280.0, "step": 25990 }, { "epoch": 6.883474576271187, "grad_norm": 2.6497175693511963, "learning_rate": 6.558395127118644e-06, "loss": 1.3671, "mean_token_accuracy": 0.7098257765173912, "num_tokens": 20913878.0, "step": 25992 }, { "epoch": 6.884004237288136, "grad_norm": 2.4190003871917725, "learning_rate": 6.55813029661017e-06, "loss": 1.5629, "mean_token_accuracy": 0.6518038138747215, "num_tokens": 20915750.0, "step": 25994 }, { "epoch": 6.884533898305085, "grad_norm": 2.7237706184387207, "learning_rate": 6.557865466101695e-06, "loss": 1.3695, "mean_token_accuracy": 0.6755977645516396, "num_tokens": 20917601.0, "step": 25996 }, { "epoch": 6.885063559322034, "grad_norm": 2.497459888458252, "learning_rate": 6.557600635593221e-06, "loss": 1.1205, "mean_token_accuracy": 0.7187395915389061, "num_tokens": 20919359.0, "step": 25998 }, { "epoch": 6.885593220338983, "grad_norm": 2.2780728340148926, "learning_rate": 6.557335805084746e-06, "loss": 1.437, "step": 26000 }, { "epoch": 6.885593220338983, "eval_loss": 1.325278878211975, "eval_mean_token_accuracy": 0.7005325404854564, "eval_num_tokens": 20920862.0, "eval_runtime": 48.7777, "eval_samples_per_second": 6.314, "eval_steps_per_second": 6.314, "step": 26000 }, { "epoch": 6.8861228813559325, "grad_norm": 2.272599935531616, "learning_rate": 6.5570709745762715e-06, "loss": 1.0259, "mean_token_accuracy": 0.7099094912409782, "num_tokens": 20922451.0, "step": 26002 }, { "epoch": 6.8866525423728815, "grad_norm": 2.1657474040985107, "learning_rate": 6.556806144067798e-06, "loss": 1.3568, "mean_token_accuracy": 0.6871418170630932, "num_tokens": 20924118.0, "step": 26004 }, { "epoch": 6.8871822033898304, "grad_norm": 2.4607348442077637, "learning_rate": 6.556541313559322e-06, "loss": 1.0486, "mean_token_accuracy": 0.7492341548204422, "num_tokens": 20925745.0, "step": 26006 }, { "epoch": 6.887711864406779, "grad_norm": 2.5241405963897705, "learning_rate": 6.556276483050849e-06, "loss": 1.4589, "mean_token_accuracy": 0.6665829420089722, "num_tokens": 20927646.0, "step": 26008 }, { "epoch": 6.888241525423728, "grad_norm": 2.1237733364105225, "learning_rate": 6.556011652542374e-06, "loss": 1.0053, "mean_token_accuracy": 0.771736316382885, "num_tokens": 20929193.0, "step": 26010 }, { "epoch": 6.888771186440678, "grad_norm": 2.34407901763916, "learning_rate": 6.5557468220338995e-06, "loss": 1.4429, "mean_token_accuracy": 0.6772589385509491, "num_tokens": 20930761.0, "step": 26012 }, { "epoch": 6.889300847457627, "grad_norm": 2.8695361614227295, "learning_rate": 6.5554819915254244e-06, "loss": 0.8156, "mean_token_accuracy": 0.8009895160794258, "num_tokens": 20931943.0, "step": 26014 }, { "epoch": 6.889830508474576, "grad_norm": 1.9127728939056396, "learning_rate": 6.55521716101695e-06, "loss": 1.0174, "mean_token_accuracy": 0.7490584924817085, "num_tokens": 20933827.0, "step": 26016 }, { "epoch": 6.890360169491525, "grad_norm": 2.9714348316192627, "learning_rate": 6.554952330508475e-06, "loss": 1.2438, "mean_token_accuracy": 0.71625816822052, "num_tokens": 20935253.0, "step": 26018 }, { "epoch": 6.890889830508475, "grad_norm": 2.1268844604492188, "learning_rate": 6.554687500000001e-06, "loss": 1.2534, "mean_token_accuracy": 0.718095675110817, "num_tokens": 20936828.0, "step": 26020 }, { "epoch": 6.891419491525424, "grad_norm": 2.387647867202759, "learning_rate": 6.554422669491526e-06, "loss": 1.28, "mean_token_accuracy": 0.6917693391442299, "num_tokens": 20938282.0, "step": 26022 }, { "epoch": 6.891949152542373, "grad_norm": 1.8083661794662476, "learning_rate": 6.554157838983052e-06, "loss": 0.8798, "mean_token_accuracy": 0.7669603899121284, "num_tokens": 20939815.0, "step": 26024 }, { "epoch": 6.892478813559322, "grad_norm": 2.510631561279297, "learning_rate": 6.5538930084745766e-06, "loss": 1.6283, "mean_token_accuracy": 0.6512477323412895, "num_tokens": 20941481.0, "step": 26026 }, { "epoch": 6.893008474576272, "grad_norm": 2.161663293838501, "learning_rate": 6.553628177966102e-06, "loss": 0.8972, "mean_token_accuracy": 0.7662172988057137, "num_tokens": 20943238.0, "step": 26028 }, { "epoch": 6.893538135593221, "grad_norm": 2.829488754272461, "learning_rate": 6.553363347457627e-06, "loss": 1.4297, "mean_token_accuracy": 0.6827840656042099, "num_tokens": 20944993.0, "step": 26030 }, { "epoch": 6.8940677966101696, "grad_norm": 2.0405964851379395, "learning_rate": 6.553098516949153e-06, "loss": 0.9326, "mean_token_accuracy": 0.7622141987085342, "num_tokens": 20946483.0, "step": 26032 }, { "epoch": 6.8945974576271185, "grad_norm": 2.1947362422943115, "learning_rate": 6.552833686440678e-06, "loss": 0.8956, "mean_token_accuracy": 0.7812348902225494, "num_tokens": 20948045.0, "step": 26034 }, { "epoch": 6.8951271186440675, "grad_norm": 2.609682083129883, "learning_rate": 6.552568855932205e-06, "loss": 1.1886, "mean_token_accuracy": 0.7253367155790329, "num_tokens": 20949509.0, "step": 26036 }, { "epoch": 6.895656779661017, "grad_norm": 2.4958555698394775, "learning_rate": 6.5523040254237295e-06, "loss": 1.6446, "mean_token_accuracy": 0.6630682423710823, "num_tokens": 20950967.0, "step": 26038 }, { "epoch": 6.896186440677966, "grad_norm": 2.52307391166687, "learning_rate": 6.552039194915255e-06, "loss": 1.1861, "mean_token_accuracy": 0.7252619117498398, "num_tokens": 20952641.0, "step": 26040 }, { "epoch": 6.896716101694915, "grad_norm": 2.5357162952423096, "learning_rate": 6.55177436440678e-06, "loss": 1.3783, "mean_token_accuracy": 0.6706486940383911, "num_tokens": 20954269.0, "step": 26042 }, { "epoch": 6.897245762711864, "grad_norm": 2.382739305496216, "learning_rate": 6.551509533898306e-06, "loss": 1.2165, "mean_token_accuracy": 0.7246929407119751, "num_tokens": 20955825.0, "step": 26044 }, { "epoch": 6.897775423728813, "grad_norm": 1.8622888326644897, "learning_rate": 6.551244703389831e-06, "loss": 1.0762, "mean_token_accuracy": 0.7312910482287407, "num_tokens": 20957489.0, "step": 26046 }, { "epoch": 6.898305084745763, "grad_norm": 2.583285331726074, "learning_rate": 6.550979872881357e-06, "loss": 1.2527, "mean_token_accuracy": 0.7020114734768867, "num_tokens": 20959265.0, "step": 26048 }, { "epoch": 6.898834745762712, "grad_norm": 2.0583677291870117, "learning_rate": 6.550715042372882e-06, "loss": 1.0617, "mean_token_accuracy": 0.7314578369259834, "num_tokens": 20961065.0, "step": 26050 }, { "epoch": 6.899364406779661, "grad_norm": 2.3721799850463867, "learning_rate": 6.550450211864407e-06, "loss": 1.017, "mean_token_accuracy": 0.7339591830968857, "num_tokens": 20962702.0, "step": 26052 }, { "epoch": 6.89989406779661, "grad_norm": 2.4579579830169678, "learning_rate": 6.550185381355932e-06, "loss": 1.6909, "mean_token_accuracy": 0.6071287244558334, "num_tokens": 20964319.0, "step": 26054 }, { "epoch": 6.90042372881356, "grad_norm": 2.88637113571167, "learning_rate": 6.549920550847458e-06, "loss": 1.4528, "mean_token_accuracy": 0.6770748421549797, "num_tokens": 20965897.0, "step": 26056 }, { "epoch": 6.900953389830509, "grad_norm": 2.4734644889831543, "learning_rate": 6.549655720338983e-06, "loss": 1.027, "mean_token_accuracy": 0.7463776990771294, "num_tokens": 20967673.0, "step": 26058 }, { "epoch": 6.901483050847458, "grad_norm": 3.0262434482574463, "learning_rate": 6.549390889830509e-06, "loss": 1.2739, "mean_token_accuracy": 0.728513590991497, "num_tokens": 20969003.0, "step": 26060 }, { "epoch": 6.902012711864407, "grad_norm": 2.3598873615264893, "learning_rate": 6.549126059322034e-06, "loss": 1.3482, "mean_token_accuracy": 0.6756618097424507, "num_tokens": 20971000.0, "step": 26062 }, { "epoch": 6.902542372881356, "grad_norm": 2.485255479812622, "learning_rate": 6.54886122881356e-06, "loss": 1.1083, "mean_token_accuracy": 0.7330245822668076, "num_tokens": 20972444.0, "step": 26064 }, { "epoch": 6.903072033898305, "grad_norm": 2.3323891162872314, "learning_rate": 6.5485963983050845e-06, "loss": 1.2942, "mean_token_accuracy": 0.6910364180803299, "num_tokens": 20973961.0, "step": 26066 }, { "epoch": 6.903601694915254, "grad_norm": 2.83256196975708, "learning_rate": 6.548331567796611e-06, "loss": 0.9755, "mean_token_accuracy": 0.7756356298923492, "num_tokens": 20975133.0, "step": 26068 }, { "epoch": 6.904131355932203, "grad_norm": 2.1106679439544678, "learning_rate": 6.548066737288136e-06, "loss": 1.296, "mean_token_accuracy": 0.710547186434269, "num_tokens": 20976790.0, "step": 26070 }, { "epoch": 6.904661016949152, "grad_norm": 2.6672604084014893, "learning_rate": 6.547801906779662e-06, "loss": 1.7999, "mean_token_accuracy": 0.630761943757534, "num_tokens": 20978280.0, "step": 26072 }, { "epoch": 6.905190677966102, "grad_norm": 2.0505211353302, "learning_rate": 6.547537076271187e-06, "loss": 1.0401, "mean_token_accuracy": 0.7379424571990967, "num_tokens": 20980067.0, "step": 26074 }, { "epoch": 6.905720338983051, "grad_norm": 2.5760178565979004, "learning_rate": 6.5472722457627125e-06, "loss": 1.5362, "mean_token_accuracy": 0.6550982445478439, "num_tokens": 20981660.0, "step": 26076 }, { "epoch": 6.90625, "grad_norm": 2.167105197906494, "learning_rate": 6.5470074152542374e-06, "loss": 1.2248, "mean_token_accuracy": 0.7078311890363693, "num_tokens": 20983403.0, "step": 26078 }, { "epoch": 6.906779661016949, "grad_norm": 3.1587815284729004, "learning_rate": 6.546742584745763e-06, "loss": 0.8876, "mean_token_accuracy": 0.7808157652616501, "num_tokens": 20985091.0, "step": 26080 }, { "epoch": 6.907309322033898, "grad_norm": 3.056333065032959, "learning_rate": 6.546477754237288e-06, "loss": 1.4633, "mean_token_accuracy": 0.6555478349328041, "num_tokens": 20986640.0, "step": 26082 }, { "epoch": 6.907838983050848, "grad_norm": 2.1763198375701904, "learning_rate": 6.546212923728814e-06, "loss": 1.3596, "mean_token_accuracy": 0.6979190185666084, "num_tokens": 20988507.0, "step": 26084 }, { "epoch": 6.908368644067797, "grad_norm": 2.6168594360351562, "learning_rate": 6.545948093220339e-06, "loss": 1.2334, "mean_token_accuracy": 0.7097868397831917, "num_tokens": 20990040.0, "step": 26086 }, { "epoch": 6.908898305084746, "grad_norm": 2.21415376663208, "learning_rate": 6.545683262711865e-06, "loss": 1.3113, "mean_token_accuracy": 0.70928555727005, "num_tokens": 20991522.0, "step": 26088 }, { "epoch": 6.909427966101695, "grad_norm": 2.8125734329223633, "learning_rate": 6.545418432203391e-06, "loss": 1.2463, "mean_token_accuracy": 0.7045238614082336, "num_tokens": 20993284.0, "step": 26090 }, { "epoch": 6.909957627118644, "grad_norm": 2.4779131412506104, "learning_rate": 6.545153601694916e-06, "loss": 1.0233, "mean_token_accuracy": 0.74887665361166, "num_tokens": 20994798.0, "step": 26092 }, { "epoch": 6.910487288135593, "grad_norm": 2.773388385772705, "learning_rate": 6.544888771186442e-06, "loss": 1.488, "mean_token_accuracy": 0.6525736302137375, "num_tokens": 20996472.0, "step": 26094 }, { "epoch": 6.911016949152542, "grad_norm": 2.5163934230804443, "learning_rate": 6.544623940677967e-06, "loss": 1.3475, "mean_token_accuracy": 0.6871473416686058, "num_tokens": 20998005.0, "step": 26096 }, { "epoch": 6.911546610169491, "grad_norm": 2.8175578117370605, "learning_rate": 6.544359110169493e-06, "loss": 1.6586, "mean_token_accuracy": 0.6536876931786537, "num_tokens": 20999694.0, "step": 26098 }, { "epoch": 6.91207627118644, "grad_norm": 2.144414186477661, "learning_rate": 6.544094279661018e-06, "loss": 1.5271, "mean_token_accuracy": 0.6840710416436195, "num_tokens": 21001418.0, "step": 26100 }, { "epoch": 6.91260593220339, "grad_norm": 2.853478193283081, "learning_rate": 6.543829449152543e-06, "loss": 1.1607, "mean_token_accuracy": 0.7159787863492966, "num_tokens": 21002800.0, "step": 26102 }, { "epoch": 6.913135593220339, "grad_norm": 2.689732789993286, "learning_rate": 6.543564618644068e-06, "loss": 1.2459, "mean_token_accuracy": 0.7172345817089081, "num_tokens": 21004033.0, "step": 26104 }, { "epoch": 6.913665254237288, "grad_norm": 2.2297632694244385, "learning_rate": 6.543299788135594e-06, "loss": 0.6402, "mean_token_accuracy": 0.8134900778532028, "num_tokens": 21005479.0, "step": 26106 }, { "epoch": 6.914194915254237, "grad_norm": 2.8827016353607178, "learning_rate": 6.543034957627119e-06, "loss": 1.1844, "mean_token_accuracy": 0.7108956649899483, "num_tokens": 21007005.0, "step": 26108 }, { "epoch": 6.914724576271187, "grad_norm": 2.448812246322632, "learning_rate": 6.542770127118645e-06, "loss": 1.4744, "mean_token_accuracy": 0.6547887772321701, "num_tokens": 21008700.0, "step": 26110 }, { "epoch": 6.915254237288136, "grad_norm": 3.1846981048583984, "learning_rate": 6.54250529661017e-06, "loss": 1.0274, "mean_token_accuracy": 0.7704812586307526, "num_tokens": 21010358.0, "step": 26112 }, { "epoch": 6.915783898305085, "grad_norm": 2.343759298324585, "learning_rate": 6.5422404661016955e-06, "loss": 1.283, "mean_token_accuracy": 0.7044288739562035, "num_tokens": 21011985.0, "step": 26114 }, { "epoch": 6.916313559322034, "grad_norm": 2.309757709503174, "learning_rate": 6.54197563559322e-06, "loss": 1.0067, "mean_token_accuracy": 0.7643680274486542, "num_tokens": 21013751.0, "step": 26116 }, { "epoch": 6.916843220338983, "grad_norm": 2.503178834915161, "learning_rate": 6.541710805084747e-06, "loss": 1.2544, "mean_token_accuracy": 0.730524867773056, "num_tokens": 21015319.0, "step": 26118 }, { "epoch": 6.9173728813559325, "grad_norm": 2.4759738445281982, "learning_rate": 6.541445974576271e-06, "loss": 1.1922, "mean_token_accuracy": 0.733598954975605, "num_tokens": 21016853.0, "step": 26120 }, { "epoch": 6.9179025423728815, "grad_norm": 2.5207390785217285, "learning_rate": 6.541181144067798e-06, "loss": 1.2497, "mean_token_accuracy": 0.7204035222530365, "num_tokens": 21018586.0, "step": 26122 }, { "epoch": 6.9184322033898304, "grad_norm": 2.2072343826293945, "learning_rate": 6.540916313559323e-06, "loss": 0.8716, "mean_token_accuracy": 0.7902804762125015, "num_tokens": 21020378.0, "step": 26124 }, { "epoch": 6.918961864406779, "grad_norm": 2.3463680744171143, "learning_rate": 6.5406514830508484e-06, "loss": 1.3458, "mean_token_accuracy": 0.7041922584176064, "num_tokens": 21021797.0, "step": 26126 }, { "epoch": 6.919491525423728, "grad_norm": 2.0833613872528076, "learning_rate": 6.540386652542373e-06, "loss": 1.1936, "mean_token_accuracy": 0.7007604166865349, "num_tokens": 21023366.0, "step": 26128 }, { "epoch": 6.920021186440678, "grad_norm": 2.5491256713867188, "learning_rate": 6.540121822033899e-06, "loss": 1.0611, "mean_token_accuracy": 0.7432405054569244, "num_tokens": 21024930.0, "step": 26130 }, { "epoch": 6.920550847457627, "grad_norm": 2.54412579536438, "learning_rate": 6.539856991525424e-06, "loss": 1.1509, "mean_token_accuracy": 0.7026830613613129, "num_tokens": 21026681.0, "step": 26132 }, { "epoch": 6.921080508474576, "grad_norm": 2.4861648082733154, "learning_rate": 6.53959216101695e-06, "loss": 1.5463, "mean_token_accuracy": 0.6666369065642357, "num_tokens": 21028267.0, "step": 26134 }, { "epoch": 6.921610169491525, "grad_norm": 2.2658348083496094, "learning_rate": 6.539327330508475e-06, "loss": 0.8099, "mean_token_accuracy": 0.7984558418393135, "num_tokens": 21029805.0, "step": 26136 }, { "epoch": 6.922139830508475, "grad_norm": 2.743795394897461, "learning_rate": 6.5390625000000005e-06, "loss": 1.3811, "mean_token_accuracy": 0.6903738453984261, "num_tokens": 21031333.0, "step": 26138 }, { "epoch": 6.922669491525424, "grad_norm": 2.5917348861694336, "learning_rate": 6.5387976694915255e-06, "loss": 0.9085, "mean_token_accuracy": 0.769841194152832, "num_tokens": 21032826.0, "step": 26140 }, { "epoch": 6.923199152542373, "grad_norm": 2.7822110652923584, "learning_rate": 6.538532838983051e-06, "loss": 1.3949, "mean_token_accuracy": 0.7118350267410278, "num_tokens": 21034234.0, "step": 26142 }, { "epoch": 6.923728813559322, "grad_norm": 2.7360126972198486, "learning_rate": 6.538268008474576e-06, "loss": 0.9966, "mean_token_accuracy": 0.775248184800148, "num_tokens": 21035780.0, "step": 26144 }, { "epoch": 6.924258474576272, "grad_norm": 2.4344687461853027, "learning_rate": 6.538003177966103e-06, "loss": 0.9673, "mean_token_accuracy": 0.7779485732316971, "num_tokens": 21037216.0, "step": 26146 }, { "epoch": 6.924788135593221, "grad_norm": 2.6055352687835693, "learning_rate": 6.537738347457627e-06, "loss": 1.188, "mean_token_accuracy": 0.725991278886795, "num_tokens": 21038999.0, "step": 26148 }, { "epoch": 6.9253177966101696, "grad_norm": 2.6204988956451416, "learning_rate": 6.5374735169491535e-06, "loss": 1.3064, "mean_token_accuracy": 0.7200310006737709, "num_tokens": 21040568.0, "step": 26150 }, { "epoch": 6.9258474576271185, "grad_norm": 2.1088151931762695, "learning_rate": 6.5372086864406784e-06, "loss": 1.0709, "mean_token_accuracy": 0.7333869263529778, "num_tokens": 21041964.0, "step": 26152 }, { "epoch": 6.9263771186440675, "grad_norm": 2.3671982288360596, "learning_rate": 6.536943855932204e-06, "loss": 1.2775, "mean_token_accuracy": 0.7138994485139847, "num_tokens": 21043574.0, "step": 26154 }, { "epoch": 6.926906779661017, "grad_norm": 2.42883038520813, "learning_rate": 6.536679025423729e-06, "loss": 1.2019, "mean_token_accuracy": 0.7059893533587456, "num_tokens": 21045296.0, "step": 26156 }, { "epoch": 6.927436440677966, "grad_norm": 1.72428297996521, "learning_rate": 6.536414194915255e-06, "loss": 0.8157, "mean_token_accuracy": 0.7863363996148109, "num_tokens": 21046910.0, "step": 26158 }, { "epoch": 6.927966101694915, "grad_norm": 2.9145233631134033, "learning_rate": 6.53614936440678e-06, "loss": 0.9821, "mean_token_accuracy": 0.7558157444000244, "num_tokens": 21048430.0, "step": 26160 }, { "epoch": 6.928495762711864, "grad_norm": 3.035552501678467, "learning_rate": 6.535884533898306e-06, "loss": 1.2299, "mean_token_accuracy": 0.7029792219400406, "num_tokens": 21049724.0, "step": 26162 }, { "epoch": 6.929025423728813, "grad_norm": 2.7539212703704834, "learning_rate": 6.5356197033898306e-06, "loss": 1.5505, "mean_token_accuracy": 0.6799816563725471, "num_tokens": 21051440.0, "step": 26164 }, { "epoch": 6.929555084745763, "grad_norm": 2.8309528827667236, "learning_rate": 6.535354872881356e-06, "loss": 1.3248, "mean_token_accuracy": 0.6665880158543587, "num_tokens": 21052992.0, "step": 26166 }, { "epoch": 6.930084745762712, "grad_norm": 3.045917272567749, "learning_rate": 6.535090042372881e-06, "loss": 1.4887, "mean_token_accuracy": 0.6649959534406662, "num_tokens": 21054480.0, "step": 26168 }, { "epoch": 6.930614406779661, "grad_norm": 2.542102813720703, "learning_rate": 6.534825211864407e-06, "loss": 1.1231, "mean_token_accuracy": 0.7481069266796112, "num_tokens": 21056064.0, "step": 26170 }, { "epoch": 6.93114406779661, "grad_norm": 2.3269166946411133, "learning_rate": 6.534560381355934e-06, "loss": 1.3595, "mean_token_accuracy": 0.6979669108986855, "num_tokens": 21057566.0, "step": 26172 }, { "epoch": 6.93167372881356, "grad_norm": 2.7306551933288574, "learning_rate": 6.534295550847458e-06, "loss": 1.1089, "mean_token_accuracy": 0.7447856143116951, "num_tokens": 21059033.0, "step": 26174 }, { "epoch": 6.932203389830509, "grad_norm": 1.714085340499878, "learning_rate": 6.534030720338984e-06, "loss": 0.8035, "mean_token_accuracy": 0.7822729423642159, "num_tokens": 21060565.0, "step": 26176 }, { "epoch": 6.932733050847458, "grad_norm": 2.688661575317383, "learning_rate": 6.533765889830509e-06, "loss": 1.4382, "mean_token_accuracy": 0.6808261349797249, "num_tokens": 21062114.0, "step": 26178 }, { "epoch": 6.933262711864407, "grad_norm": 2.1331024169921875, "learning_rate": 6.533501059322035e-06, "loss": 0.9262, "mean_token_accuracy": 0.7672401964664459, "num_tokens": 21063889.0, "step": 26180 }, { "epoch": 6.933792372881356, "grad_norm": 2.298818588256836, "learning_rate": 6.53323622881356e-06, "loss": 0.8825, "mean_token_accuracy": 0.7653302401304245, "num_tokens": 21065742.0, "step": 26182 }, { "epoch": 6.934322033898305, "grad_norm": 2.898632764816284, "learning_rate": 6.532971398305086e-06, "loss": 1.4228, "mean_token_accuracy": 0.7019002884626389, "num_tokens": 21067253.0, "step": 26184 }, { "epoch": 6.934851694915254, "grad_norm": 2.1876065731048584, "learning_rate": 6.532706567796611e-06, "loss": 1.0304, "mean_token_accuracy": 0.7424520179629326, "num_tokens": 21068904.0, "step": 26186 }, { "epoch": 6.935381355932203, "grad_norm": 2.70090389251709, "learning_rate": 6.5324417372881365e-06, "loss": 1.173, "mean_token_accuracy": 0.7289229184389114, "num_tokens": 21070301.0, "step": 26188 }, { "epoch": 6.935911016949152, "grad_norm": 2.226590394973755, "learning_rate": 6.532176906779661e-06, "loss": 0.8604, "mean_token_accuracy": 0.7808691635727882, "num_tokens": 21071743.0, "step": 26190 }, { "epoch": 6.936440677966102, "grad_norm": 2.014888048171997, "learning_rate": 6.531912076271187e-06, "loss": 1.1908, "mean_token_accuracy": 0.7217095047235489, "num_tokens": 21073405.0, "step": 26192 }, { "epoch": 6.936970338983051, "grad_norm": 2.6618196964263916, "learning_rate": 6.531647245762712e-06, "loss": 1.1572, "mean_token_accuracy": 0.7205090075731277, "num_tokens": 21074875.0, "step": 26194 }, { "epoch": 6.9375, "grad_norm": 3.0347414016723633, "learning_rate": 6.531382415254238e-06, "loss": 1.1044, "mean_token_accuracy": 0.7367073744535446, "num_tokens": 21076252.0, "step": 26196 }, { "epoch": 6.938029661016949, "grad_norm": 2.44468092918396, "learning_rate": 6.531117584745763e-06, "loss": 1.1417, "mean_token_accuracy": 0.7528446689248085, "num_tokens": 21077815.0, "step": 26198 }, { "epoch": 6.938559322033898, "grad_norm": 2.690178871154785, "learning_rate": 6.5308527542372894e-06, "loss": 1.5626, "mean_token_accuracy": 0.6353673413395882, "num_tokens": 21079361.0, "step": 26200 }, { "epoch": 6.939088983050848, "grad_norm": 1.8577278852462769, "learning_rate": 6.5305879237288135e-06, "loss": 1.3226, "mean_token_accuracy": 0.7091432027518749, "num_tokens": 21081137.0, "step": 26202 }, { "epoch": 6.939618644067797, "grad_norm": 3.050983190536499, "learning_rate": 6.53032309322034e-06, "loss": 1.3614, "mean_token_accuracy": 0.6913518384099007, "num_tokens": 21082788.0, "step": 26204 }, { "epoch": 6.940148305084746, "grad_norm": 2.078511953353882, "learning_rate": 6.530058262711865e-06, "loss": 1.5225, "mean_token_accuracy": 0.6703826859593391, "num_tokens": 21084811.0, "step": 26206 }, { "epoch": 6.940677966101695, "grad_norm": 2.315962076187134, "learning_rate": 6.529793432203391e-06, "loss": 0.8851, "mean_token_accuracy": 0.7707155048847198, "num_tokens": 21086510.0, "step": 26208 }, { "epoch": 6.941207627118644, "grad_norm": 2.2460787296295166, "learning_rate": 6.529528601694916e-06, "loss": 1.0672, "mean_token_accuracy": 0.7374164313077927, "num_tokens": 21088138.0, "step": 26210 }, { "epoch": 6.941737288135593, "grad_norm": 2.3211169242858887, "learning_rate": 6.5292637711864416e-06, "loss": 1.5832, "mean_token_accuracy": 0.6366925276815891, "num_tokens": 21089809.0, "step": 26212 }, { "epoch": 6.942266949152542, "grad_norm": 2.013909101486206, "learning_rate": 6.5289989406779665e-06, "loss": 1.0414, "mean_token_accuracy": 0.7417976930737495, "num_tokens": 21091341.0, "step": 26214 }, { "epoch": 6.942796610169491, "grad_norm": 2.1571037769317627, "learning_rate": 6.528734110169492e-06, "loss": 1.2489, "mean_token_accuracy": 0.7389930114150047, "num_tokens": 21093133.0, "step": 26216 }, { "epoch": 6.94332627118644, "grad_norm": 2.610584020614624, "learning_rate": 6.528469279661017e-06, "loss": 1.2251, "mean_token_accuracy": 0.6986034512519836, "num_tokens": 21094741.0, "step": 26218 }, { "epoch": 6.94385593220339, "grad_norm": 2.520934820175171, "learning_rate": 6.528204449152543e-06, "loss": 1.5673, "mean_token_accuracy": 0.655072771012783, "num_tokens": 21096494.0, "step": 26220 }, { "epoch": 6.944385593220339, "grad_norm": 2.4068503379821777, "learning_rate": 6.527939618644068e-06, "loss": 0.9912, "mean_token_accuracy": 0.7407485321164131, "num_tokens": 21098015.0, "step": 26222 }, { "epoch": 6.944915254237288, "grad_norm": 2.47251033782959, "learning_rate": 6.527674788135594e-06, "loss": 1.365, "mean_token_accuracy": 0.6890425309538841, "num_tokens": 21099480.0, "step": 26224 }, { "epoch": 6.945444915254237, "grad_norm": 2.6442935466766357, "learning_rate": 6.527409957627119e-06, "loss": 1.1329, "mean_token_accuracy": 0.744677796959877, "num_tokens": 21101059.0, "step": 26226 }, { "epoch": 6.945974576271187, "grad_norm": 2.267420530319214, "learning_rate": 6.527145127118644e-06, "loss": 1.3998, "mean_token_accuracy": 0.68815478682518, "num_tokens": 21102587.0, "step": 26228 }, { "epoch": 6.946504237288136, "grad_norm": 2.643972635269165, "learning_rate": 6.526880296610169e-06, "loss": 0.9048, "mean_token_accuracy": 0.778680220246315, "num_tokens": 21104069.0, "step": 26230 }, { "epoch": 6.947033898305085, "grad_norm": 2.4273524284362793, "learning_rate": 6.526615466101696e-06, "loss": 0.937, "mean_token_accuracy": 0.7523399665951729, "num_tokens": 21105449.0, "step": 26232 }, { "epoch": 6.947563559322034, "grad_norm": 2.359685182571411, "learning_rate": 6.526350635593221e-06, "loss": 0.9786, "mean_token_accuracy": 0.7495320960879326, "num_tokens": 21106911.0, "step": 26234 }, { "epoch": 6.948093220338983, "grad_norm": 2.331056594848633, "learning_rate": 6.526085805084747e-06, "loss": 1.0782, "mean_token_accuracy": 0.734384298324585, "num_tokens": 21108364.0, "step": 26236 }, { "epoch": 6.9486228813559325, "grad_norm": 2.6302268505096436, "learning_rate": 6.5258209745762716e-06, "loss": 1.3279, "mean_token_accuracy": 0.7168427109718323, "num_tokens": 21110156.0, "step": 26238 }, { "epoch": 6.9491525423728815, "grad_norm": 2.8417153358459473, "learning_rate": 6.525556144067797e-06, "loss": 1.1234, "mean_token_accuracy": 0.7335163280367851, "num_tokens": 21111510.0, "step": 26240 }, { "epoch": 6.9496822033898304, "grad_norm": 2.5308332443237305, "learning_rate": 6.525291313559322e-06, "loss": 1.1058, "mean_token_accuracy": 0.7595075145363808, "num_tokens": 21112780.0, "step": 26242 }, { "epoch": 6.950211864406779, "grad_norm": 2.2114288806915283, "learning_rate": 6.525026483050848e-06, "loss": 0.9643, "mean_token_accuracy": 0.7644317448139191, "num_tokens": 21114418.0, "step": 26244 }, { "epoch": 6.950741525423728, "grad_norm": 2.1110706329345703, "learning_rate": 6.524761652542373e-06, "loss": 1.1871, "mean_token_accuracy": 0.7257881015539169, "num_tokens": 21116029.0, "step": 26246 }, { "epoch": 6.951271186440678, "grad_norm": 2.421849250793457, "learning_rate": 6.524496822033899e-06, "loss": 1.4691, "mean_token_accuracy": 0.6743912696838379, "num_tokens": 21117684.0, "step": 26248 }, { "epoch": 6.951800847457627, "grad_norm": 2.5539588928222656, "learning_rate": 6.524231991525424e-06, "loss": 1.0621, "step": 26250 }, { "epoch": 6.951800847457627, "eval_loss": 1.3253302574157715, "eval_mean_token_accuracy": 0.7001062403251599, "eval_num_tokens": 21118999.0, "eval_runtime": 48.6192, "eval_samples_per_second": 6.335, "eval_steps_per_second": 6.335, "step": 26250 }, { "epoch": 6.952330508474576, "grad_norm": 2.1597542762756348, "learning_rate": 6.5239671610169495e-06, "loss": 1.206, "mean_token_accuracy": 0.729117214679718, "num_tokens": 21120715.0, "step": 26252 }, { "epoch": 6.952860169491525, "grad_norm": 2.2798807621002197, "learning_rate": 6.523702330508474e-06, "loss": 0.8711, "mean_token_accuracy": 0.7980964854359627, "num_tokens": 21122439.0, "step": 26254 }, { "epoch": 6.953389830508475, "grad_norm": 2.7204113006591797, "learning_rate": 6.5234375e-06, "loss": 1.3321, "mean_token_accuracy": 0.6950038745999336, "num_tokens": 21124031.0, "step": 26256 }, { "epoch": 6.953919491525424, "grad_norm": 1.8877602815628052, "learning_rate": 6.523172669491527e-06, "loss": 0.6902, "mean_token_accuracy": 0.8177425414323807, "num_tokens": 21125631.0, "step": 26258 }, { "epoch": 6.954449152542373, "grad_norm": 2.4170310497283936, "learning_rate": 6.522907838983052e-06, "loss": 1.3511, "mean_token_accuracy": 0.7066370993852615, "num_tokens": 21127273.0, "step": 26260 }, { "epoch": 6.954978813559322, "grad_norm": 2.625495433807373, "learning_rate": 6.5226430084745775e-06, "loss": 1.567, "mean_token_accuracy": 0.6764947324991226, "num_tokens": 21128667.0, "step": 26262 }, { "epoch": 6.955508474576272, "grad_norm": 2.468604564666748, "learning_rate": 6.5223781779661024e-06, "loss": 1.521, "mean_token_accuracy": 0.6617805883288383, "num_tokens": 21130179.0, "step": 26264 }, { "epoch": 6.956038135593221, "grad_norm": 2.61354923248291, "learning_rate": 6.522113347457628e-06, "loss": 0.9367, "mean_token_accuracy": 0.7768892869353294, "num_tokens": 21132343.0, "step": 26266 }, { "epoch": 6.9565677966101696, "grad_norm": 2.062779188156128, "learning_rate": 6.521848516949153e-06, "loss": 0.9826, "mean_token_accuracy": 0.7712856158614159, "num_tokens": 21133929.0, "step": 26268 }, { "epoch": 6.9570974576271185, "grad_norm": 2.5269954204559326, "learning_rate": 6.521583686440679e-06, "loss": 1.3598, "mean_token_accuracy": 0.6860308647155762, "num_tokens": 21135536.0, "step": 26270 }, { "epoch": 6.9576271186440675, "grad_norm": 2.895906925201416, "learning_rate": 6.521318855932204e-06, "loss": 1.3791, "mean_token_accuracy": 0.6636065170168877, "num_tokens": 21137838.0, "step": 26272 }, { "epoch": 6.958156779661017, "grad_norm": 2.5466248989105225, "learning_rate": 6.52105402542373e-06, "loss": 1.1014, "mean_token_accuracy": 0.7457879036664963, "num_tokens": 21139518.0, "step": 26274 }, { "epoch": 6.958686440677966, "grad_norm": 2.476820468902588, "learning_rate": 6.5207891949152545e-06, "loss": 1.4632, "mean_token_accuracy": 0.6692899465560913, "num_tokens": 21141184.0, "step": 26276 }, { "epoch": 6.959216101694915, "grad_norm": 2.8442604541778564, "learning_rate": 6.52052436440678e-06, "loss": 1.4238, "mean_token_accuracy": 0.6723845675587654, "num_tokens": 21142433.0, "step": 26278 }, { "epoch": 6.959745762711864, "grad_norm": 3.0988786220550537, "learning_rate": 6.520259533898305e-06, "loss": 1.0922, "mean_token_accuracy": 0.7467311546206474, "num_tokens": 21143900.0, "step": 26280 }, { "epoch": 6.960275423728813, "grad_norm": 2.6112024784088135, "learning_rate": 6.519994703389831e-06, "loss": 1.2674, "mean_token_accuracy": 0.7016546949744225, "num_tokens": 21145720.0, "step": 26282 }, { "epoch": 6.960805084745763, "grad_norm": 2.310574531555176, "learning_rate": 6.519729872881356e-06, "loss": 1.1204, "mean_token_accuracy": 0.7373341247439384, "num_tokens": 21147320.0, "step": 26284 }, { "epoch": 6.961334745762712, "grad_norm": 2.339099645614624, "learning_rate": 6.519465042372883e-06, "loss": 1.4576, "mean_token_accuracy": 0.6797645092010498, "num_tokens": 21149057.0, "step": 26286 }, { "epoch": 6.961864406779661, "grad_norm": 2.580716133117676, "learning_rate": 6.5192002118644075e-06, "loss": 1.3932, "mean_token_accuracy": 0.6727633848786354, "num_tokens": 21150589.0, "step": 26288 }, { "epoch": 6.96239406779661, "grad_norm": 2.793931484222412, "learning_rate": 6.518935381355933e-06, "loss": 1.1288, "mean_token_accuracy": 0.7247078791260719, "num_tokens": 21152240.0, "step": 26290 }, { "epoch": 6.96292372881356, "grad_norm": 5.571834087371826, "learning_rate": 6.518670550847458e-06, "loss": 0.9389, "mean_token_accuracy": 0.780258297920227, "num_tokens": 21153676.0, "step": 26292 }, { "epoch": 6.963453389830509, "grad_norm": 2.698592185974121, "learning_rate": 6.518405720338984e-06, "loss": 1.0985, "mean_token_accuracy": 0.7454999834299088, "num_tokens": 21155340.0, "step": 26294 }, { "epoch": 6.963983050847458, "grad_norm": 2.5981760025024414, "learning_rate": 6.518140889830509e-06, "loss": 1.1282, "mean_token_accuracy": 0.723748467862606, "num_tokens": 21156802.0, "step": 26296 }, { "epoch": 6.964512711864407, "grad_norm": 2.524900197982788, "learning_rate": 6.517876059322035e-06, "loss": 1.4852, "mean_token_accuracy": 0.6629534363746643, "num_tokens": 21158403.0, "step": 26298 }, { "epoch": 6.965042372881356, "grad_norm": 2.0881540775299072, "learning_rate": 6.51761122881356e-06, "loss": 1.0272, "mean_token_accuracy": 0.7479724213480949, "num_tokens": 21160164.0, "step": 26300 }, { "epoch": 6.965572033898305, "grad_norm": 2.449159622192383, "learning_rate": 6.517346398305085e-06, "loss": 1.0815, "mean_token_accuracy": 0.7465455457568169, "num_tokens": 21161802.0, "step": 26302 }, { "epoch": 6.966101694915254, "grad_norm": 2.838576078414917, "learning_rate": 6.51708156779661e-06, "loss": 1.2201, "mean_token_accuracy": 0.709899291396141, "num_tokens": 21163438.0, "step": 26304 }, { "epoch": 6.966631355932203, "grad_norm": 1.5849705934524536, "learning_rate": 6.516816737288136e-06, "loss": 0.972, "mean_token_accuracy": 0.7444401606917381, "num_tokens": 21165920.0, "step": 26306 }, { "epoch": 6.967161016949152, "grad_norm": 2.598315477371216, "learning_rate": 6.516551906779661e-06, "loss": 1.3784, "mean_token_accuracy": 0.6696274951100349, "num_tokens": 21167571.0, "step": 26308 }, { "epoch": 6.967690677966102, "grad_norm": 2.3394625186920166, "learning_rate": 6.516287076271187e-06, "loss": 1.0579, "mean_token_accuracy": 0.7485655173659325, "num_tokens": 21169206.0, "step": 26310 }, { "epoch": 6.968220338983051, "grad_norm": 2.66328763961792, "learning_rate": 6.516022245762712e-06, "loss": 1.573, "mean_token_accuracy": 0.6501235812902451, "num_tokens": 21170937.0, "step": 26312 }, { "epoch": 6.96875, "grad_norm": 2.356067657470703, "learning_rate": 6.515757415254238e-06, "loss": 1.0269, "mean_token_accuracy": 0.7422627583146095, "num_tokens": 21172701.0, "step": 26314 }, { "epoch": 6.969279661016949, "grad_norm": 1.6334141492843628, "learning_rate": 6.5154925847457624e-06, "loss": 0.9973, "mean_token_accuracy": 0.771797426044941, "num_tokens": 21174028.0, "step": 26316 }, { "epoch": 6.969809322033898, "grad_norm": 2.3317508697509766, "learning_rate": 6.515227754237289e-06, "loss": 1.2635, "mean_token_accuracy": 0.7176399230957031, "num_tokens": 21176433.0, "step": 26318 }, { "epoch": 6.970338983050848, "grad_norm": 2.0910568237304688, "learning_rate": 6.514962923728814e-06, "loss": 0.9904, "mean_token_accuracy": 0.7700219675898552, "num_tokens": 21177841.0, "step": 26320 }, { "epoch": 6.970868644067797, "grad_norm": 2.5321240425109863, "learning_rate": 6.51469809322034e-06, "loss": 1.2887, "mean_token_accuracy": 0.7018020451068878, "num_tokens": 21179308.0, "step": 26322 }, { "epoch": 6.971398305084746, "grad_norm": 2.6588246822357178, "learning_rate": 6.514433262711865e-06, "loss": 1.7289, "mean_token_accuracy": 0.6713168919086456, "num_tokens": 21180971.0, "step": 26324 }, { "epoch": 6.971927966101695, "grad_norm": 2.2301182746887207, "learning_rate": 6.5141684322033905e-06, "loss": 1.0655, "mean_token_accuracy": 0.7230082526803017, "num_tokens": 21182884.0, "step": 26326 }, { "epoch": 6.972457627118644, "grad_norm": 1.8925851583480835, "learning_rate": 6.513903601694915e-06, "loss": 1.0964, "mean_token_accuracy": 0.7169815674424171, "num_tokens": 21185010.0, "step": 26328 }, { "epoch": 6.972987288135593, "grad_norm": 2.5301613807678223, "learning_rate": 6.513638771186441e-06, "loss": 1.1084, "mean_token_accuracy": 0.7246732041239738, "num_tokens": 21186595.0, "step": 26330 }, { "epoch": 6.973516949152542, "grad_norm": 2.2115330696105957, "learning_rate": 6.513373940677966e-06, "loss": 1.2402, "mean_token_accuracy": 0.7245765700936317, "num_tokens": 21188116.0, "step": 26332 }, { "epoch": 6.974046610169491, "grad_norm": 3.026735305786133, "learning_rate": 6.513109110169492e-06, "loss": 1.043, "mean_token_accuracy": 0.7555159777402878, "num_tokens": 21189381.0, "step": 26334 }, { "epoch": 6.97457627118644, "grad_norm": 2.5972630977630615, "learning_rate": 6.512844279661017e-06, "loss": 0.9536, "mean_token_accuracy": 0.7444865629076958, "num_tokens": 21191016.0, "step": 26336 }, { "epoch": 6.97510593220339, "grad_norm": 2.524786949157715, "learning_rate": 6.512579449152543e-06, "loss": 1.6151, "mean_token_accuracy": 0.6034141592681408, "num_tokens": 21193043.0, "step": 26338 }, { "epoch": 6.975635593220339, "grad_norm": 2.6730446815490723, "learning_rate": 6.5123146186440675e-06, "loss": 0.8743, "mean_token_accuracy": 0.7805454954504967, "num_tokens": 21194551.0, "step": 26340 }, { "epoch": 6.976165254237288, "grad_norm": 1.9750088453292847, "learning_rate": 6.512049788135594e-06, "loss": 0.8983, "mean_token_accuracy": 0.778447113931179, "num_tokens": 21196164.0, "step": 26342 }, { "epoch": 6.976694915254237, "grad_norm": 2.542182683944702, "learning_rate": 6.51178495762712e-06, "loss": 1.165, "mean_token_accuracy": 0.7225238457322121, "num_tokens": 21197677.0, "step": 26344 }, { "epoch": 6.977224576271187, "grad_norm": 2.678466796875, "learning_rate": 6.511520127118645e-06, "loss": 1.5741, "mean_token_accuracy": 0.6772408932447433, "num_tokens": 21199001.0, "step": 26346 }, { "epoch": 6.977754237288136, "grad_norm": 2.3045008182525635, "learning_rate": 6.511255296610171e-06, "loss": 1.303, "mean_token_accuracy": 0.7068662792444229, "num_tokens": 21200490.0, "step": 26348 }, { "epoch": 6.978283898305085, "grad_norm": 2.5614848136901855, "learning_rate": 6.5109904661016956e-06, "loss": 1.1212, "mean_token_accuracy": 0.7053306326270103, "num_tokens": 21202211.0, "step": 26350 }, { "epoch": 6.978813559322034, "grad_norm": 2.0872042179107666, "learning_rate": 6.510725635593221e-06, "loss": 1.0211, "mean_token_accuracy": 0.7599349915981293, "num_tokens": 21203923.0, "step": 26352 }, { "epoch": 6.979343220338983, "grad_norm": 2.6465859413146973, "learning_rate": 6.510460805084746e-06, "loss": 1.316, "mean_token_accuracy": 0.7293780073523521, "num_tokens": 21205513.0, "step": 26354 }, { "epoch": 6.9798728813559325, "grad_norm": 2.841632604598999, "learning_rate": 6.510195974576272e-06, "loss": 1.3991, "mean_token_accuracy": 0.6763444170355797, "num_tokens": 21207090.0, "step": 26356 }, { "epoch": 6.9804025423728815, "grad_norm": 2.546785354614258, "learning_rate": 6.509931144067797e-06, "loss": 1.4282, "mean_token_accuracy": 0.6752287074923515, "num_tokens": 21208617.0, "step": 26358 }, { "epoch": 6.9809322033898304, "grad_norm": 2.188781261444092, "learning_rate": 6.509666313559323e-06, "loss": 1.4365, "mean_token_accuracy": 0.6754996106028557, "num_tokens": 21210415.0, "step": 26360 }, { "epoch": 6.981461864406779, "grad_norm": 2.259097099304199, "learning_rate": 6.509401483050848e-06, "loss": 1.0008, "mean_token_accuracy": 0.757338210940361, "num_tokens": 21212038.0, "step": 26362 }, { "epoch": 6.981991525423728, "grad_norm": 2.1550567150115967, "learning_rate": 6.5091366525423734e-06, "loss": 1.0135, "mean_token_accuracy": 0.7568221688270569, "num_tokens": 21213531.0, "step": 26364 }, { "epoch": 6.982521186440678, "grad_norm": 3.117363929748535, "learning_rate": 6.508871822033898e-06, "loss": 1.2693, "mean_token_accuracy": 0.6934528276324272, "num_tokens": 21214846.0, "step": 26366 }, { "epoch": 6.983050847457627, "grad_norm": 2.647806167602539, "learning_rate": 6.508606991525425e-06, "loss": 1.2946, "mean_token_accuracy": 0.6924465149641037, "num_tokens": 21216197.0, "step": 26368 }, { "epoch": 6.983580508474576, "grad_norm": 3.5627686977386475, "learning_rate": 6.508342161016949e-06, "loss": 1.4589, "mean_token_accuracy": 0.6914965957403183, "num_tokens": 21217664.0, "step": 26370 }, { "epoch": 6.984110169491525, "grad_norm": 2.185370445251465, "learning_rate": 6.508077330508476e-06, "loss": 0.9553, "mean_token_accuracy": 0.75467549264431, "num_tokens": 21219241.0, "step": 26372 }, { "epoch": 6.984639830508475, "grad_norm": 2.121155023574829, "learning_rate": 6.507812500000001e-06, "loss": 1.2814, "mean_token_accuracy": 0.7143289968371391, "num_tokens": 21220752.0, "step": 26374 }, { "epoch": 6.985169491525424, "grad_norm": 2.417725086212158, "learning_rate": 6.507547669491526e-06, "loss": 1.4809, "mean_token_accuracy": 0.647428534924984, "num_tokens": 21222446.0, "step": 26376 }, { "epoch": 6.985699152542373, "grad_norm": 2.1310012340545654, "learning_rate": 6.507282838983051e-06, "loss": 1.2929, "mean_token_accuracy": 0.7032888829708099, "num_tokens": 21224113.0, "step": 26378 }, { "epoch": 6.986228813559322, "grad_norm": 2.424333333969116, "learning_rate": 6.507018008474577e-06, "loss": 1.2327, "mean_token_accuracy": 0.7062389105558395, "num_tokens": 21225705.0, "step": 26380 }, { "epoch": 6.986758474576272, "grad_norm": 1.8855717182159424, "learning_rate": 6.506753177966102e-06, "loss": 1.0828, "mean_token_accuracy": 0.7234814465045929, "num_tokens": 21227412.0, "step": 26382 }, { "epoch": 6.987288135593221, "grad_norm": 2.3355300426483154, "learning_rate": 6.506488347457628e-06, "loss": 1.2122, "mean_token_accuracy": 0.7218191847205162, "num_tokens": 21229183.0, "step": 26384 }, { "epoch": 6.9878177966101696, "grad_norm": 2.9660863876342773, "learning_rate": 6.506223516949153e-06, "loss": 1.5244, "mean_token_accuracy": 0.6873585507273674, "num_tokens": 21230529.0, "step": 26386 }, { "epoch": 6.9883474576271185, "grad_norm": 2.3203535079956055, "learning_rate": 6.5059586864406785e-06, "loss": 1.5228, "mean_token_accuracy": 0.6809408888220787, "num_tokens": 21232329.0, "step": 26388 }, { "epoch": 6.9888771186440675, "grad_norm": 2.582350254058838, "learning_rate": 6.5056938559322035e-06, "loss": 1.3624, "mean_token_accuracy": 0.686459943652153, "num_tokens": 21234056.0, "step": 26390 }, { "epoch": 6.989406779661017, "grad_norm": 2.2792842388153076, "learning_rate": 6.505429025423729e-06, "loss": 1.0058, "mean_token_accuracy": 0.7558507993817329, "num_tokens": 21235445.0, "step": 26392 }, { "epoch": 6.989936440677966, "grad_norm": 2.145822048187256, "learning_rate": 6.505164194915254e-06, "loss": 0.9806, "mean_token_accuracy": 0.7626690417528152, "num_tokens": 21236836.0, "step": 26394 }, { "epoch": 6.990466101694915, "grad_norm": 2.2565741539001465, "learning_rate": 6.504899364406781e-06, "loss": 1.4958, "mean_token_accuracy": 0.65988514944911, "num_tokens": 21238409.0, "step": 26396 }, { "epoch": 6.990995762711864, "grad_norm": 2.1252193450927734, "learning_rate": 6.504634533898305e-06, "loss": 1.2092, "mean_token_accuracy": 0.6992443725466728, "num_tokens": 21240873.0, "step": 26398 }, { "epoch": 6.991525423728813, "grad_norm": 2.484783887863159, "learning_rate": 6.5043697033898315e-06, "loss": 1.0104, "mean_token_accuracy": 0.7506958022713661, "num_tokens": 21242442.0, "step": 26400 }, { "epoch": 6.992055084745763, "grad_norm": 3.564552068710327, "learning_rate": 6.504104872881356e-06, "loss": 1.1043, "mean_token_accuracy": 0.7631508633494377, "num_tokens": 21243902.0, "step": 26402 }, { "epoch": 6.992584745762712, "grad_norm": 2.827718734741211, "learning_rate": 6.503840042372882e-06, "loss": 1.3748, "mean_token_accuracy": 0.6790727749466896, "num_tokens": 21245396.0, "step": 26404 }, { "epoch": 6.993114406779661, "grad_norm": 2.289917230606079, "learning_rate": 6.503575211864407e-06, "loss": 0.9645, "mean_token_accuracy": 0.7572465166449547, "num_tokens": 21247134.0, "step": 26406 }, { "epoch": 6.99364406779661, "grad_norm": 2.374155282974243, "learning_rate": 6.503310381355933e-06, "loss": 0.8924, "mean_token_accuracy": 0.7763385772705078, "num_tokens": 21248669.0, "step": 26408 }, { "epoch": 6.99417372881356, "grad_norm": 2.302264928817749, "learning_rate": 6.503045550847458e-06, "loss": 1.5414, "mean_token_accuracy": 0.680850051343441, "num_tokens": 21250304.0, "step": 26410 }, { "epoch": 6.994703389830509, "grad_norm": 2.177476644515991, "learning_rate": 6.502780720338984e-06, "loss": 1.1285, "mean_token_accuracy": 0.7199574410915375, "num_tokens": 21252119.0, "step": 26412 }, { "epoch": 6.995233050847458, "grad_norm": 2.4158506393432617, "learning_rate": 6.5025158898305085e-06, "loss": 0.9205, "mean_token_accuracy": 0.7721742242574692, "num_tokens": 21253480.0, "step": 26414 }, { "epoch": 6.995762711864407, "grad_norm": 2.1728861331939697, "learning_rate": 6.502251059322034e-06, "loss": 1.458, "mean_token_accuracy": 0.7001524157822132, "num_tokens": 21255060.0, "step": 26416 }, { "epoch": 6.996292372881356, "grad_norm": 2.8924789428710938, "learning_rate": 6.501986228813559e-06, "loss": 0.9908, "mean_token_accuracy": 0.7808263450860977, "num_tokens": 21256439.0, "step": 26418 }, { "epoch": 6.996822033898305, "grad_norm": 2.480211019515991, "learning_rate": 6.501721398305085e-06, "loss": 1.3663, "mean_token_accuracy": 0.6864387691020966, "num_tokens": 21258439.0, "step": 26420 }, { "epoch": 6.997351694915254, "grad_norm": 1.9769575595855713, "learning_rate": 6.50145656779661e-06, "loss": 1.4729, "mean_token_accuracy": 0.6837536916136742, "num_tokens": 21260055.0, "step": 26422 }, { "epoch": 6.997881355932203, "grad_norm": 1.9855512380599976, "learning_rate": 6.501191737288136e-06, "loss": 0.7264, "mean_token_accuracy": 0.8259565979242325, "num_tokens": 21261428.0, "step": 26424 }, { "epoch": 6.998411016949152, "grad_norm": 2.631139039993286, "learning_rate": 6.500926906779662e-06, "loss": 1.4326, "mean_token_accuracy": 0.6544315665960312, "num_tokens": 21263015.0, "step": 26426 }, { "epoch": 6.998940677966102, "grad_norm": 2.486102342605591, "learning_rate": 6.500662076271187e-06, "loss": 1.0159, "mean_token_accuracy": 0.7592109814286232, "num_tokens": 21264505.0, "step": 26428 }, { "epoch": 6.999470338983051, "grad_norm": 2.5363993644714355, "learning_rate": 6.500397245762713e-06, "loss": 1.3432, "mean_token_accuracy": 0.6823387965559959, "num_tokens": 21266006.0, "step": 26430 }, { "epoch": 7.0, "grad_norm": 2.314967632293701, "learning_rate": 6.500132415254238e-06, "loss": 1.3768, "mean_token_accuracy": 0.7059273943305016, "num_tokens": 21267876.0, "step": 26432 }, { "epoch": 7.000529661016949, "grad_norm": 2.2512567043304443, "learning_rate": 6.499867584745764e-06, "loss": 0.9922, "mean_token_accuracy": 0.7441974133253098, "num_tokens": 21269608.0, "step": 26434 }, { "epoch": 7.001059322033898, "grad_norm": 2.3302597999572754, "learning_rate": 6.499602754237289e-06, "loss": 0.9735, "mean_token_accuracy": 0.7479139715433121, "num_tokens": 21271095.0, "step": 26436 }, { "epoch": 7.001588983050848, "grad_norm": 2.310429096221924, "learning_rate": 6.4993379237288145e-06, "loss": 1.1239, "mean_token_accuracy": 0.7428635358810425, "num_tokens": 21272731.0, "step": 26438 }, { "epoch": 7.002118644067797, "grad_norm": 2.439568042755127, "learning_rate": 6.499073093220339e-06, "loss": 1.5375, "mean_token_accuracy": 0.6668355464935303, "num_tokens": 21274411.0, "step": 26440 }, { "epoch": 7.002648305084746, "grad_norm": 2.283634662628174, "learning_rate": 6.498808262711865e-06, "loss": 0.9402, "mean_token_accuracy": 0.7713204175233841, "num_tokens": 21276109.0, "step": 26442 }, { "epoch": 7.003177966101695, "grad_norm": 2.2653467655181885, "learning_rate": 6.49854343220339e-06, "loss": 1.0802, "mean_token_accuracy": 0.7378737851977348, "num_tokens": 21277709.0, "step": 26444 }, { "epoch": 7.0037076271186445, "grad_norm": 2.8796308040618896, "learning_rate": 6.498278601694916e-06, "loss": 1.4538, "mean_token_accuracy": 0.683975338935852, "num_tokens": 21279219.0, "step": 26446 }, { "epoch": 7.004237288135593, "grad_norm": 2.498478412628174, "learning_rate": 6.498013771186441e-06, "loss": 1.1532, "mean_token_accuracy": 0.7140674144029617, "num_tokens": 21280757.0, "step": 26448 }, { "epoch": 7.004766949152542, "grad_norm": 2.2613141536712646, "learning_rate": 6.4977489406779674e-06, "loss": 0.9728, "mean_token_accuracy": 0.7881807833909988, "num_tokens": 21282038.0, "step": 26450 }, { "epoch": 7.005296610169491, "grad_norm": 2.7611446380615234, "learning_rate": 6.4974841101694915e-06, "loss": 1.3807, "mean_token_accuracy": 0.6698266044259071, "num_tokens": 21283612.0, "step": 26452 }, { "epoch": 7.00582627118644, "grad_norm": 3.1081044673919678, "learning_rate": 6.497219279661018e-06, "loss": 1.4071, "mean_token_accuracy": 0.7175647988915443, "num_tokens": 21285225.0, "step": 26454 }, { "epoch": 7.00635593220339, "grad_norm": 3.0335440635681152, "learning_rate": 6.496954449152543e-06, "loss": 1.4241, "mean_token_accuracy": 0.661038413643837, "num_tokens": 21286669.0, "step": 26456 }, { "epoch": 7.006885593220339, "grad_norm": 2.2650012969970703, "learning_rate": 6.496689618644069e-06, "loss": 1.1533, "mean_token_accuracy": 0.750803105533123, "num_tokens": 21288098.0, "step": 26458 }, { "epoch": 7.007415254237288, "grad_norm": 2.1318273544311523, "learning_rate": 6.496424788135594e-06, "loss": 0.919, "mean_token_accuracy": 0.7769458442926407, "num_tokens": 21290066.0, "step": 26460 }, { "epoch": 7.007944915254237, "grad_norm": 1.9078689813613892, "learning_rate": 6.4961599576271195e-06, "loss": 0.8395, "mean_token_accuracy": 0.7970423623919487, "num_tokens": 21291650.0, "step": 26462 }, { "epoch": 7.008474576271187, "grad_norm": 2.7537972927093506, "learning_rate": 6.4958951271186445e-06, "loss": 1.1883, "mean_token_accuracy": 0.7228818386793137, "num_tokens": 21293324.0, "step": 26464 }, { "epoch": 7.009004237288136, "grad_norm": 2.1339809894561768, "learning_rate": 6.49563029661017e-06, "loss": 1.0557, "mean_token_accuracy": 0.7452512383460999, "num_tokens": 21295051.0, "step": 26466 }, { "epoch": 7.009533898305085, "grad_norm": 1.9622471332550049, "learning_rate": 6.495365466101695e-06, "loss": 0.9185, "mean_token_accuracy": 0.7579684481024742, "num_tokens": 21296883.0, "step": 26468 }, { "epoch": 7.010063559322034, "grad_norm": 2.93597149848938, "learning_rate": 6.495100635593221e-06, "loss": 1.3802, "mean_token_accuracy": 0.6788886860013008, "num_tokens": 21298404.0, "step": 26470 }, { "epoch": 7.010593220338983, "grad_norm": 2.4710049629211426, "learning_rate": 6.494835805084746e-06, "loss": 1.0807, "mean_token_accuracy": 0.7550135776400566, "num_tokens": 21299563.0, "step": 26472 }, { "epoch": 7.0111228813559325, "grad_norm": 2.9441418647766113, "learning_rate": 6.494570974576272e-06, "loss": 1.462, "mean_token_accuracy": 0.691793367266655, "num_tokens": 21301048.0, "step": 26474 }, { "epoch": 7.0116525423728815, "grad_norm": 2.9588849544525146, "learning_rate": 6.494306144067797e-06, "loss": 1.0269, "mean_token_accuracy": 0.7473046332597733, "num_tokens": 21302480.0, "step": 26476 }, { "epoch": 7.0121822033898304, "grad_norm": 2.4903724193573, "learning_rate": 6.494041313559322e-06, "loss": 1.069, "mean_token_accuracy": 0.7572866082191467, "num_tokens": 21304074.0, "step": 26478 }, { "epoch": 7.012711864406779, "grad_norm": 2.6321539878845215, "learning_rate": 6.493776483050847e-06, "loss": 1.732, "mean_token_accuracy": 0.6028298437595367, "num_tokens": 21305871.0, "step": 26480 }, { "epoch": 7.013241525423729, "grad_norm": 2.3140625953674316, "learning_rate": 6.493511652542374e-06, "loss": 0.9123, "mean_token_accuracy": 0.7798646911978722, "num_tokens": 21307370.0, "step": 26482 }, { "epoch": 7.013771186440678, "grad_norm": 3.0314290523529053, "learning_rate": 6.493246822033899e-06, "loss": 1.3172, "mean_token_accuracy": 0.7074770778417587, "num_tokens": 21308859.0, "step": 26484 }, { "epoch": 7.014300847457627, "grad_norm": 2.8743302822113037, "learning_rate": 6.492981991525425e-06, "loss": 1.1304, "mean_token_accuracy": 0.7497439831495285, "num_tokens": 21310126.0, "step": 26486 }, { "epoch": 7.014830508474576, "grad_norm": 2.2950873374938965, "learning_rate": 6.4927171610169496e-06, "loss": 1.1541, "mean_token_accuracy": 0.7079150378704071, "num_tokens": 21311690.0, "step": 26488 }, { "epoch": 7.015360169491525, "grad_norm": 2.0559585094451904, "learning_rate": 6.492452330508475e-06, "loss": 0.9843, "mean_token_accuracy": 0.7879000082612038, "num_tokens": 21313041.0, "step": 26490 }, { "epoch": 7.015889830508475, "grad_norm": 2.7673566341400146, "learning_rate": 6.4921875e-06, "loss": 1.2868, "mean_token_accuracy": 0.7040404826402664, "num_tokens": 21314730.0, "step": 26492 }, { "epoch": 7.016419491525424, "grad_norm": 2.8358020782470703, "learning_rate": 6.491922669491526e-06, "loss": 1.3094, "mean_token_accuracy": 0.6859803572297096, "num_tokens": 21316425.0, "step": 26494 }, { "epoch": 7.016949152542373, "grad_norm": 2.7089366912841797, "learning_rate": 6.491657838983051e-06, "loss": 1.1472, "mean_token_accuracy": 0.7225935235619545, "num_tokens": 21318148.0, "step": 26496 }, { "epoch": 7.017478813559322, "grad_norm": 2.2836713790893555, "learning_rate": 6.491393008474577e-06, "loss": 1.093, "mean_token_accuracy": 0.7267733663320541, "num_tokens": 21319961.0, "step": 26498 }, { "epoch": 7.018008474576271, "grad_norm": 2.3307888507843018, "learning_rate": 6.491128177966102e-06, "loss": 0.9692, "step": 26500 }, { "epoch": 7.018008474576271, "eval_loss": 1.3370181322097778, "eval_mean_token_accuracy": 0.6995368982096771, "eval_num_tokens": 21321911.0, "eval_runtime": 48.7242, "eval_samples_per_second": 6.321, "eval_steps_per_second": 6.321, "step": 26500 }, { "epoch": 7.018538135593221, "grad_norm": 2.713747501373291, "learning_rate": 6.4908633474576274e-06, "loss": 1.4494, "mean_token_accuracy": 0.7037721425294876, "num_tokens": 21323672.0, "step": 26502 }, { "epoch": 7.0190677966101696, "grad_norm": 2.3846936225891113, "learning_rate": 6.490598516949152e-06, "loss": 1.2765, "mean_token_accuracy": 0.7353990152478218, "num_tokens": 21325278.0, "step": 26504 }, { "epoch": 7.0195974576271185, "grad_norm": 2.2929604053497314, "learning_rate": 6.490333686440678e-06, "loss": 1.2609, "mean_token_accuracy": 0.6953203976154327, "num_tokens": 21326845.0, "step": 26506 }, { "epoch": 7.0201271186440675, "grad_norm": 3.2280352115631104, "learning_rate": 6.490068855932203e-06, "loss": 1.1103, "mean_token_accuracy": 0.7368922829627991, "num_tokens": 21328221.0, "step": 26508 }, { "epoch": 7.020656779661017, "grad_norm": 2.6253790855407715, "learning_rate": 6.48980402542373e-06, "loss": 1.0752, "mean_token_accuracy": 0.7538797035813332, "num_tokens": 21329636.0, "step": 26510 }, { "epoch": 7.021186440677966, "grad_norm": 2.7117819786071777, "learning_rate": 6.4895391949152555e-06, "loss": 1.5259, "mean_token_accuracy": 0.6483487784862518, "num_tokens": 21331423.0, "step": 26512 }, { "epoch": 7.021716101694915, "grad_norm": 2.8504106998443604, "learning_rate": 6.48927436440678e-06, "loss": 1.1699, "mean_token_accuracy": 0.7371080592274666, "num_tokens": 21332987.0, "step": 26514 }, { "epoch": 7.022245762711864, "grad_norm": 2.9526031017303467, "learning_rate": 6.489009533898306e-06, "loss": 1.3673, "mean_token_accuracy": 0.7062789499759674, "num_tokens": 21334415.0, "step": 26516 }, { "epoch": 7.022775423728813, "grad_norm": 2.6186933517456055, "learning_rate": 6.488744703389831e-06, "loss": 1.0002, "mean_token_accuracy": 0.7626661881804466, "num_tokens": 21335869.0, "step": 26518 }, { "epoch": 7.023305084745763, "grad_norm": 2.578484058380127, "learning_rate": 6.488479872881357e-06, "loss": 1.2546, "mean_token_accuracy": 0.7245556190609932, "num_tokens": 21337532.0, "step": 26520 }, { "epoch": 7.023834745762712, "grad_norm": 3.1059372425079346, "learning_rate": 6.488215042372882e-06, "loss": 1.7265, "mean_token_accuracy": 0.6281845793128014, "num_tokens": 21338967.0, "step": 26522 }, { "epoch": 7.024364406779661, "grad_norm": 3.2029781341552734, "learning_rate": 6.487950211864408e-06, "loss": 1.2369, "mean_token_accuracy": 0.7518512606620789, "num_tokens": 21340544.0, "step": 26524 }, { "epoch": 7.02489406779661, "grad_norm": 2.0394396781921387, "learning_rate": 6.4876853813559325e-06, "loss": 0.6811, "mean_token_accuracy": 0.8100931122899055, "num_tokens": 21342049.0, "step": 26526 }, { "epoch": 7.02542372881356, "grad_norm": 2.2442126274108887, "learning_rate": 6.487420550847458e-06, "loss": 1.0248, "mean_token_accuracy": 0.741440162062645, "num_tokens": 21343657.0, "step": 26528 }, { "epoch": 7.025953389830509, "grad_norm": 2.402268886566162, "learning_rate": 6.487155720338983e-06, "loss": 0.976, "mean_token_accuracy": 0.7443747445940971, "num_tokens": 21345308.0, "step": 26530 }, { "epoch": 7.026483050847458, "grad_norm": 2.2016761302948, "learning_rate": 6.486890889830509e-06, "loss": 0.9875, "mean_token_accuracy": 0.7455314621329308, "num_tokens": 21347187.0, "step": 26532 }, { "epoch": 7.027012711864407, "grad_norm": 2.1941871643066406, "learning_rate": 6.486626059322034e-06, "loss": 1.0292, "mean_token_accuracy": 0.7458584234118462, "num_tokens": 21348838.0, "step": 26534 }, { "epoch": 7.0275423728813555, "grad_norm": 2.6487531661987305, "learning_rate": 6.4863612288135606e-06, "loss": 1.1018, "mean_token_accuracy": 0.7404664531350136, "num_tokens": 21350613.0, "step": 26536 }, { "epoch": 7.028072033898305, "grad_norm": 2.450174331665039, "learning_rate": 6.4860963983050855e-06, "loss": 1.0517, "mean_token_accuracy": 0.7361838296055794, "num_tokens": 21352227.0, "step": 26538 }, { "epoch": 7.028601694915254, "grad_norm": 1.983981728553772, "learning_rate": 6.485831567796611e-06, "loss": 1.0684, "mean_token_accuracy": 0.7337170019745827, "num_tokens": 21354170.0, "step": 26540 }, { "epoch": 7.029131355932203, "grad_norm": 2.906658411026001, "learning_rate": 6.485566737288136e-06, "loss": 1.5058, "mean_token_accuracy": 0.6673719957470894, "num_tokens": 21355683.0, "step": 26542 }, { "epoch": 7.029661016949152, "grad_norm": 2.689588785171509, "learning_rate": 6.485301906779662e-06, "loss": 1.2397, "mean_token_accuracy": 0.7034421190619469, "num_tokens": 21357309.0, "step": 26544 }, { "epoch": 7.030190677966102, "grad_norm": 2.4808764457702637, "learning_rate": 6.485037076271187e-06, "loss": 1.2799, "mean_token_accuracy": 0.704557865858078, "num_tokens": 21359115.0, "step": 26546 }, { "epoch": 7.030720338983051, "grad_norm": 2.7370553016662598, "learning_rate": 6.484772245762713e-06, "loss": 1.0727, "mean_token_accuracy": 0.7426124289631844, "num_tokens": 21360476.0, "step": 26548 }, { "epoch": 7.03125, "grad_norm": 2.173957586288452, "learning_rate": 6.484507415254238e-06, "loss": 0.7232, "mean_token_accuracy": 0.8094378560781479, "num_tokens": 21362256.0, "step": 26550 }, { "epoch": 7.031779661016949, "grad_norm": 2.52384090423584, "learning_rate": 6.484242584745763e-06, "loss": 1.2887, "mean_token_accuracy": 0.7115787640213966, "num_tokens": 21363774.0, "step": 26552 }, { "epoch": 7.032309322033898, "grad_norm": 2.740963935852051, "learning_rate": 6.483977754237288e-06, "loss": 1.4658, "mean_token_accuracy": 0.6801014915108681, "num_tokens": 21365497.0, "step": 26554 }, { "epoch": 7.032838983050848, "grad_norm": 2.379408836364746, "learning_rate": 6.483712923728814e-06, "loss": 1.1959, "mean_token_accuracy": 0.7301631644368172, "num_tokens": 21367329.0, "step": 26556 }, { "epoch": 7.033368644067797, "grad_norm": 2.3752031326293945, "learning_rate": 6.483448093220339e-06, "loss": 1.4249, "mean_token_accuracy": 0.6866497918963432, "num_tokens": 21369180.0, "step": 26558 }, { "epoch": 7.033898305084746, "grad_norm": 3.0505123138427734, "learning_rate": 6.483183262711865e-06, "loss": 1.1937, "mean_token_accuracy": 0.7360123619437218, "num_tokens": 21370446.0, "step": 26560 }, { "epoch": 7.034427966101695, "grad_norm": 2.3171472549438477, "learning_rate": 6.48291843220339e-06, "loss": 0.8712, "mean_token_accuracy": 0.7662561684846878, "num_tokens": 21372737.0, "step": 26562 }, { "epoch": 7.0349576271186445, "grad_norm": 2.756500005722046, "learning_rate": 6.482653601694916e-06, "loss": 1.3595, "mean_token_accuracy": 0.7065071687102318, "num_tokens": 21374352.0, "step": 26564 }, { "epoch": 7.035487288135593, "grad_norm": 2.7397258281707764, "learning_rate": 6.4823887711864404e-06, "loss": 0.8333, "mean_token_accuracy": 0.7947534844279289, "num_tokens": 21375952.0, "step": 26566 }, { "epoch": 7.036016949152542, "grad_norm": 2.7260966300964355, "learning_rate": 6.482123940677967e-06, "loss": 1.6496, "mean_token_accuracy": 0.6354754343628883, "num_tokens": 21377497.0, "step": 26568 }, { "epoch": 7.036546610169491, "grad_norm": 2.1121902465820312, "learning_rate": 6.481859110169492e-06, "loss": 0.855, "mean_token_accuracy": 0.7673645988106728, "num_tokens": 21379165.0, "step": 26570 }, { "epoch": 7.03707627118644, "grad_norm": 2.136397123336792, "learning_rate": 6.481594279661018e-06, "loss": 1.1628, "mean_token_accuracy": 0.7188721150159836, "num_tokens": 21380854.0, "step": 26572 }, { "epoch": 7.03760593220339, "grad_norm": 2.644987106323242, "learning_rate": 6.481329449152543e-06, "loss": 1.0931, "mean_token_accuracy": 0.730604775249958, "num_tokens": 21382483.0, "step": 26574 }, { "epoch": 7.038135593220339, "grad_norm": 2.5256197452545166, "learning_rate": 6.4810646186440685e-06, "loss": 1.0727, "mean_token_accuracy": 0.7422703504562378, "num_tokens": 21383928.0, "step": 26576 }, { "epoch": 7.038665254237288, "grad_norm": 2.466836452484131, "learning_rate": 6.480799788135593e-06, "loss": 1.4788, "mean_token_accuracy": 0.6744893677532673, "num_tokens": 21385768.0, "step": 26578 }, { "epoch": 7.039194915254237, "grad_norm": 2.7066047191619873, "learning_rate": 6.480534957627119e-06, "loss": 1.1593, "mean_token_accuracy": 0.7272468507289886, "num_tokens": 21387059.0, "step": 26580 }, { "epoch": 7.039724576271187, "grad_norm": 2.7110862731933594, "learning_rate": 6.480270127118644e-06, "loss": 1.1844, "mean_token_accuracy": 0.7121548354625702, "num_tokens": 21388632.0, "step": 26582 }, { "epoch": 7.040254237288136, "grad_norm": 2.767796754837036, "learning_rate": 6.48000529661017e-06, "loss": 1.5127, "mean_token_accuracy": 0.6767934784293175, "num_tokens": 21390513.0, "step": 26584 }, { "epoch": 7.040783898305085, "grad_norm": 2.3543825149536133, "learning_rate": 6.479740466101695e-06, "loss": 1.2728, "mean_token_accuracy": 0.6924243345856667, "num_tokens": 21392258.0, "step": 26586 }, { "epoch": 7.041313559322034, "grad_norm": 2.1510207653045654, "learning_rate": 6.4794756355932206e-06, "loss": 1.3106, "mean_token_accuracy": 0.6909905672073364, "num_tokens": 21393897.0, "step": 26588 }, { "epoch": 7.041843220338983, "grad_norm": 2.855832576751709, "learning_rate": 6.4792108050847455e-06, "loss": 1.1921, "mean_token_accuracy": 0.7275940850377083, "num_tokens": 21395340.0, "step": 26590 }, { "epoch": 7.0423728813559325, "grad_norm": 2.3827669620513916, "learning_rate": 6.478945974576272e-06, "loss": 1.086, "mean_token_accuracy": 0.7382670193910599, "num_tokens": 21396982.0, "step": 26592 }, { "epoch": 7.0429025423728815, "grad_norm": 2.2527248859405518, "learning_rate": 6.478681144067798e-06, "loss": 1.0695, "mean_token_accuracy": 0.7549338713288307, "num_tokens": 21398572.0, "step": 26594 }, { "epoch": 7.0434322033898304, "grad_norm": 2.352288246154785, "learning_rate": 6.478416313559323e-06, "loss": 1.1622, "mean_token_accuracy": 0.7336920127272606, "num_tokens": 21400338.0, "step": 26596 }, { "epoch": 7.043961864406779, "grad_norm": 1.9691650867462158, "learning_rate": 6.478151483050849e-06, "loss": 0.7505, "mean_token_accuracy": 0.805206336081028, "num_tokens": 21401856.0, "step": 26598 }, { "epoch": 7.044491525423729, "grad_norm": 2.1117947101593018, "learning_rate": 6.4778866525423735e-06, "loss": 0.9919, "mean_token_accuracy": 0.7558713257312775, "num_tokens": 21403585.0, "step": 26600 }, { "epoch": 7.045021186440678, "grad_norm": 2.3874857425689697, "learning_rate": 6.477621822033899e-06, "loss": 1.2667, "mean_token_accuracy": 0.691416047513485, "num_tokens": 21405316.0, "step": 26602 }, { "epoch": 7.045550847457627, "grad_norm": 3.0821549892425537, "learning_rate": 6.477356991525424e-06, "loss": 1.0726, "mean_token_accuracy": 0.7608326822519302, "num_tokens": 21406692.0, "step": 26604 }, { "epoch": 7.046080508474576, "grad_norm": 2.159911870956421, "learning_rate": 6.47709216101695e-06, "loss": 1.0444, "mean_token_accuracy": 0.7457871660590172, "num_tokens": 21408372.0, "step": 26606 }, { "epoch": 7.046610169491525, "grad_norm": 2.5645720958709717, "learning_rate": 6.476827330508475e-06, "loss": 1.2452, "mean_token_accuracy": 0.7154807150363922, "num_tokens": 21409952.0, "step": 26608 }, { "epoch": 7.047139830508475, "grad_norm": 2.783249855041504, "learning_rate": 6.476562500000001e-06, "loss": 1.2195, "mean_token_accuracy": 0.7312350049614906, "num_tokens": 21411523.0, "step": 26610 }, { "epoch": 7.047669491525424, "grad_norm": 2.238391160964966, "learning_rate": 6.476297669491526e-06, "loss": 0.8582, "mean_token_accuracy": 0.7925676852464676, "num_tokens": 21413337.0, "step": 26612 }, { "epoch": 7.048199152542373, "grad_norm": 2.5176236629486084, "learning_rate": 6.4760328389830514e-06, "loss": 1.5913, "mean_token_accuracy": 0.6549273058772087, "num_tokens": 21414846.0, "step": 26614 }, { "epoch": 7.048728813559322, "grad_norm": 2.7711551189422607, "learning_rate": 6.475768008474576e-06, "loss": 1.5952, "mean_token_accuracy": 0.6414227113127708, "num_tokens": 21416536.0, "step": 26616 }, { "epoch": 7.049258474576271, "grad_norm": 2.605259895324707, "learning_rate": 6.475503177966103e-06, "loss": 1.2438, "mean_token_accuracy": 0.7358441688120365, "num_tokens": 21418382.0, "step": 26618 }, { "epoch": 7.049788135593221, "grad_norm": 2.5923593044281006, "learning_rate": 6.475238347457627e-06, "loss": 1.1196, "mean_token_accuracy": 0.7373368069529533, "num_tokens": 21419867.0, "step": 26620 }, { "epoch": 7.0503177966101696, "grad_norm": 2.54870867729187, "learning_rate": 6.474973516949154e-06, "loss": 1.1578, "mean_token_accuracy": 0.7183217704296112, "num_tokens": 21421889.0, "step": 26622 }, { "epoch": 7.0508474576271185, "grad_norm": 2.548433780670166, "learning_rate": 6.474708686440679e-06, "loss": 1.3775, "mean_token_accuracy": 0.6763741597533226, "num_tokens": 21423412.0, "step": 26624 }, { "epoch": 7.0513771186440675, "grad_norm": 2.40516996383667, "learning_rate": 6.474443855932204e-06, "loss": 1.0135, "mean_token_accuracy": 0.751154437661171, "num_tokens": 21424667.0, "step": 26626 }, { "epoch": 7.051906779661017, "grad_norm": 2.3974967002868652, "learning_rate": 6.474179025423729e-06, "loss": 1.0344, "mean_token_accuracy": 0.757568970322609, "num_tokens": 21426330.0, "step": 26628 }, { "epoch": 7.052436440677966, "grad_norm": 2.9053995609283447, "learning_rate": 6.473914194915255e-06, "loss": 1.0933, "mean_token_accuracy": 0.7432253211736679, "num_tokens": 21427784.0, "step": 26630 }, { "epoch": 7.052966101694915, "grad_norm": 3.065004348754883, "learning_rate": 6.47364936440678e-06, "loss": 1.1794, "mean_token_accuracy": 0.7118006274104118, "num_tokens": 21429252.0, "step": 26632 }, { "epoch": 7.053495762711864, "grad_norm": 2.980954170227051, "learning_rate": 6.473384533898306e-06, "loss": 1.458, "mean_token_accuracy": 0.6784924939274788, "num_tokens": 21430672.0, "step": 26634 }, { "epoch": 7.054025423728813, "grad_norm": 3.1543359756469727, "learning_rate": 6.473119703389831e-06, "loss": 1.6261, "mean_token_accuracy": 0.6635489463806152, "num_tokens": 21432249.0, "step": 26636 }, { "epoch": 7.054555084745763, "grad_norm": 1.939758539199829, "learning_rate": 6.4728548728813565e-06, "loss": 1.2619, "mean_token_accuracy": 0.7236454039812088, "num_tokens": 21433822.0, "step": 26638 }, { "epoch": 7.055084745762712, "grad_norm": 3.051823854446411, "learning_rate": 6.4725900423728814e-06, "loss": 1.1789, "mean_token_accuracy": 0.7382388114929199, "num_tokens": 21435397.0, "step": 26640 }, { "epoch": 7.055614406779661, "grad_norm": 2.5154201984405518, "learning_rate": 6.472325211864407e-06, "loss": 1.0548, "mean_token_accuracy": 0.7270970940589905, "num_tokens": 21436859.0, "step": 26642 }, { "epoch": 7.05614406779661, "grad_norm": 2.5219266414642334, "learning_rate": 6.472060381355932e-06, "loss": 0.8114, "mean_token_accuracy": 0.7798869907855988, "num_tokens": 21438529.0, "step": 26644 }, { "epoch": 7.05667372881356, "grad_norm": 2.422269821166992, "learning_rate": 6.471795550847459e-06, "loss": 1.3159, "mean_token_accuracy": 0.7037319466471672, "num_tokens": 21439983.0, "step": 26646 }, { "epoch": 7.057203389830509, "grad_norm": 2.3600966930389404, "learning_rate": 6.471530720338983e-06, "loss": 0.7772, "mean_token_accuracy": 0.7924789413809776, "num_tokens": 21441531.0, "step": 26648 }, { "epoch": 7.057733050847458, "grad_norm": 2.9197099208831787, "learning_rate": 6.4712658898305095e-06, "loss": 1.2123, "mean_token_accuracy": 0.721599206328392, "num_tokens": 21443186.0, "step": 26650 }, { "epoch": 7.058262711864407, "grad_norm": 2.6554789543151855, "learning_rate": 6.471001059322034e-06, "loss": 1.2274, "mean_token_accuracy": 0.6981068700551987, "num_tokens": 21444650.0, "step": 26652 }, { "epoch": 7.0587923728813555, "grad_norm": 3.2978334426879883, "learning_rate": 6.47073622881356e-06, "loss": 1.4599, "mean_token_accuracy": 0.6666183695197105, "num_tokens": 21445983.0, "step": 26654 }, { "epoch": 7.059322033898305, "grad_norm": 2.7357285022735596, "learning_rate": 6.470471398305085e-06, "loss": 0.9069, "mean_token_accuracy": 0.7855026870965958, "num_tokens": 21447237.0, "step": 26656 }, { "epoch": 7.059851694915254, "grad_norm": 2.3606061935424805, "learning_rate": 6.470206567796611e-06, "loss": 1.3535, "mean_token_accuracy": 0.7013100273907185, "num_tokens": 21448663.0, "step": 26658 }, { "epoch": 7.060381355932203, "grad_norm": 3.058289051055908, "learning_rate": 6.469941737288136e-06, "loss": 1.2916, "mean_token_accuracy": 0.6905698329210281, "num_tokens": 21450072.0, "step": 26660 }, { "epoch": 7.060911016949152, "grad_norm": 2.6484475135803223, "learning_rate": 6.469676906779662e-06, "loss": 1.104, "mean_token_accuracy": 0.730995774269104, "num_tokens": 21451629.0, "step": 26662 }, { "epoch": 7.061440677966102, "grad_norm": 2.055525064468384, "learning_rate": 6.4694120762711865e-06, "loss": 1.0835, "mean_token_accuracy": 0.7441528141498566, "num_tokens": 21453425.0, "step": 26664 }, { "epoch": 7.061970338983051, "grad_norm": 2.5352039337158203, "learning_rate": 6.469147245762712e-06, "loss": 1.0061, "mean_token_accuracy": 0.7588429376482964, "num_tokens": 21455070.0, "step": 26666 }, { "epoch": 7.0625, "grad_norm": 1.923948884010315, "learning_rate": 6.468882415254237e-06, "loss": 1.1857, "mean_token_accuracy": 0.7564429715275764, "num_tokens": 21457301.0, "step": 26668 }, { "epoch": 7.063029661016949, "grad_norm": 2.44252347946167, "learning_rate": 6.468617584745763e-06, "loss": 1.0538, "mean_token_accuracy": 0.73406583070755, "num_tokens": 21458886.0, "step": 26670 }, { "epoch": 7.063559322033898, "grad_norm": 3.0488216876983643, "learning_rate": 6.468352754237288e-06, "loss": 1.4752, "mean_token_accuracy": 0.6612076088786125, "num_tokens": 21460374.0, "step": 26672 }, { "epoch": 7.064088983050848, "grad_norm": 1.997885823249817, "learning_rate": 6.468087923728814e-06, "loss": 1.0436, "mean_token_accuracy": 0.7474479600787163, "num_tokens": 21461937.0, "step": 26674 }, { "epoch": 7.064618644067797, "grad_norm": 2.9677860736846924, "learning_rate": 6.467823093220339e-06, "loss": 1.0247, "mean_token_accuracy": 0.7528651803731918, "num_tokens": 21463526.0, "step": 26676 }, { "epoch": 7.065148305084746, "grad_norm": 2.825885057449341, "learning_rate": 6.467558262711865e-06, "loss": 1.0237, "mean_token_accuracy": 0.7534766122698784, "num_tokens": 21464852.0, "step": 26678 }, { "epoch": 7.065677966101695, "grad_norm": 2.519731283187866, "learning_rate": 6.467293432203391e-06, "loss": 0.9589, "mean_token_accuracy": 0.7515283301472664, "num_tokens": 21466241.0, "step": 26680 }, { "epoch": 7.0662076271186445, "grad_norm": 2.8052144050598145, "learning_rate": 6.467028601694916e-06, "loss": 1.3116, "mean_token_accuracy": 0.7288357689976692, "num_tokens": 21467782.0, "step": 26682 }, { "epoch": 7.066737288135593, "grad_norm": 2.769533157348633, "learning_rate": 6.466763771186442e-06, "loss": 1.2983, "mean_token_accuracy": 0.7252851277589798, "num_tokens": 21469201.0, "step": 26684 }, { "epoch": 7.067266949152542, "grad_norm": 3.0696914196014404, "learning_rate": 6.466498940677967e-06, "loss": 0.9112, "mean_token_accuracy": 0.7786171659827232, "num_tokens": 21470600.0, "step": 26686 }, { "epoch": 7.067796610169491, "grad_norm": 3.116419553756714, "learning_rate": 6.4662341101694924e-06, "loss": 1.0539, "mean_token_accuracy": 0.7506025284528732, "num_tokens": 21471971.0, "step": 26688 }, { "epoch": 7.06832627118644, "grad_norm": 2.1975629329681396, "learning_rate": 6.465969279661017e-06, "loss": 1.0696, "mean_token_accuracy": 0.7606344372034073, "num_tokens": 21473461.0, "step": 26690 }, { "epoch": 7.06885593220339, "grad_norm": 3.028424024581909, "learning_rate": 6.465704449152543e-06, "loss": 1.2006, "mean_token_accuracy": 0.7226060032844543, "num_tokens": 21475090.0, "step": 26692 }, { "epoch": 7.069385593220339, "grad_norm": 2.1905272006988525, "learning_rate": 6.465439618644068e-06, "loss": 1.3233, "mean_token_accuracy": 0.6927661970257759, "num_tokens": 21476683.0, "step": 26694 }, { "epoch": 7.069915254237288, "grad_norm": 2.2118589878082275, "learning_rate": 6.465174788135594e-06, "loss": 1.2001, "mean_token_accuracy": 0.7046578824520111, "num_tokens": 21479192.0, "step": 26696 }, { "epoch": 7.070444915254237, "grad_norm": 2.0348052978515625, "learning_rate": 6.464909957627119e-06, "loss": 1.1264, "mean_token_accuracy": 0.7554331757128239, "num_tokens": 21480993.0, "step": 26698 }, { "epoch": 7.070974576271187, "grad_norm": 2.402867555618286, "learning_rate": 6.464645127118645e-06, "loss": 1.1798, "mean_token_accuracy": 0.7170394062995911, "num_tokens": 21482618.0, "step": 26700 }, { "epoch": 7.071504237288136, "grad_norm": 3.020310878753662, "learning_rate": 6.4643802966101695e-06, "loss": 1.0993, "mean_token_accuracy": 0.7280499264597893, "num_tokens": 21484542.0, "step": 26702 }, { "epoch": 7.072033898305085, "grad_norm": 3.346006393432617, "learning_rate": 6.464115466101696e-06, "loss": 1.5191, "mean_token_accuracy": 0.6744872108101845, "num_tokens": 21485985.0, "step": 26704 }, { "epoch": 7.072563559322034, "grad_norm": 2.992771863937378, "learning_rate": 6.463850635593221e-06, "loss": 1.7001, "mean_token_accuracy": 0.628825880587101, "num_tokens": 21487657.0, "step": 26706 }, { "epoch": 7.073093220338983, "grad_norm": 2.9440150260925293, "learning_rate": 6.463585805084747e-06, "loss": 0.8936, "mean_token_accuracy": 0.7773099094629288, "num_tokens": 21489130.0, "step": 26708 }, { "epoch": 7.0736228813559325, "grad_norm": 2.1397228240966797, "learning_rate": 6.463320974576272e-06, "loss": 1.1094, "mean_token_accuracy": 0.7287123799324036, "num_tokens": 21490840.0, "step": 26710 }, { "epoch": 7.0741525423728815, "grad_norm": 3.059497594833374, "learning_rate": 6.4630561440677975e-06, "loss": 1.116, "mean_token_accuracy": 0.743734709918499, "num_tokens": 21492350.0, "step": 26712 }, { "epoch": 7.0746822033898304, "grad_norm": 2.585263967514038, "learning_rate": 6.4627913135593225e-06, "loss": 1.1768, "mean_token_accuracy": 0.7278474494814873, "num_tokens": 21493965.0, "step": 26714 }, { "epoch": 7.075211864406779, "grad_norm": 1.8861703872680664, "learning_rate": 6.462526483050848e-06, "loss": 0.7938, "mean_token_accuracy": 0.7863008752465248, "num_tokens": 21495743.0, "step": 26716 }, { "epoch": 7.075741525423729, "grad_norm": 2.399618625640869, "learning_rate": 6.462261652542373e-06, "loss": 1.3636, "mean_token_accuracy": 0.7154310345649719, "num_tokens": 21497461.0, "step": 26718 }, { "epoch": 7.076271186440678, "grad_norm": 2.3295512199401855, "learning_rate": 6.461996822033899e-06, "loss": 1.1763, "mean_token_accuracy": 0.735441155731678, "num_tokens": 21498985.0, "step": 26720 }, { "epoch": 7.076800847457627, "grad_norm": 2.446282148361206, "learning_rate": 6.461731991525424e-06, "loss": 1.4682, "mean_token_accuracy": 0.6551605015993118, "num_tokens": 21501258.0, "step": 26722 }, { "epoch": 7.077330508474576, "grad_norm": 2.371246576309204, "learning_rate": 6.46146716101695e-06, "loss": 1.0568, "mean_token_accuracy": 0.7519754841923714, "num_tokens": 21502835.0, "step": 26724 }, { "epoch": 7.077860169491525, "grad_norm": 2.289330005645752, "learning_rate": 6.4612023305084746e-06, "loss": 1.149, "mean_token_accuracy": 0.731323704123497, "num_tokens": 21504585.0, "step": 26726 }, { "epoch": 7.078389830508475, "grad_norm": 2.5126521587371826, "learning_rate": 6.4609375e-06, "loss": 1.1526, "mean_token_accuracy": 0.715895876288414, "num_tokens": 21505986.0, "step": 26728 }, { "epoch": 7.078919491525424, "grad_norm": 2.803828477859497, "learning_rate": 6.460672669491525e-06, "loss": 1.2531, "mean_token_accuracy": 0.7238163501024246, "num_tokens": 21507681.0, "step": 26730 }, { "epoch": 7.079449152542373, "grad_norm": 2.5087969303131104, "learning_rate": 6.460407838983052e-06, "loss": 0.8265, "mean_token_accuracy": 0.775347113609314, "num_tokens": 21509237.0, "step": 26732 }, { "epoch": 7.079978813559322, "grad_norm": 2.7156288623809814, "learning_rate": 6.460143008474577e-06, "loss": 1.3334, "mean_token_accuracy": 0.6916378065943718, "num_tokens": 21510982.0, "step": 26734 }, { "epoch": 7.080508474576271, "grad_norm": 2.0141971111297607, "learning_rate": 6.459878177966103e-06, "loss": 1.0384, "mean_token_accuracy": 0.7416237220168114, "num_tokens": 21512790.0, "step": 26736 }, { "epoch": 7.081038135593221, "grad_norm": 2.4421775341033936, "learning_rate": 6.4596133474576275e-06, "loss": 0.8602, "mean_token_accuracy": 0.8054165840148926, "num_tokens": 21514308.0, "step": 26738 }, { "epoch": 7.0815677966101696, "grad_norm": 2.4072422981262207, "learning_rate": 6.459348516949153e-06, "loss": 1.2978, "mean_token_accuracy": 0.701170489192009, "num_tokens": 21516223.0, "step": 26740 }, { "epoch": 7.0820974576271185, "grad_norm": 2.655742883682251, "learning_rate": 6.459083686440678e-06, "loss": 1.0427, "mean_token_accuracy": 0.7539222463965416, "num_tokens": 21517835.0, "step": 26742 }, { "epoch": 7.0826271186440675, "grad_norm": 2.732290267944336, "learning_rate": 6.458818855932204e-06, "loss": 1.4832, "mean_token_accuracy": 0.6730398610234261, "num_tokens": 21519543.0, "step": 26744 }, { "epoch": 7.083156779661017, "grad_norm": 2.7660486698150635, "learning_rate": 6.458554025423729e-06, "loss": 0.8022, "mean_token_accuracy": 0.7833606377243996, "num_tokens": 21520835.0, "step": 26746 }, { "epoch": 7.083686440677966, "grad_norm": 1.9329473972320557, "learning_rate": 6.458289194915255e-06, "loss": 0.8146, "mean_token_accuracy": 0.8006101623177528, "num_tokens": 21522472.0, "step": 26748 }, { "epoch": 7.084216101694915, "grad_norm": 2.940303325653076, "learning_rate": 6.45802436440678e-06, "loss": 1.1361, "step": 26750 }, { "epoch": 7.084216101694915, "eval_loss": 1.3420358896255493, "eval_mean_token_accuracy": 0.6983506552778281, "eval_num_tokens": 21523933.0, "eval_runtime": 48.6714, "eval_samples_per_second": 6.328, "eval_steps_per_second": 6.328, "step": 26750 }, { "epoch": 7.084745762711864, "grad_norm": 3.0417470932006836, "learning_rate": 6.4577595338983054e-06, "loss": 0.7731, "mean_token_accuracy": 0.7697185538709164, "num_tokens": 21525349.0, "step": 26752 }, { "epoch": 7.085275423728813, "grad_norm": 2.8196754455566406, "learning_rate": 6.45749470338983e-06, "loss": 1.0597, "mean_token_accuracy": 0.7359176352620125, "num_tokens": 21526937.0, "step": 26754 }, { "epoch": 7.085805084745763, "grad_norm": 2.678997278213501, "learning_rate": 6.457229872881356e-06, "loss": 1.2168, "mean_token_accuracy": 0.7305383905768394, "num_tokens": 21528446.0, "step": 26756 }, { "epoch": 7.086334745762712, "grad_norm": 2.3675949573516846, "learning_rate": 6.456965042372881e-06, "loss": 1.0281, "mean_token_accuracy": 0.7593961134552956, "num_tokens": 21530133.0, "step": 26758 }, { "epoch": 7.086864406779661, "grad_norm": 2.425644636154175, "learning_rate": 6.456700211864408e-06, "loss": 1.2389, "mean_token_accuracy": 0.7131887078285217, "num_tokens": 21531659.0, "step": 26760 }, { "epoch": 7.08739406779661, "grad_norm": 2.6665122509002686, "learning_rate": 6.4564353813559335e-06, "loss": 1.3223, "mean_token_accuracy": 0.7038193382322788, "num_tokens": 21533393.0, "step": 26762 }, { "epoch": 7.08792372881356, "grad_norm": 2.600581645965576, "learning_rate": 6.456170550847458e-06, "loss": 0.8956, "mean_token_accuracy": 0.7740781009197235, "num_tokens": 21535139.0, "step": 26764 }, { "epoch": 7.088453389830509, "grad_norm": 2.925448417663574, "learning_rate": 6.455905720338984e-06, "loss": 1.0102, "mean_token_accuracy": 0.7474380061030388, "num_tokens": 21536731.0, "step": 26766 }, { "epoch": 7.088983050847458, "grad_norm": 2.2946324348449707, "learning_rate": 6.455640889830509e-06, "loss": 1.1, "mean_token_accuracy": 0.7437760978937149, "num_tokens": 21538668.0, "step": 26768 }, { "epoch": 7.089512711864407, "grad_norm": 2.515760898590088, "learning_rate": 6.455376059322035e-06, "loss": 0.7208, "mean_token_accuracy": 0.8080434277653694, "num_tokens": 21540078.0, "step": 26770 }, { "epoch": 7.0900423728813555, "grad_norm": 2.417555570602417, "learning_rate": 6.45511122881356e-06, "loss": 1.1425, "mean_token_accuracy": 0.731530636548996, "num_tokens": 21541912.0, "step": 26772 }, { "epoch": 7.090572033898305, "grad_norm": 2.372753858566284, "learning_rate": 6.4548463983050856e-06, "loss": 1.3709, "mean_token_accuracy": 0.6893001645803452, "num_tokens": 21543477.0, "step": 26774 }, { "epoch": 7.091101694915254, "grad_norm": 2.513692855834961, "learning_rate": 6.4545815677966105e-06, "loss": 1.2389, "mean_token_accuracy": 0.6883092597126961, "num_tokens": 21545170.0, "step": 26776 }, { "epoch": 7.091631355932203, "grad_norm": 2.1799843311309814, "learning_rate": 6.454316737288136e-06, "loss": 0.9407, "mean_token_accuracy": 0.7513938397169113, "num_tokens": 21546989.0, "step": 26778 }, { "epoch": 7.092161016949152, "grad_norm": 2.99222993850708, "learning_rate": 6.454051906779661e-06, "loss": 1.4144, "mean_token_accuracy": 0.6843565702438354, "num_tokens": 21548549.0, "step": 26780 }, { "epoch": 7.092690677966102, "grad_norm": 3.4864706993103027, "learning_rate": 6.453787076271187e-06, "loss": 1.5161, "mean_token_accuracy": 0.6558195725083351, "num_tokens": 21550188.0, "step": 26782 }, { "epoch": 7.093220338983051, "grad_norm": 2.7644808292388916, "learning_rate": 6.453522245762712e-06, "loss": 1.3336, "mean_token_accuracy": 0.6897201910614967, "num_tokens": 21551885.0, "step": 26784 }, { "epoch": 7.09375, "grad_norm": 2.6292097568511963, "learning_rate": 6.4532574152542385e-06, "loss": 1.2796, "mean_token_accuracy": 0.6973868533968925, "num_tokens": 21553471.0, "step": 26786 }, { "epoch": 7.094279661016949, "grad_norm": 2.715446710586548, "learning_rate": 6.4529925847457635e-06, "loss": 1.2777, "mean_token_accuracy": 0.7158639505505562, "num_tokens": 21555358.0, "step": 26788 }, { "epoch": 7.094809322033898, "grad_norm": 2.5559091567993164, "learning_rate": 6.452727754237289e-06, "loss": 0.963, "mean_token_accuracy": 0.753350704908371, "num_tokens": 21557120.0, "step": 26790 }, { "epoch": 7.095338983050848, "grad_norm": 2.843101739883423, "learning_rate": 6.452462923728814e-06, "loss": 1.2353, "mean_token_accuracy": 0.7124655693769455, "num_tokens": 21558731.0, "step": 26792 }, { "epoch": 7.095868644067797, "grad_norm": 3.1530373096466064, "learning_rate": 6.45219809322034e-06, "loss": 0.9286, "mean_token_accuracy": 0.7586077004671097, "num_tokens": 21560460.0, "step": 26794 }, { "epoch": 7.096398305084746, "grad_norm": 2.7259678840637207, "learning_rate": 6.451933262711865e-06, "loss": 1.0177, "mean_token_accuracy": 0.7487714886665344, "num_tokens": 21561863.0, "step": 26796 }, { "epoch": 7.096927966101695, "grad_norm": 2.4459972381591797, "learning_rate": 6.451668432203391e-06, "loss": 1.2265, "mean_token_accuracy": 0.7108296006917953, "num_tokens": 21563711.0, "step": 26798 }, { "epoch": 7.0974576271186445, "grad_norm": 2.5429067611694336, "learning_rate": 6.451403601694916e-06, "loss": 1.1742, "mean_token_accuracy": 0.7392092049121857, "num_tokens": 21565169.0, "step": 26800 }, { "epoch": 7.097987288135593, "grad_norm": 2.0702905654907227, "learning_rate": 6.451138771186441e-06, "loss": 1.1384, "mean_token_accuracy": 0.7423236593604088, "num_tokens": 21567088.0, "step": 26802 }, { "epoch": 7.098516949152542, "grad_norm": 2.7290079593658447, "learning_rate": 6.450873940677966e-06, "loss": 1.8317, "mean_token_accuracy": 0.5907640010118484, "num_tokens": 21568805.0, "step": 26804 }, { "epoch": 7.099046610169491, "grad_norm": 2.719630718231201, "learning_rate": 6.450609110169492e-06, "loss": 1.2653, "mean_token_accuracy": 0.716310128569603, "num_tokens": 21570260.0, "step": 26806 }, { "epoch": 7.09957627118644, "grad_norm": 2.7198877334594727, "learning_rate": 6.450344279661017e-06, "loss": 1.2223, "mean_token_accuracy": 0.7096365243196487, "num_tokens": 21571932.0, "step": 26808 }, { "epoch": 7.10010593220339, "grad_norm": 3.0924506187438965, "learning_rate": 6.450079449152543e-06, "loss": 1.0548, "mean_token_accuracy": 0.7430363222956657, "num_tokens": 21573360.0, "step": 26810 }, { "epoch": 7.100635593220339, "grad_norm": 2.5506742000579834, "learning_rate": 6.449814618644068e-06, "loss": 0.8951, "mean_token_accuracy": 0.7644711583852768, "num_tokens": 21574830.0, "step": 26812 }, { "epoch": 7.101165254237288, "grad_norm": 2.693948745727539, "learning_rate": 6.449549788135594e-06, "loss": 1.3894, "mean_token_accuracy": 0.6975224316120148, "num_tokens": 21576745.0, "step": 26814 }, { "epoch": 7.101694915254237, "grad_norm": 2.7026326656341553, "learning_rate": 6.449284957627118e-06, "loss": 1.1679, "mean_token_accuracy": 0.707119345664978, "num_tokens": 21578589.0, "step": 26816 }, { "epoch": 7.102224576271187, "grad_norm": 2.5227057933807373, "learning_rate": 6.449020127118645e-06, "loss": 1.2611, "mean_token_accuracy": 0.7169991359114647, "num_tokens": 21580337.0, "step": 26818 }, { "epoch": 7.102754237288136, "grad_norm": 2.869248390197754, "learning_rate": 6.44875529661017e-06, "loss": 1.0982, "mean_token_accuracy": 0.7401283234357834, "num_tokens": 21582168.0, "step": 26820 }, { "epoch": 7.103283898305085, "grad_norm": 2.30981707572937, "learning_rate": 6.448490466101696e-06, "loss": 1.278, "mean_token_accuracy": 0.7192185670137405, "num_tokens": 21583833.0, "step": 26822 }, { "epoch": 7.103813559322034, "grad_norm": 2.2401044368743896, "learning_rate": 6.448225635593221e-06, "loss": 1.2777, "mean_token_accuracy": 0.7024229094386101, "num_tokens": 21585616.0, "step": 26824 }, { "epoch": 7.104343220338983, "grad_norm": 2.185063600540161, "learning_rate": 6.4479608050847464e-06, "loss": 1.0894, "mean_token_accuracy": 0.7391640543937683, "num_tokens": 21587125.0, "step": 26826 }, { "epoch": 7.1048728813559325, "grad_norm": 2.7199277877807617, "learning_rate": 6.447695974576271e-06, "loss": 1.2813, "mean_token_accuracy": 0.7020227313041687, "num_tokens": 21588586.0, "step": 26828 }, { "epoch": 7.1054025423728815, "grad_norm": 2.881599187850952, "learning_rate": 6.447431144067797e-06, "loss": 1.2895, "mean_token_accuracy": 0.7084599435329437, "num_tokens": 21590008.0, "step": 26830 }, { "epoch": 7.1059322033898304, "grad_norm": 1.815999984741211, "learning_rate": 6.447166313559322e-06, "loss": 1.088, "mean_token_accuracy": 0.7404539361596107, "num_tokens": 21591656.0, "step": 26832 }, { "epoch": 7.106461864406779, "grad_norm": 2.5038928985595703, "learning_rate": 6.446901483050848e-06, "loss": 1.0517, "mean_token_accuracy": 0.7474530339241028, "num_tokens": 21593105.0, "step": 26834 }, { "epoch": 7.106991525423729, "grad_norm": 2.3259127140045166, "learning_rate": 6.446636652542373e-06, "loss": 0.8699, "mean_token_accuracy": 0.7832369208335876, "num_tokens": 21594746.0, "step": 26836 }, { "epoch": 7.107521186440678, "grad_norm": 2.4404261112213135, "learning_rate": 6.4463718220338986e-06, "loss": 1.2486, "mean_token_accuracy": 0.7374923974275589, "num_tokens": 21596442.0, "step": 26838 }, { "epoch": 7.108050847457627, "grad_norm": 2.2927043437957764, "learning_rate": 6.4461069915254235e-06, "loss": 1.1924, "mean_token_accuracy": 0.7347875311970711, "num_tokens": 21598737.0, "step": 26840 }, { "epoch": 7.108580508474576, "grad_norm": 2.9971086978912354, "learning_rate": 6.44584216101695e-06, "loss": 1.2468, "mean_token_accuracy": 0.6959849298000336, "num_tokens": 21600344.0, "step": 26842 }, { "epoch": 7.109110169491525, "grad_norm": 2.5044281482696533, "learning_rate": 6.445577330508474e-06, "loss": 1.4093, "mean_token_accuracy": 0.6755383983254433, "num_tokens": 21602018.0, "step": 26844 }, { "epoch": 7.109639830508475, "grad_norm": 2.3847718238830566, "learning_rate": 6.445312500000001e-06, "loss": 1.0521, "mean_token_accuracy": 0.7302493155002594, "num_tokens": 21603637.0, "step": 26846 }, { "epoch": 7.110169491525424, "grad_norm": 2.6260502338409424, "learning_rate": 6.445047669491527e-06, "loss": 1.1347, "mean_token_accuracy": 0.7380197048187256, "num_tokens": 21605063.0, "step": 26848 }, { "epoch": 7.110699152542373, "grad_norm": 2.28983211517334, "learning_rate": 6.4447828389830515e-06, "loss": 1.119, "mean_token_accuracy": 0.7538326010107994, "num_tokens": 21606604.0, "step": 26850 }, { "epoch": 7.111228813559322, "grad_norm": 1.9811947345733643, "learning_rate": 6.444518008474577e-06, "loss": 0.9048, "mean_token_accuracy": 0.7798539698123932, "num_tokens": 21608310.0, "step": 26852 }, { "epoch": 7.111758474576271, "grad_norm": 2.5757246017456055, "learning_rate": 6.444253177966102e-06, "loss": 1.4728, "mean_token_accuracy": 0.6616311259567738, "num_tokens": 21610457.0, "step": 26854 }, { "epoch": 7.112288135593221, "grad_norm": 2.6637701988220215, "learning_rate": 6.443988347457628e-06, "loss": 0.9392, "mean_token_accuracy": 0.7732030972838402, "num_tokens": 21611926.0, "step": 26856 }, { "epoch": 7.1128177966101696, "grad_norm": 2.179114818572998, "learning_rate": 6.443723516949153e-06, "loss": 1.0184, "mean_token_accuracy": 0.764917366206646, "num_tokens": 21613433.0, "step": 26858 }, { "epoch": 7.1133474576271185, "grad_norm": 2.149400234222412, "learning_rate": 6.443458686440679e-06, "loss": 0.983, "mean_token_accuracy": 0.7412888184189796, "num_tokens": 21615281.0, "step": 26860 }, { "epoch": 7.1138771186440675, "grad_norm": 2.886213541030884, "learning_rate": 6.443193855932204e-06, "loss": 0.9161, "mean_token_accuracy": 0.7760968059301376, "num_tokens": 21616707.0, "step": 26862 }, { "epoch": 7.114406779661017, "grad_norm": 2.3785176277160645, "learning_rate": 6.442929025423729e-06, "loss": 0.8446, "mean_token_accuracy": 0.786267913877964, "num_tokens": 21618156.0, "step": 26864 }, { "epoch": 7.114936440677966, "grad_norm": 2.822526454925537, "learning_rate": 6.442664194915254e-06, "loss": 1.276, "mean_token_accuracy": 0.7143800929188728, "num_tokens": 21619545.0, "step": 26866 }, { "epoch": 7.115466101694915, "grad_norm": 2.7790122032165527, "learning_rate": 6.442399364406781e-06, "loss": 1.0295, "mean_token_accuracy": 0.75240508466959, "num_tokens": 21621284.0, "step": 26868 }, { "epoch": 7.115995762711864, "grad_norm": 2.534371852874756, "learning_rate": 6.442134533898305e-06, "loss": 0.8483, "mean_token_accuracy": 0.7925033122301102, "num_tokens": 21622617.0, "step": 26870 }, { "epoch": 7.116525423728813, "grad_norm": 2.1837713718414307, "learning_rate": 6.441869703389832e-06, "loss": 0.9333, "mean_token_accuracy": 0.7857279777526855, "num_tokens": 21624133.0, "step": 26872 }, { "epoch": 7.117055084745763, "grad_norm": 2.330923080444336, "learning_rate": 6.441604872881357e-06, "loss": 0.9198, "mean_token_accuracy": 0.7761629298329353, "num_tokens": 21625759.0, "step": 26874 }, { "epoch": 7.117584745762712, "grad_norm": 2.670037269592285, "learning_rate": 6.441340042372882e-06, "loss": 1.301, "mean_token_accuracy": 0.7045377492904663, "num_tokens": 21627214.0, "step": 26876 }, { "epoch": 7.118114406779661, "grad_norm": 2.4123587608337402, "learning_rate": 6.441075211864407e-06, "loss": 1.2636, "mean_token_accuracy": 0.7091714516282082, "num_tokens": 21628933.0, "step": 26878 }, { "epoch": 7.11864406779661, "grad_norm": 2.9270713329315186, "learning_rate": 6.440810381355933e-06, "loss": 1.2588, "mean_token_accuracy": 0.7046929001808167, "num_tokens": 21630601.0, "step": 26880 }, { "epoch": 7.11917372881356, "grad_norm": 2.5503668785095215, "learning_rate": 6.440545550847458e-06, "loss": 1.0211, "mean_token_accuracy": 0.7482756748795509, "num_tokens": 21632122.0, "step": 26882 }, { "epoch": 7.119703389830509, "grad_norm": 3.226991653442383, "learning_rate": 6.440280720338984e-06, "loss": 0.9895, "mean_token_accuracy": 0.7497764676809311, "num_tokens": 21633638.0, "step": 26884 }, { "epoch": 7.120233050847458, "grad_norm": 3.279815912246704, "learning_rate": 6.440015889830509e-06, "loss": 1.4157, "mean_token_accuracy": 0.6886674016714096, "num_tokens": 21635228.0, "step": 26886 }, { "epoch": 7.120762711864407, "grad_norm": 2.606389284133911, "learning_rate": 6.4397510593220345e-06, "loss": 1.1236, "mean_token_accuracy": 0.7482115402817726, "num_tokens": 21636846.0, "step": 26888 }, { "epoch": 7.1212923728813555, "grad_norm": 2.5361177921295166, "learning_rate": 6.439486228813559e-06, "loss": 0.9263, "mean_token_accuracy": 0.7673278823494911, "num_tokens": 21638746.0, "step": 26890 }, { "epoch": 7.121822033898305, "grad_norm": 1.9630627632141113, "learning_rate": 6.439221398305085e-06, "loss": 0.6283, "mean_token_accuracy": 0.8290792852640152, "num_tokens": 21640173.0, "step": 26892 }, { "epoch": 7.122351694915254, "grad_norm": 2.211052656173706, "learning_rate": 6.43895656779661e-06, "loss": 1.2892, "mean_token_accuracy": 0.707987830042839, "num_tokens": 21641860.0, "step": 26894 }, { "epoch": 7.122881355932203, "grad_norm": 2.883521795272827, "learning_rate": 6.438691737288137e-06, "loss": 1.0988, "mean_token_accuracy": 0.7454224601387978, "num_tokens": 21643378.0, "step": 26896 }, { "epoch": 7.123411016949152, "grad_norm": 3.014331579208374, "learning_rate": 6.438426906779661e-06, "loss": 1.2492, "mean_token_accuracy": 0.7024533301591873, "num_tokens": 21644824.0, "step": 26898 }, { "epoch": 7.123940677966102, "grad_norm": 2.892160415649414, "learning_rate": 6.4381620762711875e-06, "loss": 1.0178, "mean_token_accuracy": 0.743654303252697, "num_tokens": 21646314.0, "step": 26900 }, { "epoch": 7.124470338983051, "grad_norm": 2.139554500579834, "learning_rate": 6.437897245762712e-06, "loss": 1.1585, "mean_token_accuracy": 0.7436264529824257, "num_tokens": 21648217.0, "step": 26902 }, { "epoch": 7.125, "grad_norm": 2.0807857513427734, "learning_rate": 6.437632415254238e-06, "loss": 0.9859, "mean_token_accuracy": 0.7299174144864082, "num_tokens": 21649817.0, "step": 26904 }, { "epoch": 7.125529661016949, "grad_norm": 2.9551138877868652, "learning_rate": 6.437367584745763e-06, "loss": 1.3918, "mean_token_accuracy": 0.701573770493269, "num_tokens": 21651757.0, "step": 26906 }, { "epoch": 7.126059322033898, "grad_norm": 3.0265040397644043, "learning_rate": 6.437102754237289e-06, "loss": 1.3481, "mean_token_accuracy": 0.6963168084621429, "num_tokens": 21653406.0, "step": 26908 }, { "epoch": 7.126588983050848, "grad_norm": 2.75980544090271, "learning_rate": 6.436837923728814e-06, "loss": 1.0415, "mean_token_accuracy": 0.7517974004149437, "num_tokens": 21654969.0, "step": 26910 }, { "epoch": 7.127118644067797, "grad_norm": 2.5636208057403564, "learning_rate": 6.4365730932203396e-06, "loss": 1.1966, "mean_token_accuracy": 0.7315869852900505, "num_tokens": 21656579.0, "step": 26912 }, { "epoch": 7.127648305084746, "grad_norm": 3.6390459537506104, "learning_rate": 6.4363082627118645e-06, "loss": 1.3798, "mean_token_accuracy": 0.6964506059885025, "num_tokens": 21657932.0, "step": 26914 }, { "epoch": 7.128177966101695, "grad_norm": 2.6712794303894043, "learning_rate": 6.43604343220339e-06, "loss": 0.9592, "mean_token_accuracy": 0.7753179892897606, "num_tokens": 21659395.0, "step": 26916 }, { "epoch": 7.1287076271186445, "grad_norm": 2.757772207260132, "learning_rate": 6.435778601694915e-06, "loss": 1.1116, "mean_token_accuracy": 0.7274519130587578, "num_tokens": 21661763.0, "step": 26918 }, { "epoch": 7.129237288135593, "grad_norm": 2.60607647895813, "learning_rate": 6.435513771186441e-06, "loss": 1.2519, "mean_token_accuracy": 0.759469248354435, "num_tokens": 21663415.0, "step": 26920 }, { "epoch": 7.129766949152542, "grad_norm": 2.703213691711426, "learning_rate": 6.435248940677966e-06, "loss": 1.0784, "mean_token_accuracy": 0.7538226991891861, "num_tokens": 21665055.0, "step": 26922 }, { "epoch": 7.130296610169491, "grad_norm": 2.7928695678710938, "learning_rate": 6.434984110169492e-06, "loss": 1.3807, "mean_token_accuracy": 0.6775733903050423, "num_tokens": 21666610.0, "step": 26924 }, { "epoch": 7.13082627118644, "grad_norm": 2.5596790313720703, "learning_rate": 6.434719279661017e-06, "loss": 0.8841, "mean_token_accuracy": 0.7765242531895638, "num_tokens": 21668226.0, "step": 26926 }, { "epoch": 7.13135593220339, "grad_norm": 2.966953992843628, "learning_rate": 6.434454449152543e-06, "loss": 1.15, "mean_token_accuracy": 0.7419301569461823, "num_tokens": 21669670.0, "step": 26928 }, { "epoch": 7.131885593220339, "grad_norm": 2.9929897785186768, "learning_rate": 6.434189618644068e-06, "loss": 1.3111, "mean_token_accuracy": 0.6884675472974777, "num_tokens": 21671140.0, "step": 26930 }, { "epoch": 7.132415254237288, "grad_norm": 2.7269492149353027, "learning_rate": 6.433924788135594e-06, "loss": 1.0886, "mean_token_accuracy": 0.7289093807339668, "num_tokens": 21672621.0, "step": 26932 }, { "epoch": 7.132944915254237, "grad_norm": 2.8570797443389893, "learning_rate": 6.43365995762712e-06, "loss": 0.8275, "mean_token_accuracy": 0.7807460799813271, "num_tokens": 21673995.0, "step": 26934 }, { "epoch": 7.133474576271187, "grad_norm": 2.5062098503112793, "learning_rate": 6.433395127118645e-06, "loss": 1.0159, "mean_token_accuracy": 0.7710435763001442, "num_tokens": 21675312.0, "step": 26936 }, { "epoch": 7.134004237288136, "grad_norm": 2.6102049350738525, "learning_rate": 6.4331302966101704e-06, "loss": 1.1525, "mean_token_accuracy": 0.7245068401098251, "num_tokens": 21676854.0, "step": 26938 }, { "epoch": 7.134533898305085, "grad_norm": 2.4891703128814697, "learning_rate": 6.432865466101695e-06, "loss": 1.1626, "mean_token_accuracy": 0.7099948897957802, "num_tokens": 21678733.0, "step": 26940 }, { "epoch": 7.135063559322034, "grad_norm": 1.8306163549423218, "learning_rate": 6.432600635593221e-06, "loss": 1.2507, "mean_token_accuracy": 0.7189113423228264, "num_tokens": 21681043.0, "step": 26942 }, { "epoch": 7.135593220338983, "grad_norm": 2.803457021713257, "learning_rate": 6.432335805084746e-06, "loss": 0.8506, "mean_token_accuracy": 0.7761444300413132, "num_tokens": 21682683.0, "step": 26944 }, { "epoch": 7.1361228813559325, "grad_norm": 3.2006542682647705, "learning_rate": 6.432070974576272e-06, "loss": 1.3807, "mean_token_accuracy": 0.6929488033056259, "num_tokens": 21684359.0, "step": 26946 }, { "epoch": 7.1366525423728815, "grad_norm": 2.2431697845458984, "learning_rate": 6.431806144067797e-06, "loss": 1.2578, "mean_token_accuracy": 0.699488528072834, "num_tokens": 21686262.0, "step": 26948 }, { "epoch": 7.1371822033898304, "grad_norm": 2.3647611141204834, "learning_rate": 6.431541313559323e-06, "loss": 1.0103, "mean_token_accuracy": 0.7729940041899681, "num_tokens": 21687727.0, "step": 26950 }, { "epoch": 7.137711864406779, "grad_norm": 1.9809021949768066, "learning_rate": 6.4312764830508475e-06, "loss": 0.7799, "mean_token_accuracy": 0.8112128525972366, "num_tokens": 21689242.0, "step": 26952 }, { "epoch": 7.138241525423728, "grad_norm": 2.6966567039489746, "learning_rate": 6.431011652542374e-06, "loss": 1.059, "mean_token_accuracy": 0.7316638827323914, "num_tokens": 21691295.0, "step": 26954 }, { "epoch": 7.138771186440678, "grad_norm": 2.255915403366089, "learning_rate": 6.430746822033899e-06, "loss": 1.026, "mean_token_accuracy": 0.7321706414222717, "num_tokens": 21692981.0, "step": 26956 }, { "epoch": 7.139300847457627, "grad_norm": 2.3245105743408203, "learning_rate": 6.430481991525425e-06, "loss": 1.2258, "mean_token_accuracy": 0.6824742928147316, "num_tokens": 21694745.0, "step": 26958 }, { "epoch": 7.139830508474576, "grad_norm": 2.4162282943725586, "learning_rate": 6.43021716101695e-06, "loss": 1.0544, "mean_token_accuracy": 0.7511016428470612, "num_tokens": 21696205.0, "step": 26960 }, { "epoch": 7.140360169491525, "grad_norm": 2.986672878265381, "learning_rate": 6.4299523305084755e-06, "loss": 1.3289, "mean_token_accuracy": 0.7014249041676521, "num_tokens": 21697644.0, "step": 26962 }, { "epoch": 7.140889830508475, "grad_norm": 2.4208250045776367, "learning_rate": 6.4296875000000004e-06, "loss": 1.2498, "mean_token_accuracy": 0.7148179486393929, "num_tokens": 21699201.0, "step": 26964 }, { "epoch": 7.141419491525424, "grad_norm": 2.6195321083068848, "learning_rate": 6.429422669491526e-06, "loss": 1.5975, "mean_token_accuracy": 0.6398527175188065, "num_tokens": 21700714.0, "step": 26966 }, { "epoch": 7.141949152542373, "grad_norm": 2.391716241836548, "learning_rate": 6.429157838983051e-06, "loss": 0.7038, "mean_token_accuracy": 0.8079317957162857, "num_tokens": 21702010.0, "step": 26968 }, { "epoch": 7.142478813559322, "grad_norm": 3.0794010162353516, "learning_rate": 6.428893008474577e-06, "loss": 1.1097, "mean_token_accuracy": 0.7466113567352295, "num_tokens": 21703214.0, "step": 26970 }, { "epoch": 7.143008474576272, "grad_norm": 2.7762579917907715, "learning_rate": 6.428628177966102e-06, "loss": 1.276, "mean_token_accuracy": 0.6992128863930702, "num_tokens": 21704911.0, "step": 26972 }, { "epoch": 7.143538135593221, "grad_norm": 1.9828916788101196, "learning_rate": 6.428363347457628e-06, "loss": 1.4346, "mean_token_accuracy": 0.6705845445394516, "num_tokens": 21706787.0, "step": 26974 }, { "epoch": 7.1440677966101696, "grad_norm": 2.359098434448242, "learning_rate": 6.4280985169491525e-06, "loss": 1.2878, "mean_token_accuracy": 0.7127295508980751, "num_tokens": 21708513.0, "step": 26976 }, { "epoch": 7.1445974576271185, "grad_norm": 1.7001376152038574, "learning_rate": 6.427833686440678e-06, "loss": 0.958, "mean_token_accuracy": 0.7751760259270668, "num_tokens": 21710549.0, "step": 26978 }, { "epoch": 7.1451271186440675, "grad_norm": 3.1154539585113525, "learning_rate": 6.427568855932203e-06, "loss": 1.136, "mean_token_accuracy": 0.7380329892039299, "num_tokens": 21712026.0, "step": 26980 }, { "epoch": 7.145656779661017, "grad_norm": 2.7886714935302734, "learning_rate": 6.42730402542373e-06, "loss": 1.5832, "mean_token_accuracy": 0.6527065560221672, "num_tokens": 21713678.0, "step": 26982 }, { "epoch": 7.146186440677966, "grad_norm": 2.53292179107666, "learning_rate": 6.427039194915255e-06, "loss": 1.0043, "mean_token_accuracy": 0.7431929334998131, "num_tokens": 21715189.0, "step": 26984 }, { "epoch": 7.146716101694915, "grad_norm": 2.8095924854278564, "learning_rate": 6.426774364406781e-06, "loss": 1.4161, "mean_token_accuracy": 0.6960565075278282, "num_tokens": 21716789.0, "step": 26986 }, { "epoch": 7.147245762711864, "grad_norm": 2.0826776027679443, "learning_rate": 6.4265095338983055e-06, "loss": 0.7784, "mean_token_accuracy": 0.7899801433086395, "num_tokens": 21718218.0, "step": 26988 }, { "epoch": 7.147775423728813, "grad_norm": 3.034621477127075, "learning_rate": 6.426244703389831e-06, "loss": 1.3627, "mean_token_accuracy": 0.6969192177057266, "num_tokens": 21719749.0, "step": 26990 }, { "epoch": 7.148305084745763, "grad_norm": 2.7389628887176514, "learning_rate": 6.425979872881356e-06, "loss": 1.2234, "mean_token_accuracy": 0.7104431390762329, "num_tokens": 21721387.0, "step": 26992 }, { "epoch": 7.148834745762712, "grad_norm": 3.024909734725952, "learning_rate": 6.425715042372882e-06, "loss": 0.888, "mean_token_accuracy": 0.7903749570250511, "num_tokens": 21722739.0, "step": 26994 }, { "epoch": 7.149364406779661, "grad_norm": 3.0980300903320312, "learning_rate": 6.425450211864407e-06, "loss": 1.1419, "mean_token_accuracy": 0.7100983709096909, "num_tokens": 21724468.0, "step": 26996 }, { "epoch": 7.14989406779661, "grad_norm": 2.64410138130188, "learning_rate": 6.425185381355933e-06, "loss": 0.866, "mean_token_accuracy": 0.7486663460731506, "num_tokens": 21726264.0, "step": 26998 }, { "epoch": 7.15042372881356, "grad_norm": 2.4235963821411133, "learning_rate": 6.424920550847458e-06, "loss": 1.5311, "step": 27000 }, { "epoch": 7.15042372881356, "eval_loss": 1.3367531299591064, "eval_mean_token_accuracy": 0.698723305258658, "eval_num_tokens": 21728063.0, "eval_runtime": 48.7795, "eval_samples_per_second": 6.314, "eval_steps_per_second": 6.314, "step": 27000 }, { "epoch": 7.150953389830509, "grad_norm": 2.183781385421753, "learning_rate": 6.424655720338983e-06, "loss": 0.9669, "mean_token_accuracy": 0.7203770019114017, "num_tokens": 21729403.0, "step": 27002 }, { "epoch": 7.151483050847458, "grad_norm": 2.7679600715637207, "learning_rate": 6.424390889830508e-06, "loss": 0.9856, "mean_token_accuracy": 0.7673221752047539, "num_tokens": 21730970.0, "step": 27004 }, { "epoch": 7.152012711864407, "grad_norm": 3.0859363079071045, "learning_rate": 6.424126059322034e-06, "loss": 1.5317, "mean_token_accuracy": 0.6454431861639023, "num_tokens": 21732378.0, "step": 27006 }, { "epoch": 7.1525423728813555, "grad_norm": 2.7085959911346436, "learning_rate": 6.423861228813559e-06, "loss": 1.0937, "mean_token_accuracy": 0.7420622259378433, "num_tokens": 21733858.0, "step": 27008 }, { "epoch": 7.153072033898305, "grad_norm": 2.7325615882873535, "learning_rate": 6.423596398305086e-06, "loss": 1.0469, "mean_token_accuracy": 0.7317560613155365, "num_tokens": 21735437.0, "step": 27010 }, { "epoch": 7.153601694915254, "grad_norm": 2.5425617694854736, "learning_rate": 6.42333156779661e-06, "loss": 0.7559, "mean_token_accuracy": 0.7929704338312149, "num_tokens": 21736884.0, "step": 27012 }, { "epoch": 7.154131355932203, "grad_norm": 2.741541624069214, "learning_rate": 6.423066737288136e-06, "loss": 1.3419, "mean_token_accuracy": 0.6613789200782776, "num_tokens": 21738635.0, "step": 27014 }, { "epoch": 7.154661016949152, "grad_norm": 2.450378656387329, "learning_rate": 6.422801906779662e-06, "loss": 1.0459, "mean_token_accuracy": 0.7658979296684265, "num_tokens": 21740540.0, "step": 27016 }, { "epoch": 7.155190677966102, "grad_norm": 2.4980785846710205, "learning_rate": 6.422537076271187e-06, "loss": 1.2325, "mean_token_accuracy": 0.7113139294087887, "num_tokens": 21742205.0, "step": 27018 }, { "epoch": 7.155720338983051, "grad_norm": 2.689133882522583, "learning_rate": 6.422272245762713e-06, "loss": 0.9724, "mean_token_accuracy": 0.7718763053417206, "num_tokens": 21743726.0, "step": 27020 }, { "epoch": 7.15625, "grad_norm": 2.6121439933776855, "learning_rate": 6.422007415254238e-06, "loss": 1.5447, "mean_token_accuracy": 0.6781083308160305, "num_tokens": 21745388.0, "step": 27022 }, { "epoch": 7.156779661016949, "grad_norm": 3.1022698879241943, "learning_rate": 6.4217425847457636e-06, "loss": 1.1438, "mean_token_accuracy": 0.7237382531166077, "num_tokens": 21746932.0, "step": 27024 }, { "epoch": 7.157309322033898, "grad_norm": 2.4370274543762207, "learning_rate": 6.4214777542372885e-06, "loss": 0.9478, "mean_token_accuracy": 0.76535864174366, "num_tokens": 21748559.0, "step": 27026 }, { "epoch": 7.157838983050848, "grad_norm": 3.4402973651885986, "learning_rate": 6.421212923728814e-06, "loss": 1.2092, "mean_token_accuracy": 0.7102608978748322, "num_tokens": 21749942.0, "step": 27028 }, { "epoch": 7.158368644067797, "grad_norm": 2.3955674171447754, "learning_rate": 6.420948093220339e-06, "loss": 1.4008, "mean_token_accuracy": 0.7032748684287071, "num_tokens": 21751706.0, "step": 27030 }, { "epoch": 7.158898305084746, "grad_norm": 2.6933035850524902, "learning_rate": 6.420683262711865e-06, "loss": 1.2442, "mean_token_accuracy": 0.7149692997336388, "num_tokens": 21753146.0, "step": 27032 }, { "epoch": 7.159427966101695, "grad_norm": 2.6182045936584473, "learning_rate": 6.42041843220339e-06, "loss": 0.9397, "mean_token_accuracy": 0.764859639108181, "num_tokens": 21754887.0, "step": 27034 }, { "epoch": 7.1599576271186445, "grad_norm": 2.643260955810547, "learning_rate": 6.4201536016949165e-06, "loss": 1.022, "mean_token_accuracy": 0.7480692341923714, "num_tokens": 21756528.0, "step": 27036 }, { "epoch": 7.160487288135593, "grad_norm": 2.6308274269104004, "learning_rate": 6.4198887711864414e-06, "loss": 0.9739, "mean_token_accuracy": 0.76594378054142, "num_tokens": 21757995.0, "step": 27038 }, { "epoch": 7.161016949152542, "grad_norm": 2.0729241371154785, "learning_rate": 6.419623940677967e-06, "loss": 0.5945, "mean_token_accuracy": 0.8450099006295204, "num_tokens": 21759558.0, "step": 27040 }, { "epoch": 7.161546610169491, "grad_norm": 1.9714785814285278, "learning_rate": 6.419359110169492e-06, "loss": 0.6561, "mean_token_accuracy": 0.8355193585157394, "num_tokens": 21761298.0, "step": 27042 }, { "epoch": 7.16207627118644, "grad_norm": 2.3038761615753174, "learning_rate": 6.419094279661018e-06, "loss": 1.1614, "mean_token_accuracy": 0.719777874648571, "num_tokens": 21762864.0, "step": 27044 }, { "epoch": 7.16260593220339, "grad_norm": 2.601289749145508, "learning_rate": 6.418829449152543e-06, "loss": 0.9, "mean_token_accuracy": 0.8070802465081215, "num_tokens": 21764336.0, "step": 27046 }, { "epoch": 7.163135593220339, "grad_norm": 2.0262932777404785, "learning_rate": 6.418564618644069e-06, "loss": 1.1372, "mean_token_accuracy": 0.7380828931927681, "num_tokens": 21765951.0, "step": 27048 }, { "epoch": 7.163665254237288, "grad_norm": 2.1125197410583496, "learning_rate": 6.4182997881355936e-06, "loss": 0.6776, "mean_token_accuracy": 0.8205604329705238, "num_tokens": 21767525.0, "step": 27050 }, { "epoch": 7.164194915254237, "grad_norm": 2.484703779220581, "learning_rate": 6.418034957627119e-06, "loss": 1.3732, "mean_token_accuracy": 0.6982721835374832, "num_tokens": 21769083.0, "step": 27052 }, { "epoch": 7.164724576271187, "grad_norm": 3.030988931655884, "learning_rate": 6.417770127118644e-06, "loss": 1.3318, "mean_token_accuracy": 0.6879483610391617, "num_tokens": 21770905.0, "step": 27054 }, { "epoch": 7.165254237288136, "grad_norm": 2.57635235786438, "learning_rate": 6.41750529661017e-06, "loss": 1.3242, "mean_token_accuracy": 0.6894590854644775, "num_tokens": 21772287.0, "step": 27056 }, { "epoch": 7.165783898305085, "grad_norm": 2.5694243907928467, "learning_rate": 6.417240466101695e-06, "loss": 0.9294, "mean_token_accuracy": 0.7718349471688271, "num_tokens": 21773781.0, "step": 27058 }, { "epoch": 7.166313559322034, "grad_norm": 2.3255844116210938, "learning_rate": 6.416975635593221e-06, "loss": 1.1403, "mean_token_accuracy": 0.739667221903801, "num_tokens": 21775321.0, "step": 27060 }, { "epoch": 7.166843220338983, "grad_norm": 2.814676523208618, "learning_rate": 6.416710805084746e-06, "loss": 1.4468, "mean_token_accuracy": 0.667605847120285, "num_tokens": 21776956.0, "step": 27062 }, { "epoch": 7.1673728813559325, "grad_norm": 2.7774338722229004, "learning_rate": 6.416445974576272e-06, "loss": 1.2182, "mean_token_accuracy": 0.7257654592394829, "num_tokens": 21778597.0, "step": 27064 }, { "epoch": 7.1679025423728815, "grad_norm": 2.4867641925811768, "learning_rate": 6.416181144067796e-06, "loss": 0.8358, "mean_token_accuracy": 0.7839904427528381, "num_tokens": 21780383.0, "step": 27066 }, { "epoch": 7.1684322033898304, "grad_norm": 2.7767152786254883, "learning_rate": 6.415916313559323e-06, "loss": 1.069, "mean_token_accuracy": 0.7382025644183159, "num_tokens": 21781838.0, "step": 27068 }, { "epoch": 7.168961864406779, "grad_norm": 2.62013840675354, "learning_rate": 6.415651483050848e-06, "loss": 1.0733, "mean_token_accuracy": 0.7564990147948265, "num_tokens": 21783542.0, "step": 27070 }, { "epoch": 7.169491525423728, "grad_norm": 2.219937324523926, "learning_rate": 6.415386652542374e-06, "loss": 0.5202, "mean_token_accuracy": 0.8575832843780518, "num_tokens": 21784884.0, "step": 27072 }, { "epoch": 7.170021186440678, "grad_norm": 2.3907577991485596, "learning_rate": 6.415121822033899e-06, "loss": 1.1577, "mean_token_accuracy": 0.7371522933244705, "num_tokens": 21786522.0, "step": 27074 }, { "epoch": 7.170550847457627, "grad_norm": 2.317460775375366, "learning_rate": 6.414856991525424e-06, "loss": 0.9374, "mean_token_accuracy": 0.7757615223526955, "num_tokens": 21788134.0, "step": 27076 }, { "epoch": 7.171080508474576, "grad_norm": 3.1635496616363525, "learning_rate": 6.414592161016949e-06, "loss": 1.5621, "mean_token_accuracy": 0.6786289811134338, "num_tokens": 21789751.0, "step": 27078 }, { "epoch": 7.171610169491525, "grad_norm": 2.535982131958008, "learning_rate": 6.414327330508475e-06, "loss": 1.2382, "mean_token_accuracy": 0.729063130915165, "num_tokens": 21791198.0, "step": 27080 }, { "epoch": 7.172139830508475, "grad_norm": 2.4317781925201416, "learning_rate": 6.4140625e-06, "loss": 1.2346, "mean_token_accuracy": 0.738120011985302, "num_tokens": 21792724.0, "step": 27082 }, { "epoch": 7.172669491525424, "grad_norm": 2.716890573501587, "learning_rate": 6.413797669491526e-06, "loss": 1.154, "mean_token_accuracy": 0.725717194378376, "num_tokens": 21794154.0, "step": 27084 }, { "epoch": 7.173199152542373, "grad_norm": 2.7867465019226074, "learning_rate": 6.413532838983051e-06, "loss": 1.1908, "mean_token_accuracy": 0.7142956331372261, "num_tokens": 21795705.0, "step": 27086 }, { "epoch": 7.173728813559322, "grad_norm": 4.010892391204834, "learning_rate": 6.4132680084745765e-06, "loss": 1.4343, "mean_token_accuracy": 0.6680134683847427, "num_tokens": 21797369.0, "step": 27088 }, { "epoch": 7.174258474576272, "grad_norm": 2.322671413421631, "learning_rate": 6.4130031779661015e-06, "loss": 0.8935, "mean_token_accuracy": 0.7836401462554932, "num_tokens": 21798989.0, "step": 27090 }, { "epoch": 7.174788135593221, "grad_norm": 2.389401912689209, "learning_rate": 6.412738347457628e-06, "loss": 1.1819, "mean_token_accuracy": 0.713996522128582, "num_tokens": 21800771.0, "step": 27092 }, { "epoch": 7.1753177966101696, "grad_norm": 2.1094961166381836, "learning_rate": 6.412473516949152e-06, "loss": 0.7806, "mean_token_accuracy": 0.7896308451890945, "num_tokens": 21802137.0, "step": 27094 }, { "epoch": 7.1758474576271185, "grad_norm": 3.1058361530303955, "learning_rate": 6.412208686440679e-06, "loss": 1.3754, "mean_token_accuracy": 0.6820389777421951, "num_tokens": 21803685.0, "step": 27096 }, { "epoch": 7.1763771186440675, "grad_norm": 2.613172769546509, "learning_rate": 6.411943855932204e-06, "loss": 1.0779, "mean_token_accuracy": 0.7604336887598038, "num_tokens": 21805440.0, "step": 27098 }, { "epoch": 7.176906779661017, "grad_norm": 3.7727954387664795, "learning_rate": 6.4116790254237295e-06, "loss": 1.3208, "mean_token_accuracy": 0.7489602342247963, "num_tokens": 21807641.0, "step": 27100 }, { "epoch": 7.177436440677966, "grad_norm": 2.2682721614837646, "learning_rate": 6.411414194915255e-06, "loss": 1.3209, "mean_token_accuracy": 0.6782781928777695, "num_tokens": 21809575.0, "step": 27102 }, { "epoch": 7.177966101694915, "grad_norm": 2.3078737258911133, "learning_rate": 6.41114936440678e-06, "loss": 1.1372, "mean_token_accuracy": 0.7573037222027779, "num_tokens": 21811183.0, "step": 27104 }, { "epoch": 7.178495762711864, "grad_norm": 2.3358240127563477, "learning_rate": 6.410884533898306e-06, "loss": 1.1699, "mean_token_accuracy": 0.7410279884934425, "num_tokens": 21812777.0, "step": 27106 }, { "epoch": 7.179025423728813, "grad_norm": 2.447533130645752, "learning_rate": 6.410619703389831e-06, "loss": 0.9527, "mean_token_accuracy": 0.7709366604685783, "num_tokens": 21814285.0, "step": 27108 }, { "epoch": 7.179555084745763, "grad_norm": 2.2295217514038086, "learning_rate": 6.410354872881357e-06, "loss": 1.0664, "mean_token_accuracy": 0.7244456931948662, "num_tokens": 21815900.0, "step": 27110 }, { "epoch": 7.180084745762712, "grad_norm": 2.3058931827545166, "learning_rate": 6.410090042372882e-06, "loss": 0.6667, "mean_token_accuracy": 0.8363568186759949, "num_tokens": 21817418.0, "step": 27112 }, { "epoch": 7.180614406779661, "grad_norm": 2.6528851985931396, "learning_rate": 6.409825211864407e-06, "loss": 1.3863, "mean_token_accuracy": 0.7143978700041771, "num_tokens": 21818873.0, "step": 27114 }, { "epoch": 7.18114406779661, "grad_norm": 2.553372383117676, "learning_rate": 6.409560381355932e-06, "loss": 1.5729, "mean_token_accuracy": 0.6696544289588928, "num_tokens": 21820480.0, "step": 27116 }, { "epoch": 7.18167372881356, "grad_norm": 2.3773481845855713, "learning_rate": 6.409295550847459e-06, "loss": 1.016, "mean_token_accuracy": 0.7526136189699173, "num_tokens": 21822529.0, "step": 27118 }, { "epoch": 7.182203389830509, "grad_norm": 2.9027018547058105, "learning_rate": 6.409030720338983e-06, "loss": 1.3028, "mean_token_accuracy": 0.6922737210988998, "num_tokens": 21824178.0, "step": 27120 }, { "epoch": 7.182733050847458, "grad_norm": 2.7739758491516113, "learning_rate": 6.40876588983051e-06, "loss": 0.9193, "mean_token_accuracy": 0.7857348099350929, "num_tokens": 21825738.0, "step": 27122 }, { "epoch": 7.183262711864407, "grad_norm": 3.1036183834075928, "learning_rate": 6.4085010593220346e-06, "loss": 1.5981, "mean_token_accuracy": 0.6512212492525578, "num_tokens": 21827305.0, "step": 27124 }, { "epoch": 7.1837923728813555, "grad_norm": 2.580254554748535, "learning_rate": 6.40823622881356e-06, "loss": 1.1987, "mean_token_accuracy": 0.6987587437033653, "num_tokens": 21829073.0, "step": 27126 }, { "epoch": 7.184322033898305, "grad_norm": 2.9052927494049072, "learning_rate": 6.407971398305085e-06, "loss": 1.0777, "mean_token_accuracy": 0.7332861870527267, "num_tokens": 21830798.0, "step": 27128 }, { "epoch": 7.184851694915254, "grad_norm": 2.2031872272491455, "learning_rate": 6.407706567796611e-06, "loss": 1.1983, "mean_token_accuracy": 0.7364377304911613, "num_tokens": 21832604.0, "step": 27130 }, { "epoch": 7.185381355932203, "grad_norm": 1.8501193523406982, "learning_rate": 6.407441737288136e-06, "loss": 0.871, "mean_token_accuracy": 0.7526988163590431, "num_tokens": 21834398.0, "step": 27132 }, { "epoch": 7.185911016949152, "grad_norm": 2.287912368774414, "learning_rate": 6.407176906779662e-06, "loss": 0.7877, "mean_token_accuracy": 0.7892578318715096, "num_tokens": 21836059.0, "step": 27134 }, { "epoch": 7.186440677966102, "grad_norm": 2.965075969696045, "learning_rate": 6.406912076271187e-06, "loss": 1.1314, "mean_token_accuracy": 0.745550312101841, "num_tokens": 21837376.0, "step": 27136 }, { "epoch": 7.186970338983051, "grad_norm": 2.743853807449341, "learning_rate": 6.4066472457627125e-06, "loss": 1.046, "mean_token_accuracy": 0.7421795651316643, "num_tokens": 21838970.0, "step": 27138 }, { "epoch": 7.1875, "grad_norm": 3.4923179149627686, "learning_rate": 6.406382415254237e-06, "loss": 1.1094, "mean_token_accuracy": 0.7155390456318855, "num_tokens": 21840440.0, "step": 27140 }, { "epoch": 7.188029661016949, "grad_norm": 3.590294599533081, "learning_rate": 6.406117584745763e-06, "loss": 0.9497, "mean_token_accuracy": 0.7653177380561829, "num_tokens": 21842049.0, "step": 27142 }, { "epoch": 7.188559322033898, "grad_norm": 2.2900314331054688, "learning_rate": 6.405852754237288e-06, "loss": 1.3813, "mean_token_accuracy": 0.7014643028378487, "num_tokens": 21843728.0, "step": 27144 }, { "epoch": 7.189088983050848, "grad_norm": 2.5440938472747803, "learning_rate": 6.405587923728815e-06, "loss": 0.8825, "mean_token_accuracy": 0.7773147374391556, "num_tokens": 21845356.0, "step": 27146 }, { "epoch": 7.189618644067797, "grad_norm": 2.1855735778808594, "learning_rate": 6.405323093220339e-06, "loss": 1.2635, "mean_token_accuracy": 0.6774393022060394, "num_tokens": 21847406.0, "step": 27148 }, { "epoch": 7.190148305084746, "grad_norm": 3.0617940425872803, "learning_rate": 6.4050582627118654e-06, "loss": 1.3473, "mean_token_accuracy": 0.684286318719387, "num_tokens": 21848928.0, "step": 27150 }, { "epoch": 7.190677966101695, "grad_norm": 2.6591200828552246, "learning_rate": 6.40479343220339e-06, "loss": 1.3465, "mean_token_accuracy": 0.6909897401928902, "num_tokens": 21850470.0, "step": 27152 }, { "epoch": 7.1912076271186445, "grad_norm": 3.0096988677978516, "learning_rate": 6.404528601694916e-06, "loss": 1.4004, "mean_token_accuracy": 0.6919109001755714, "num_tokens": 21851856.0, "step": 27154 }, { "epoch": 7.191737288135593, "grad_norm": 2.6526103019714355, "learning_rate": 6.404263771186441e-06, "loss": 0.9969, "mean_token_accuracy": 0.7629199624061584, "num_tokens": 21853252.0, "step": 27156 }, { "epoch": 7.192266949152542, "grad_norm": 2.3199350833892822, "learning_rate": 6.403998940677967e-06, "loss": 1.1912, "mean_token_accuracy": 0.71560388058424, "num_tokens": 21855035.0, "step": 27158 }, { "epoch": 7.192796610169491, "grad_norm": 2.765427350997925, "learning_rate": 6.403734110169492e-06, "loss": 1.2816, "mean_token_accuracy": 0.7220919132232666, "num_tokens": 21856548.0, "step": 27160 }, { "epoch": 7.19332627118644, "grad_norm": 2.917285680770874, "learning_rate": 6.4034692796610175e-06, "loss": 1.235, "mean_token_accuracy": 0.7305081337690353, "num_tokens": 21858072.0, "step": 27162 }, { "epoch": 7.19385593220339, "grad_norm": 2.421613931655884, "learning_rate": 6.4032044491525425e-06, "loss": 1.3356, "mean_token_accuracy": 0.7128896117210388, "num_tokens": 21859648.0, "step": 27164 }, { "epoch": 7.194385593220339, "grad_norm": 2.650566816329956, "learning_rate": 6.402939618644068e-06, "loss": 1.1026, "mean_token_accuracy": 0.7409807816147804, "num_tokens": 21861156.0, "step": 27166 }, { "epoch": 7.194915254237288, "grad_norm": 3.0877697467803955, "learning_rate": 6.402674788135593e-06, "loss": 1.2903, "mean_token_accuracy": 0.7181911021471024, "num_tokens": 21862429.0, "step": 27168 }, { "epoch": 7.195444915254237, "grad_norm": 2.195875644683838, "learning_rate": 6.402409957627119e-06, "loss": 1.1453, "mean_token_accuracy": 0.7225984707474709, "num_tokens": 21864085.0, "step": 27170 }, { "epoch": 7.195974576271187, "grad_norm": 2.7959718704223633, "learning_rate": 6.402145127118644e-06, "loss": 0.9114, "mean_token_accuracy": 0.7961595430970192, "num_tokens": 21865878.0, "step": 27172 }, { "epoch": 7.196504237288136, "grad_norm": 2.1578800678253174, "learning_rate": 6.40188029661017e-06, "loss": 0.8365, "mean_token_accuracy": 0.8140619844198227, "num_tokens": 21867321.0, "step": 27174 }, { "epoch": 7.197033898305085, "grad_norm": 3.592985153198242, "learning_rate": 6.401615466101695e-06, "loss": 1.376, "mean_token_accuracy": 0.6982356235384941, "num_tokens": 21868812.0, "step": 27176 }, { "epoch": 7.197563559322034, "grad_norm": 2.65091872215271, "learning_rate": 6.401350635593221e-06, "loss": 1.0973, "mean_token_accuracy": 0.7376868426799774, "num_tokens": 21870281.0, "step": 27178 }, { "epoch": 7.198093220338983, "grad_norm": 2.3893096446990967, "learning_rate": 6.401085805084746e-06, "loss": 1.0673, "mean_token_accuracy": 0.7606558129191399, "num_tokens": 21871838.0, "step": 27180 }, { "epoch": 7.1986228813559325, "grad_norm": 2.9326541423797607, "learning_rate": 6.400820974576272e-06, "loss": 1.3953, "mean_token_accuracy": 0.6991091296076775, "num_tokens": 21873262.0, "step": 27182 }, { "epoch": 7.1991525423728815, "grad_norm": 3.115945816040039, "learning_rate": 6.400556144067798e-06, "loss": 1.4402, "mean_token_accuracy": 0.6823773086071014, "num_tokens": 21874876.0, "step": 27184 }, { "epoch": 7.1996822033898304, "grad_norm": 2.527514696121216, "learning_rate": 6.400291313559323e-06, "loss": 1.0715, "mean_token_accuracy": 0.7329717352986336, "num_tokens": 21876420.0, "step": 27186 }, { "epoch": 7.200211864406779, "grad_norm": 2.689671754837036, "learning_rate": 6.400026483050848e-06, "loss": 1.1209, "mean_token_accuracy": 0.7342936620116234, "num_tokens": 21878149.0, "step": 27188 }, { "epoch": 7.200741525423728, "grad_norm": 3.123302698135376, "learning_rate": 6.399761652542373e-06, "loss": 0.7157, "mean_token_accuracy": 0.8035194128751755, "num_tokens": 21879680.0, "step": 27190 }, { "epoch": 7.201271186440678, "grad_norm": 2.4031713008880615, "learning_rate": 6.399496822033899e-06, "loss": 1.016, "mean_token_accuracy": 0.7781406342983246, "num_tokens": 21881327.0, "step": 27192 }, { "epoch": 7.201800847457627, "grad_norm": 2.536215305328369, "learning_rate": 6.399231991525424e-06, "loss": 1.2724, "mean_token_accuracy": 0.6901653483510017, "num_tokens": 21883021.0, "step": 27194 }, { "epoch": 7.202330508474576, "grad_norm": 2.9277706146240234, "learning_rate": 6.39896716101695e-06, "loss": 1.1679, "mean_token_accuracy": 0.7379577308893204, "num_tokens": 21884457.0, "step": 27196 }, { "epoch": 7.202860169491525, "grad_norm": 2.0520131587982178, "learning_rate": 6.398702330508475e-06, "loss": 0.675, "mean_token_accuracy": 0.8169446736574173, "num_tokens": 21886301.0, "step": 27198 }, { "epoch": 7.203389830508475, "grad_norm": 2.573885202407837, "learning_rate": 6.398437500000001e-06, "loss": 0.652, "mean_token_accuracy": 0.8231245055794716, "num_tokens": 21887469.0, "step": 27200 }, { "epoch": 7.203919491525424, "grad_norm": 1.9630508422851562, "learning_rate": 6.3981726694915254e-06, "loss": 0.9716, "mean_token_accuracy": 0.765317477285862, "num_tokens": 21889155.0, "step": 27202 }, { "epoch": 7.204449152542373, "grad_norm": 2.382606267929077, "learning_rate": 6.397907838983052e-06, "loss": 1.1545, "mean_token_accuracy": 0.7618793472647667, "num_tokens": 21890638.0, "step": 27204 }, { "epoch": 7.204978813559322, "grad_norm": 2.935720682144165, "learning_rate": 6.397643008474577e-06, "loss": 1.2083, "mean_token_accuracy": 0.7191147655248642, "num_tokens": 21892210.0, "step": 27206 }, { "epoch": 7.205508474576272, "grad_norm": 3.188572645187378, "learning_rate": 6.397378177966103e-06, "loss": 1.0387, "mean_token_accuracy": 0.7290885001420975, "num_tokens": 21893781.0, "step": 27208 }, { "epoch": 7.206038135593221, "grad_norm": 3.2248873710632324, "learning_rate": 6.397113347457628e-06, "loss": 1.4221, "mean_token_accuracy": 0.7065721526741982, "num_tokens": 21895194.0, "step": 27210 }, { "epoch": 7.2065677966101696, "grad_norm": 3.1589488983154297, "learning_rate": 6.3968485169491535e-06, "loss": 1.3761, "mean_token_accuracy": 0.6740253120660782, "num_tokens": 21896914.0, "step": 27212 }, { "epoch": 7.2070974576271185, "grad_norm": 2.7563090324401855, "learning_rate": 6.396583686440678e-06, "loss": 1.2706, "mean_token_accuracy": 0.7258571535348892, "num_tokens": 21898349.0, "step": 27214 }, { "epoch": 7.2076271186440675, "grad_norm": 3.575674057006836, "learning_rate": 6.396318855932204e-06, "loss": 1.6043, "mean_token_accuracy": 0.6371822729706764, "num_tokens": 21899721.0, "step": 27216 }, { "epoch": 7.208156779661017, "grad_norm": 2.44916033744812, "learning_rate": 6.396054025423729e-06, "loss": 1.3254, "mean_token_accuracy": 0.6965440809726715, "num_tokens": 21901137.0, "step": 27218 }, { "epoch": 7.208686440677966, "grad_norm": 2.870940923690796, "learning_rate": 6.395789194915255e-06, "loss": 1.2138, "mean_token_accuracy": 0.722022071480751, "num_tokens": 21902581.0, "step": 27220 }, { "epoch": 7.209216101694915, "grad_norm": 2.2132925987243652, "learning_rate": 6.39552436440678e-06, "loss": 1.2539, "mean_token_accuracy": 0.7037374079227448, "num_tokens": 21904218.0, "step": 27222 }, { "epoch": 7.209745762711864, "grad_norm": 2.447340250015259, "learning_rate": 6.395259533898306e-06, "loss": 0.7669, "mean_token_accuracy": 0.8129378855228424, "num_tokens": 21905792.0, "step": 27224 }, { "epoch": 7.210275423728813, "grad_norm": 3.1647398471832275, "learning_rate": 6.3949947033898305e-06, "loss": 0.8794, "mean_token_accuracy": 0.7635785192251205, "num_tokens": 21908003.0, "step": 27226 }, { "epoch": 7.210805084745763, "grad_norm": 2.7520108222961426, "learning_rate": 6.394729872881356e-06, "loss": 0.9949, "mean_token_accuracy": 0.7705682143568993, "num_tokens": 21909838.0, "step": 27228 }, { "epoch": 7.211334745762712, "grad_norm": 2.3796157836914062, "learning_rate": 6.394465042372881e-06, "loss": 1.0056, "mean_token_accuracy": 0.7716137617826462, "num_tokens": 21911643.0, "step": 27230 }, { "epoch": 7.211864406779661, "grad_norm": 2.3154568672180176, "learning_rate": 6.394200211864408e-06, "loss": 1.307, "mean_token_accuracy": 0.7001505419611931, "num_tokens": 21913058.0, "step": 27232 }, { "epoch": 7.21239406779661, "grad_norm": 3.061455726623535, "learning_rate": 6.393935381355933e-06, "loss": 1.1941, "mean_token_accuracy": 0.7192520573735237, "num_tokens": 21914366.0, "step": 27234 }, { "epoch": 7.21292372881356, "grad_norm": 2.7673535346984863, "learning_rate": 6.3936705508474586e-06, "loss": 1.6079, "mean_token_accuracy": 0.6439777463674545, "num_tokens": 21916042.0, "step": 27236 }, { "epoch": 7.213453389830509, "grad_norm": 2.736969470977783, "learning_rate": 6.3934057203389835e-06, "loss": 1.2235, "mean_token_accuracy": 0.7078733444213867, "num_tokens": 21917881.0, "step": 27238 }, { "epoch": 7.213983050847458, "grad_norm": 2.2912707328796387, "learning_rate": 6.393140889830509e-06, "loss": 1.1749, "mean_token_accuracy": 0.7155566066503525, "num_tokens": 21919572.0, "step": 27240 }, { "epoch": 7.214512711864407, "grad_norm": 2.9967286586761475, "learning_rate": 6.392876059322034e-06, "loss": 0.8639, "mean_token_accuracy": 0.7848344892263412, "num_tokens": 21920958.0, "step": 27242 }, { "epoch": 7.2150423728813555, "grad_norm": 2.610651969909668, "learning_rate": 6.39261122881356e-06, "loss": 1.2259, "mean_token_accuracy": 0.7223586291074753, "num_tokens": 21922708.0, "step": 27244 }, { "epoch": 7.215572033898305, "grad_norm": 2.637481212615967, "learning_rate": 6.392346398305085e-06, "loss": 1.2879, "mean_token_accuracy": 0.6810561195015907, "num_tokens": 21924267.0, "step": 27246 }, { "epoch": 7.216101694915254, "grad_norm": 2.985752820968628, "learning_rate": 6.392081567796611e-06, "loss": 1.3401, "mean_token_accuracy": 0.6858273446559906, "num_tokens": 21926036.0, "step": 27248 }, { "epoch": 7.216631355932203, "grad_norm": 2.8577775955200195, "learning_rate": 6.391816737288136e-06, "loss": 1.148, "step": 27250 }, { "epoch": 7.216631355932203, "eval_loss": 1.3346238136291504, "eval_mean_token_accuracy": 0.6984671631029674, "eval_num_tokens": 21927609.0, "eval_runtime": 48.1498, "eval_samples_per_second": 6.397, "eval_steps_per_second": 6.397, "step": 27250 }, { "epoch": 7.217161016949152, "grad_norm": 2.365236282348633, "learning_rate": 6.391551906779661e-06, "loss": 1.3718, "mean_token_accuracy": 0.7251349687576294, "num_tokens": 21929316.0, "step": 27252 }, { "epoch": 7.217690677966102, "grad_norm": 2.785860776901245, "learning_rate": 6.391287076271186e-06, "loss": 1.2376, "mean_token_accuracy": 0.7216351553797722, "num_tokens": 21930912.0, "step": 27254 }, { "epoch": 7.218220338983051, "grad_norm": 3.294689655303955, "learning_rate": 6.391022245762712e-06, "loss": 1.4674, "mean_token_accuracy": 0.6701751537621021, "num_tokens": 21932444.0, "step": 27256 }, { "epoch": 7.21875, "grad_norm": 2.568087339401245, "learning_rate": 6.390757415254237e-06, "loss": 0.9843, "mean_token_accuracy": 0.7425954714417458, "num_tokens": 21934222.0, "step": 27258 }, { "epoch": 7.219279661016949, "grad_norm": 2.9463276863098145, "learning_rate": 6.390492584745764e-06, "loss": 1.1021, "mean_token_accuracy": 0.7461869791150093, "num_tokens": 21935754.0, "step": 27260 }, { "epoch": 7.219809322033898, "grad_norm": 2.456331253051758, "learning_rate": 6.390227754237288e-06, "loss": 0.8507, "mean_token_accuracy": 0.7799910828471184, "num_tokens": 21937254.0, "step": 27262 }, { "epoch": 7.220338983050848, "grad_norm": 2.448978900909424, "learning_rate": 6.389962923728814e-06, "loss": 0.9308, "mean_token_accuracy": 0.7739629149436951, "num_tokens": 21938883.0, "step": 27264 }, { "epoch": 7.220868644067797, "grad_norm": 3.491290807723999, "learning_rate": 6.389698093220339e-06, "loss": 1.4944, "mean_token_accuracy": 0.6534613445401192, "num_tokens": 21940361.0, "step": 27266 }, { "epoch": 7.221398305084746, "grad_norm": 2.3113980293273926, "learning_rate": 6.389433262711865e-06, "loss": 1.2922, "mean_token_accuracy": 0.6920685172080994, "num_tokens": 21941820.0, "step": 27268 }, { "epoch": 7.221927966101695, "grad_norm": 2.494785785675049, "learning_rate": 6.389168432203391e-06, "loss": 1.0196, "mean_token_accuracy": 0.7518786862492561, "num_tokens": 21943322.0, "step": 27270 }, { "epoch": 7.2224576271186445, "grad_norm": 2.9096620082855225, "learning_rate": 6.388903601694916e-06, "loss": 1.3216, "mean_token_accuracy": 0.6978775560855865, "num_tokens": 21944941.0, "step": 27272 }, { "epoch": 7.222987288135593, "grad_norm": 3.642237901687622, "learning_rate": 6.3886387711864415e-06, "loss": 1.3481, "mean_token_accuracy": 0.6941733881831169, "num_tokens": 21946559.0, "step": 27274 }, { "epoch": 7.223516949152542, "grad_norm": 2.8667867183685303, "learning_rate": 6.3883739406779665e-06, "loss": 1.3073, "mean_token_accuracy": 0.7056198343634605, "num_tokens": 21948060.0, "step": 27276 }, { "epoch": 7.224046610169491, "grad_norm": 3.16805362701416, "learning_rate": 6.388109110169492e-06, "loss": 1.4688, "mean_token_accuracy": 0.6676215156912804, "num_tokens": 21949747.0, "step": 27278 }, { "epoch": 7.22457627118644, "grad_norm": 2.2590699195861816, "learning_rate": 6.387844279661017e-06, "loss": 1.04, "mean_token_accuracy": 0.7752974033355713, "num_tokens": 21951194.0, "step": 27280 }, { "epoch": 7.22510593220339, "grad_norm": 2.8664512634277344, "learning_rate": 6.387579449152543e-06, "loss": 1.1644, "mean_token_accuracy": 0.7428180426359177, "num_tokens": 21952663.0, "step": 27282 }, { "epoch": 7.225635593220339, "grad_norm": 2.45969295501709, "learning_rate": 6.387314618644068e-06, "loss": 1.1366, "mean_token_accuracy": 0.7286548987030983, "num_tokens": 21954101.0, "step": 27284 }, { "epoch": 7.226165254237288, "grad_norm": 2.6738719940185547, "learning_rate": 6.3870497881355945e-06, "loss": 1.4784, "mean_token_accuracy": 0.6538967117667198, "num_tokens": 21956054.0, "step": 27286 }, { "epoch": 7.226694915254237, "grad_norm": 3.3937315940856934, "learning_rate": 6.3867849576271194e-06, "loss": 1.3923, "mean_token_accuracy": 0.6931044682860374, "num_tokens": 21957612.0, "step": 27288 }, { "epoch": 7.227224576271187, "grad_norm": 3.4367353916168213, "learning_rate": 6.386520127118645e-06, "loss": 1.2307, "mean_token_accuracy": 0.7166216522455215, "num_tokens": 21958925.0, "step": 27290 }, { "epoch": 7.227754237288136, "grad_norm": 2.948500633239746, "learning_rate": 6.38625529661017e-06, "loss": 1.4709, "mean_token_accuracy": 0.653072752058506, "num_tokens": 21960407.0, "step": 27292 }, { "epoch": 7.228283898305085, "grad_norm": 2.597884178161621, "learning_rate": 6.385990466101696e-06, "loss": 1.1773, "mean_token_accuracy": 0.7315554767847061, "num_tokens": 21962076.0, "step": 27294 }, { "epoch": 7.228813559322034, "grad_norm": 2.2282955646514893, "learning_rate": 6.385725635593221e-06, "loss": 1.1022, "mean_token_accuracy": 0.7411927133798599, "num_tokens": 21963804.0, "step": 27296 }, { "epoch": 7.229343220338983, "grad_norm": 2.1864326000213623, "learning_rate": 6.385460805084747e-06, "loss": 1.3449, "mean_token_accuracy": 0.7315388321876526, "num_tokens": 21965435.0, "step": 27298 }, { "epoch": 7.2298728813559325, "grad_norm": 2.1447389125823975, "learning_rate": 6.3851959745762715e-06, "loss": 1.0035, "mean_token_accuracy": 0.7613661773502827, "num_tokens": 21966998.0, "step": 27300 }, { "epoch": 7.2304025423728815, "grad_norm": 2.8650593757629395, "learning_rate": 6.384931144067797e-06, "loss": 1.3611, "mean_token_accuracy": 0.7112817168235779, "num_tokens": 21968419.0, "step": 27302 }, { "epoch": 7.2309322033898304, "grad_norm": 2.585134506225586, "learning_rate": 6.384666313559322e-06, "loss": 1.225, "mean_token_accuracy": 0.7090175896883011, "num_tokens": 21970291.0, "step": 27304 }, { "epoch": 7.231461864406779, "grad_norm": 2.5866992473602295, "learning_rate": 6.384401483050848e-06, "loss": 1.48, "mean_token_accuracy": 0.686453066766262, "num_tokens": 21971981.0, "step": 27306 }, { "epoch": 7.231991525423728, "grad_norm": 2.822330951690674, "learning_rate": 6.384136652542373e-06, "loss": 1.3009, "mean_token_accuracy": 0.7019532918930054, "num_tokens": 21973593.0, "step": 27308 }, { "epoch": 7.232521186440678, "grad_norm": 2.266226053237915, "learning_rate": 6.383871822033899e-06, "loss": 1.3069, "mean_token_accuracy": 0.6840160638093948, "num_tokens": 21975453.0, "step": 27310 }, { "epoch": 7.233050847457627, "grad_norm": 2.0300586223602295, "learning_rate": 6.383606991525424e-06, "loss": 1.0929, "mean_token_accuracy": 0.7193244993686676, "num_tokens": 21977315.0, "step": 27312 }, { "epoch": 7.233580508474576, "grad_norm": 2.5220086574554443, "learning_rate": 6.38334216101695e-06, "loss": 1.5007, "mean_token_accuracy": 0.6622075736522675, "num_tokens": 21979125.0, "step": 27314 }, { "epoch": 7.234110169491525, "grad_norm": 2.5883851051330566, "learning_rate": 6.383077330508474e-06, "loss": 1.1656, "mean_token_accuracy": 0.7205364108085632, "num_tokens": 21980705.0, "step": 27316 }, { "epoch": 7.234639830508475, "grad_norm": 2.6640591621398926, "learning_rate": 6.382812500000001e-06, "loss": 1.0732, "mean_token_accuracy": 0.7643107026815414, "num_tokens": 21982161.0, "step": 27318 }, { "epoch": 7.235169491525424, "grad_norm": 2.3282413482666016, "learning_rate": 6.382547669491526e-06, "loss": 1.278, "mean_token_accuracy": 0.7134194448590279, "num_tokens": 21983763.0, "step": 27320 }, { "epoch": 7.235699152542373, "grad_norm": 2.257974147796631, "learning_rate": 6.382282838983052e-06, "loss": 0.9688, "mean_token_accuracy": 0.7610267996788025, "num_tokens": 21985594.0, "step": 27322 }, { "epoch": 7.236228813559322, "grad_norm": 3.1132326126098633, "learning_rate": 6.382018008474577e-06, "loss": 1.1447, "mean_token_accuracy": 0.7482578456401825, "num_tokens": 21987022.0, "step": 27324 }, { "epoch": 7.236758474576272, "grad_norm": 2.788252592086792, "learning_rate": 6.381753177966102e-06, "loss": 1.3519, "mean_token_accuracy": 0.7101476490497589, "num_tokens": 21988507.0, "step": 27326 }, { "epoch": 7.237288135593221, "grad_norm": 1.9808168411254883, "learning_rate": 6.381488347457627e-06, "loss": 0.9385, "mean_token_accuracy": 0.7737000212073326, "num_tokens": 21990143.0, "step": 27328 }, { "epoch": 7.2378177966101696, "grad_norm": 2.6795883178710938, "learning_rate": 6.381223516949153e-06, "loss": 0.99, "mean_token_accuracy": 0.7587607130408287, "num_tokens": 21991574.0, "step": 27330 }, { "epoch": 7.2383474576271185, "grad_norm": 3.1101419925689697, "learning_rate": 6.380958686440678e-06, "loss": 1.1895, "mean_token_accuracy": 0.7141270786523819, "num_tokens": 21993170.0, "step": 27332 }, { "epoch": 7.2388771186440675, "grad_norm": 2.034482002258301, "learning_rate": 6.380693855932204e-06, "loss": 1.0153, "mean_token_accuracy": 0.7411909699440002, "num_tokens": 21994798.0, "step": 27334 }, { "epoch": 7.239406779661017, "grad_norm": 2.9112420082092285, "learning_rate": 6.380429025423729e-06, "loss": 1.2066, "mean_token_accuracy": 0.724509708583355, "num_tokens": 21996397.0, "step": 27336 }, { "epoch": 7.239936440677966, "grad_norm": 2.248901844024658, "learning_rate": 6.3801641949152545e-06, "loss": 0.9823, "mean_token_accuracy": 0.7654655650258064, "num_tokens": 21997878.0, "step": 27338 }, { "epoch": 7.240466101694915, "grad_norm": 2.827833890914917, "learning_rate": 6.3798993644067794e-06, "loss": 1.322, "mean_token_accuracy": 0.7132636457681656, "num_tokens": 21999390.0, "step": 27340 }, { "epoch": 7.240995762711864, "grad_norm": 2.9994354248046875, "learning_rate": 6.379634533898306e-06, "loss": 1.1833, "mean_token_accuracy": 0.7426732331514359, "num_tokens": 22000905.0, "step": 27342 }, { "epoch": 7.241525423728813, "grad_norm": 2.0274741649627686, "learning_rate": 6.37936970338983e-06, "loss": 0.7296, "mean_token_accuracy": 0.8271134719252586, "num_tokens": 22002518.0, "step": 27344 }, { "epoch": 7.242055084745763, "grad_norm": 3.5083398818969727, "learning_rate": 6.379104872881357e-06, "loss": 1.2068, "mean_token_accuracy": 0.7139831185340881, "num_tokens": 22003815.0, "step": 27346 }, { "epoch": 7.242584745762712, "grad_norm": 2.850680112838745, "learning_rate": 6.378840042372882e-06, "loss": 1.386, "mean_token_accuracy": 0.6944009438157082, "num_tokens": 22005418.0, "step": 27348 }, { "epoch": 7.243114406779661, "grad_norm": 2.876835823059082, "learning_rate": 6.3785752118644075e-06, "loss": 0.8466, "mean_token_accuracy": 0.7897524163126945, "num_tokens": 22006786.0, "step": 27350 }, { "epoch": 7.24364406779661, "grad_norm": 3.4228227138519287, "learning_rate": 6.378310381355933e-06, "loss": 1.4459, "mean_token_accuracy": 0.6672986969351768, "num_tokens": 22008250.0, "step": 27352 }, { "epoch": 7.24417372881356, "grad_norm": 2.4834842681884766, "learning_rate": 6.378045550847458e-06, "loss": 1.0561, "mean_token_accuracy": 0.7410213649272919, "num_tokens": 22009795.0, "step": 27354 }, { "epoch": 7.244703389830509, "grad_norm": 2.2988948822021484, "learning_rate": 6.377780720338984e-06, "loss": 1.1744, "mean_token_accuracy": 0.7404158338904381, "num_tokens": 22011588.0, "step": 27356 }, { "epoch": 7.245233050847458, "grad_norm": 2.0505857467651367, "learning_rate": 6.377515889830509e-06, "loss": 0.8872, "mean_token_accuracy": 0.7783447355031967, "num_tokens": 22013067.0, "step": 27358 }, { "epoch": 7.245762711864407, "grad_norm": 2.2131507396698, "learning_rate": 6.377251059322035e-06, "loss": 0.8998, "mean_token_accuracy": 0.7912416383624077, "num_tokens": 22014503.0, "step": 27360 }, { "epoch": 7.2462923728813555, "grad_norm": 2.4641799926757812, "learning_rate": 6.37698622881356e-06, "loss": 1.0783, "mean_token_accuracy": 0.73799829185009, "num_tokens": 22016437.0, "step": 27362 }, { "epoch": 7.246822033898305, "grad_norm": 2.4191999435424805, "learning_rate": 6.376721398305085e-06, "loss": 1.025, "mean_token_accuracy": 0.7432508915662766, "num_tokens": 22017969.0, "step": 27364 }, { "epoch": 7.247351694915254, "grad_norm": 2.770320177078247, "learning_rate": 6.37645656779661e-06, "loss": 1.1394, "mean_token_accuracy": 0.7411326915025711, "num_tokens": 22019456.0, "step": 27366 }, { "epoch": 7.247881355932203, "grad_norm": 2.1305108070373535, "learning_rate": 6.376191737288137e-06, "loss": 0.6426, "mean_token_accuracy": 0.8269670829176903, "num_tokens": 22021018.0, "step": 27368 }, { "epoch": 7.248411016949152, "grad_norm": 2.170947313308716, "learning_rate": 6.375926906779661e-06, "loss": 0.6058, "mean_token_accuracy": 0.8447716757655144, "num_tokens": 22022806.0, "step": 27370 }, { "epoch": 7.248940677966102, "grad_norm": 2.8176305294036865, "learning_rate": 6.375662076271188e-06, "loss": 1.4071, "mean_token_accuracy": 0.7015915885567665, "num_tokens": 22024306.0, "step": 27372 }, { "epoch": 7.249470338983051, "grad_norm": 2.4675471782684326, "learning_rate": 6.3753972457627126e-06, "loss": 1.1305, "mean_token_accuracy": 0.7311557158827782, "num_tokens": 22026004.0, "step": 27374 }, { "epoch": 7.25, "grad_norm": 2.974543333053589, "learning_rate": 6.375132415254238e-06, "loss": 1.1177, "mean_token_accuracy": 0.7221501022577286, "num_tokens": 22027605.0, "step": 27376 }, { "epoch": 7.250529661016949, "grad_norm": 3.1041061878204346, "learning_rate": 6.374867584745763e-06, "loss": 1.0429, "mean_token_accuracy": 0.743228904902935, "num_tokens": 22028915.0, "step": 27378 }, { "epoch": 7.251059322033898, "grad_norm": 2.753150463104248, "learning_rate": 6.374602754237289e-06, "loss": 0.997, "mean_token_accuracy": 0.7391364425420761, "num_tokens": 22030442.0, "step": 27380 }, { "epoch": 7.251588983050848, "grad_norm": 2.858120918273926, "learning_rate": 6.374337923728814e-06, "loss": 1.4632, "mean_token_accuracy": 0.68059441447258, "num_tokens": 22032098.0, "step": 27382 }, { "epoch": 7.252118644067797, "grad_norm": 3.479651927947998, "learning_rate": 6.37407309322034e-06, "loss": 1.0875, "mean_token_accuracy": 0.7488712295889854, "num_tokens": 22034009.0, "step": 27384 }, { "epoch": 7.252648305084746, "grad_norm": 3.1597063541412354, "learning_rate": 6.373808262711865e-06, "loss": 0.989, "mean_token_accuracy": 0.7533774599432945, "num_tokens": 22035593.0, "step": 27386 }, { "epoch": 7.253177966101695, "grad_norm": 2.9710562229156494, "learning_rate": 6.3735434322033904e-06, "loss": 1.1583, "mean_token_accuracy": 0.7537121772766113, "num_tokens": 22036884.0, "step": 27388 }, { "epoch": 7.2537076271186445, "grad_norm": 2.55615496635437, "learning_rate": 6.373278601694915e-06, "loss": 0.9382, "mean_token_accuracy": 0.7477534711360931, "num_tokens": 22038726.0, "step": 27390 }, { "epoch": 7.254237288135593, "grad_norm": 2.6432249546051025, "learning_rate": 6.373013771186441e-06, "loss": 1.2213, "mean_token_accuracy": 0.700142540037632, "num_tokens": 22040308.0, "step": 27392 }, { "epoch": 7.254766949152542, "grad_norm": 3.284391403198242, "learning_rate": 6.372748940677966e-06, "loss": 1.2488, "mean_token_accuracy": 0.7252795845270157, "num_tokens": 22041815.0, "step": 27394 }, { "epoch": 7.255296610169491, "grad_norm": 2.9310076236724854, "learning_rate": 6.372484110169493e-06, "loss": 1.0665, "mean_token_accuracy": 0.7460607439279556, "num_tokens": 22043441.0, "step": 27396 }, { "epoch": 7.25582627118644, "grad_norm": 2.699463129043579, "learning_rate": 6.372219279661017e-06, "loss": 1.2245, "mean_token_accuracy": 0.7390593998134136, "num_tokens": 22045180.0, "step": 27398 }, { "epoch": 7.25635593220339, "grad_norm": 2.4971346855163574, "learning_rate": 6.371954449152543e-06, "loss": 0.9106, "mean_token_accuracy": 0.7754324525594711, "num_tokens": 22046590.0, "step": 27400 }, { "epoch": 7.256885593220339, "grad_norm": 4.082270622253418, "learning_rate": 6.371689618644068e-06, "loss": 1.2971, "mean_token_accuracy": 0.7095475718379021, "num_tokens": 22048059.0, "step": 27402 }, { "epoch": 7.257415254237288, "grad_norm": 2.174879312515259, "learning_rate": 6.371424788135594e-06, "loss": 0.9489, "mean_token_accuracy": 0.7639200836420059, "num_tokens": 22049781.0, "step": 27404 }, { "epoch": 7.257944915254237, "grad_norm": 2.95748233795166, "learning_rate": 6.371159957627119e-06, "loss": 0.9913, "mean_token_accuracy": 0.7654586881399155, "num_tokens": 22051567.0, "step": 27406 }, { "epoch": 7.258474576271187, "grad_norm": 2.7620322704315186, "learning_rate": 6.370895127118645e-06, "loss": 1.3507, "mean_token_accuracy": 0.6685413643717766, "num_tokens": 22054119.0, "step": 27408 }, { "epoch": 7.259004237288136, "grad_norm": 2.557175636291504, "learning_rate": 6.37063029661017e-06, "loss": 1.2327, "mean_token_accuracy": 0.7286583259701729, "num_tokens": 22055697.0, "step": 27410 }, { "epoch": 7.259533898305085, "grad_norm": 3.1287970542907715, "learning_rate": 6.3703654661016955e-06, "loss": 1.2566, "mean_token_accuracy": 0.7199085578322411, "num_tokens": 22057440.0, "step": 27412 }, { "epoch": 7.260063559322034, "grad_norm": 2.26567006111145, "learning_rate": 6.3701006355932205e-06, "loss": 1.3838, "mean_token_accuracy": 0.6689170151948929, "num_tokens": 22059036.0, "step": 27414 }, { "epoch": 7.260593220338983, "grad_norm": 1.9594337940216064, "learning_rate": 6.369835805084746e-06, "loss": 1.2036, "mean_token_accuracy": 0.7307908609509468, "num_tokens": 22061000.0, "step": 27416 }, { "epoch": 7.2611228813559325, "grad_norm": 2.741222858428955, "learning_rate": 6.369570974576271e-06, "loss": 0.7749, "mean_token_accuracy": 0.7959752157330513, "num_tokens": 22062620.0, "step": 27418 }, { "epoch": 7.2616525423728815, "grad_norm": 2.3784902095794678, "learning_rate": 6.369306144067797e-06, "loss": 1.0479, "mean_token_accuracy": 0.7608773708343506, "num_tokens": 22064077.0, "step": 27420 }, { "epoch": 7.2621822033898304, "grad_norm": 2.73604154586792, "learning_rate": 6.369041313559322e-06, "loss": 1.2082, "mean_token_accuracy": 0.7085421681404114, "num_tokens": 22065657.0, "step": 27422 }, { "epoch": 7.262711864406779, "grad_norm": 2.3774430751800537, "learning_rate": 6.368776483050848e-06, "loss": 1.1592, "mean_token_accuracy": 0.7298837155103683, "num_tokens": 22067450.0, "step": 27424 }, { "epoch": 7.263241525423728, "grad_norm": 2.662989377975464, "learning_rate": 6.3685116525423726e-06, "loss": 1.0495, "mean_token_accuracy": 0.7461475357413292, "num_tokens": 22068919.0, "step": 27426 }, { "epoch": 7.263771186440678, "grad_norm": 2.435894727706909, "learning_rate": 6.368246822033899e-06, "loss": 1.4327, "mean_token_accuracy": 0.657079204916954, "num_tokens": 22070735.0, "step": 27428 }, { "epoch": 7.264300847457627, "grad_norm": 3.0758392810821533, "learning_rate": 6.367981991525424e-06, "loss": 1.1001, "mean_token_accuracy": 0.7343556135892868, "num_tokens": 22072209.0, "step": 27430 }, { "epoch": 7.264830508474576, "grad_norm": 2.4706871509552, "learning_rate": 6.36771716101695e-06, "loss": 1.2908, "mean_token_accuracy": 0.718082882463932, "num_tokens": 22073723.0, "step": 27432 }, { "epoch": 7.265360169491525, "grad_norm": 2.9663522243499756, "learning_rate": 6.367452330508475e-06, "loss": 1.1718, "mean_token_accuracy": 0.7323949262499809, "num_tokens": 22075275.0, "step": 27434 }, { "epoch": 7.265889830508475, "grad_norm": 2.9806196689605713, "learning_rate": 6.367187500000001e-06, "loss": 1.3499, "mean_token_accuracy": 0.687720000743866, "num_tokens": 22076985.0, "step": 27436 }, { "epoch": 7.266419491525424, "grad_norm": 2.6625771522521973, "learning_rate": 6.366922669491526e-06, "loss": 1.0067, "mean_token_accuracy": 0.7505495697259903, "num_tokens": 22078972.0, "step": 27438 }, { "epoch": 7.266949152542373, "grad_norm": 2.510521411895752, "learning_rate": 6.366657838983051e-06, "loss": 1.2771, "mean_token_accuracy": 0.7176803350448608, "num_tokens": 22080526.0, "step": 27440 }, { "epoch": 7.267478813559322, "grad_norm": 2.7093753814697266, "learning_rate": 6.366393008474577e-06, "loss": 1.5528, "mean_token_accuracy": 0.6625945717096329, "num_tokens": 22082261.0, "step": 27442 }, { "epoch": 7.268008474576272, "grad_norm": 2.730010747909546, "learning_rate": 6.366128177966102e-06, "loss": 1.3341, "mean_token_accuracy": 0.6895381584763527, "num_tokens": 22083638.0, "step": 27444 }, { "epoch": 7.268538135593221, "grad_norm": 2.39506459236145, "learning_rate": 6.365863347457628e-06, "loss": 1.2376, "mean_token_accuracy": 0.7208186388015747, "num_tokens": 22085614.0, "step": 27446 }, { "epoch": 7.2690677966101696, "grad_norm": 2.7414872646331787, "learning_rate": 6.365598516949153e-06, "loss": 1.2823, "mean_token_accuracy": 0.6989562064409256, "num_tokens": 22087203.0, "step": 27448 }, { "epoch": 7.2695974576271185, "grad_norm": 2.7097549438476562, "learning_rate": 6.365333686440679e-06, "loss": 1.0178, "mean_token_accuracy": 0.764176219701767, "num_tokens": 22088498.0, "step": 27450 }, { "epoch": 7.2701271186440675, "grad_norm": 2.846008777618408, "learning_rate": 6.3650688559322034e-06, "loss": 1.5251, "mean_token_accuracy": 0.6742094308137894, "num_tokens": 22090151.0, "step": 27452 }, { "epoch": 7.270656779661017, "grad_norm": 2.5831637382507324, "learning_rate": 6.36480402542373e-06, "loss": 1.4496, "mean_token_accuracy": 0.6923297867178917, "num_tokens": 22091617.0, "step": 27454 }, { "epoch": 7.271186440677966, "grad_norm": 3.0536694526672363, "learning_rate": 6.364539194915255e-06, "loss": 0.9003, "mean_token_accuracy": 0.7918722555041313, "num_tokens": 22093197.0, "step": 27456 }, { "epoch": 7.271716101694915, "grad_norm": 2.9040443897247314, "learning_rate": 6.364274364406781e-06, "loss": 1.0817, "mean_token_accuracy": 0.7459635138511658, "num_tokens": 22094545.0, "step": 27458 }, { "epoch": 7.272245762711864, "grad_norm": 2.7183685302734375, "learning_rate": 6.364009533898306e-06, "loss": 1.3568, "mean_token_accuracy": 0.7094782255589962, "num_tokens": 22095893.0, "step": 27460 }, { "epoch": 7.272775423728813, "grad_norm": 2.3814029693603516, "learning_rate": 6.3637447033898315e-06, "loss": 0.9622, "mean_token_accuracy": 0.7620750814676285, "num_tokens": 22097547.0, "step": 27462 }, { "epoch": 7.273305084745763, "grad_norm": 2.5940933227539062, "learning_rate": 6.363479872881356e-06, "loss": 1.0585, "mean_token_accuracy": 0.7333376556634903, "num_tokens": 22099015.0, "step": 27464 }, { "epoch": 7.273834745762712, "grad_norm": 3.4107930660247803, "learning_rate": 6.363215042372882e-06, "loss": 1.2571, "mean_token_accuracy": 0.7226743102073669, "num_tokens": 22100575.0, "step": 27466 }, { "epoch": 7.274364406779661, "grad_norm": 2.689603328704834, "learning_rate": 6.362950211864407e-06, "loss": 0.9755, "mean_token_accuracy": 0.7469530180096626, "num_tokens": 22102223.0, "step": 27468 }, { "epoch": 7.27489406779661, "grad_norm": 2.614542007446289, "learning_rate": 6.362685381355933e-06, "loss": 1.3552, "mean_token_accuracy": 0.6992892771959305, "num_tokens": 22104050.0, "step": 27470 }, { "epoch": 7.27542372881356, "grad_norm": 2.8981854915618896, "learning_rate": 6.362420550847458e-06, "loss": 1.0962, "mean_token_accuracy": 0.770111933350563, "num_tokens": 22105612.0, "step": 27472 }, { "epoch": 7.275953389830509, "grad_norm": 2.911653995513916, "learning_rate": 6.362155720338984e-06, "loss": 1.433, "mean_token_accuracy": 0.667639434337616, "num_tokens": 22107192.0, "step": 27474 }, { "epoch": 7.276483050847458, "grad_norm": 2.797964572906494, "learning_rate": 6.3618908898305085e-06, "loss": 1.4361, "mean_token_accuracy": 0.6855931133031845, "num_tokens": 22108723.0, "step": 27476 }, { "epoch": 7.277012711864407, "grad_norm": 3.0148680210113525, "learning_rate": 6.361626059322034e-06, "loss": 1.1194, "mean_token_accuracy": 0.7212969064712524, "num_tokens": 22110411.0, "step": 27478 }, { "epoch": 7.2775423728813555, "grad_norm": 3.085761547088623, "learning_rate": 6.361361228813559e-06, "loss": 1.376, "mean_token_accuracy": 0.7100708112120628, "num_tokens": 22111878.0, "step": 27480 }, { "epoch": 7.278072033898305, "grad_norm": 2.6782894134521484, "learning_rate": 6.361096398305086e-06, "loss": 1.2626, "mean_token_accuracy": 0.7046827003359795, "num_tokens": 22113565.0, "step": 27482 }, { "epoch": 7.278601694915254, "grad_norm": 2.4982399940490723, "learning_rate": 6.360831567796611e-06, "loss": 1.2307, "mean_token_accuracy": 0.7235394790768623, "num_tokens": 22115278.0, "step": 27484 }, { "epoch": 7.279131355932203, "grad_norm": 2.4312708377838135, "learning_rate": 6.3605667372881365e-06, "loss": 0.8884, "mean_token_accuracy": 0.7820464745163918, "num_tokens": 22116718.0, "step": 27486 }, { "epoch": 7.279661016949152, "grad_norm": 2.7114248275756836, "learning_rate": 6.3603019067796615e-06, "loss": 1.1926, "mean_token_accuracy": 0.7318062782287598, "num_tokens": 22118102.0, "step": 27488 }, { "epoch": 7.280190677966102, "grad_norm": 2.538602352142334, "learning_rate": 6.360037076271187e-06, "loss": 1.1985, "mean_token_accuracy": 0.7074414193630219, "num_tokens": 22119641.0, "step": 27490 }, { "epoch": 7.280720338983051, "grad_norm": 2.4881889820098877, "learning_rate": 6.359772245762712e-06, "loss": 1.2277, "mean_token_accuracy": 0.6864653527736664, "num_tokens": 22121215.0, "step": 27492 }, { "epoch": 7.28125, "grad_norm": 2.530336856842041, "learning_rate": 6.359507415254238e-06, "loss": 1.0648, "mean_token_accuracy": 0.7424117103219032, "num_tokens": 22123003.0, "step": 27494 }, { "epoch": 7.281779661016949, "grad_norm": 2.2865402698516846, "learning_rate": 6.359242584745763e-06, "loss": 1.0311, "mean_token_accuracy": 0.7452148795127869, "num_tokens": 22124670.0, "step": 27496 }, { "epoch": 7.282309322033898, "grad_norm": 2.297795295715332, "learning_rate": 6.358977754237289e-06, "loss": 0.6461, "mean_token_accuracy": 0.8361491933465004, "num_tokens": 22126280.0, "step": 27498 }, { "epoch": 7.282838983050848, "grad_norm": 2.854822874069214, "learning_rate": 6.358712923728814e-06, "loss": 1.4355, "step": 27500 }, { "epoch": 7.282838983050848, "eval_loss": 1.33561110496521, "eval_mean_token_accuracy": 0.6990470152783703, "eval_num_tokens": 22127750.0, "eval_runtime": 48.2829, "eval_samples_per_second": 6.379, "eval_steps_per_second": 6.379, "step": 27500 }, { "epoch": 7.283368644067797, "grad_norm": 2.643028736114502, "learning_rate": 6.358448093220339e-06, "loss": 1.4504, "mean_token_accuracy": 0.6928317174315453, "num_tokens": 22129296.0, "step": 27502 }, { "epoch": 7.283898305084746, "grad_norm": 2.305063009262085, "learning_rate": 6.358183262711864e-06, "loss": 1.099, "mean_token_accuracy": 0.7640021368861198, "num_tokens": 22130649.0, "step": 27504 }, { "epoch": 7.284427966101695, "grad_norm": 2.2059290409088135, "learning_rate": 6.35791843220339e-06, "loss": 0.8464, "mean_token_accuracy": 0.7800518646836281, "num_tokens": 22132207.0, "step": 27506 }, { "epoch": 7.2849576271186445, "grad_norm": 2.5695600509643555, "learning_rate": 6.357653601694915e-06, "loss": 1.3487, "mean_token_accuracy": 0.7004715874791145, "num_tokens": 22133762.0, "step": 27508 }, { "epoch": 7.285487288135593, "grad_norm": 3.0833961963653564, "learning_rate": 6.357388771186442e-06, "loss": 1.0759, "mean_token_accuracy": 0.7367045506834984, "num_tokens": 22135144.0, "step": 27510 }, { "epoch": 7.286016949152542, "grad_norm": 3.017731189727783, "learning_rate": 6.357123940677966e-06, "loss": 1.3269, "mean_token_accuracy": 0.6862766444683075, "num_tokens": 22136685.0, "step": 27512 }, { "epoch": 7.286546610169491, "grad_norm": 2.1974451541900635, "learning_rate": 6.356859110169492e-06, "loss": 0.8542, "mean_token_accuracy": 0.7807115316390991, "num_tokens": 22138261.0, "step": 27514 }, { "epoch": 7.28707627118644, "grad_norm": 2.7375242710113525, "learning_rate": 6.356594279661017e-06, "loss": 1.1187, "mean_token_accuracy": 0.755912147462368, "num_tokens": 22139837.0, "step": 27516 }, { "epoch": 7.28760593220339, "grad_norm": 2.4257631301879883, "learning_rate": 6.356329449152543e-06, "loss": 1.1766, "mean_token_accuracy": 0.7262263037264347, "num_tokens": 22141208.0, "step": 27518 }, { "epoch": 7.288135593220339, "grad_norm": 3.2726809978485107, "learning_rate": 6.356064618644068e-06, "loss": 1.6545, "mean_token_accuracy": 0.634592343121767, "num_tokens": 22142962.0, "step": 27520 }, { "epoch": 7.288665254237288, "grad_norm": 2.576404333114624, "learning_rate": 6.355799788135594e-06, "loss": 1.2468, "mean_token_accuracy": 0.7445703223347664, "num_tokens": 22144379.0, "step": 27522 }, { "epoch": 7.289194915254237, "grad_norm": 2.543168067932129, "learning_rate": 6.3555349576271195e-06, "loss": 1.1047, "mean_token_accuracy": 0.727476567029953, "num_tokens": 22145926.0, "step": 27524 }, { "epoch": 7.289724576271187, "grad_norm": 2.188113212585449, "learning_rate": 6.3552701271186444e-06, "loss": 1.1185, "mean_token_accuracy": 0.7594826892018318, "num_tokens": 22147692.0, "step": 27526 }, { "epoch": 7.290254237288136, "grad_norm": 2.7450363636016846, "learning_rate": 6.35500529661017e-06, "loss": 0.8672, "mean_token_accuracy": 0.7913522645831108, "num_tokens": 22149351.0, "step": 27528 }, { "epoch": 7.290783898305085, "grad_norm": 2.2548351287841797, "learning_rate": 6.354740466101695e-06, "loss": 1.2392, "mean_token_accuracy": 0.7227983698248863, "num_tokens": 22151131.0, "step": 27530 }, { "epoch": 7.291313559322034, "grad_norm": 2.46250581741333, "learning_rate": 6.354475635593221e-06, "loss": 1.1311, "mean_token_accuracy": 0.7268149554729462, "num_tokens": 22152569.0, "step": 27532 }, { "epoch": 7.291843220338983, "grad_norm": 2.909740924835205, "learning_rate": 6.354210805084746e-06, "loss": 1.1652, "mean_token_accuracy": 0.7217259407043457, "num_tokens": 22154241.0, "step": 27534 }, { "epoch": 7.2923728813559325, "grad_norm": 2.50331974029541, "learning_rate": 6.3539459745762725e-06, "loss": 0.9541, "mean_token_accuracy": 0.7646428123116493, "num_tokens": 22155755.0, "step": 27536 }, { "epoch": 7.2929025423728815, "grad_norm": 2.7481179237365723, "learning_rate": 6.353681144067797e-06, "loss": 0.8983, "mean_token_accuracy": 0.7834630832076073, "num_tokens": 22157326.0, "step": 27538 }, { "epoch": 7.2934322033898304, "grad_norm": 2.3167948722839355, "learning_rate": 6.353416313559323e-06, "loss": 0.9539, "mean_token_accuracy": 0.7684978023171425, "num_tokens": 22159001.0, "step": 27540 }, { "epoch": 7.293961864406779, "grad_norm": 2.3431427478790283, "learning_rate": 6.353151483050848e-06, "loss": 0.824, "mean_token_accuracy": 0.7964636757969856, "num_tokens": 22160359.0, "step": 27542 }, { "epoch": 7.294491525423728, "grad_norm": 2.4652347564697266, "learning_rate": 6.352886652542374e-06, "loss": 0.8943, "mean_token_accuracy": 0.7724586129188538, "num_tokens": 22162063.0, "step": 27544 }, { "epoch": 7.295021186440678, "grad_norm": 2.9992947578430176, "learning_rate": 6.352621822033899e-06, "loss": 1.3195, "mean_token_accuracy": 0.708934485912323, "num_tokens": 22163605.0, "step": 27546 }, { "epoch": 7.295550847457627, "grad_norm": 2.404813766479492, "learning_rate": 6.352356991525425e-06, "loss": 1.3443, "mean_token_accuracy": 0.6868055164813995, "num_tokens": 22165286.0, "step": 27548 }, { "epoch": 7.296080508474576, "grad_norm": 2.997406005859375, "learning_rate": 6.3520921610169495e-06, "loss": 1.0471, "mean_token_accuracy": 0.7400263622403145, "num_tokens": 22166971.0, "step": 27550 }, { "epoch": 7.296610169491525, "grad_norm": 2.6258914470672607, "learning_rate": 6.351827330508475e-06, "loss": 1.639, "mean_token_accuracy": 0.6411176696419716, "num_tokens": 22168607.0, "step": 27552 }, { "epoch": 7.297139830508475, "grad_norm": 2.407273530960083, "learning_rate": 6.3515625e-06, "loss": 0.7725, "mean_token_accuracy": 0.7835506275296211, "num_tokens": 22170467.0, "step": 27554 }, { "epoch": 7.297669491525424, "grad_norm": 3.018993377685547, "learning_rate": 6.351297669491526e-06, "loss": 1.3127, "mean_token_accuracy": 0.6979938521981239, "num_tokens": 22171867.0, "step": 27556 }, { "epoch": 7.298199152542373, "grad_norm": 2.4767701625823975, "learning_rate": 6.351032838983051e-06, "loss": 1.1026, "mean_token_accuracy": 0.7414448112249374, "num_tokens": 22173372.0, "step": 27558 }, { "epoch": 7.298728813559322, "grad_norm": 2.201476573944092, "learning_rate": 6.350768008474577e-06, "loss": 1.0513, "mean_token_accuracy": 0.7488341704010963, "num_tokens": 22175038.0, "step": 27560 }, { "epoch": 7.299258474576272, "grad_norm": 2.6824533939361572, "learning_rate": 6.350503177966102e-06, "loss": 1.1805, "mean_token_accuracy": 0.7377370223402977, "num_tokens": 22176622.0, "step": 27562 }, { "epoch": 7.299788135593221, "grad_norm": 2.4341981410980225, "learning_rate": 6.350238347457628e-06, "loss": 1.0367, "mean_token_accuracy": 0.7376254498958588, "num_tokens": 22178168.0, "step": 27564 }, { "epoch": 7.3003177966101696, "grad_norm": 2.5530483722686768, "learning_rate": 6.349973516949152e-06, "loss": 1.3438, "mean_token_accuracy": 0.6911372989416122, "num_tokens": 22179694.0, "step": 27566 }, { "epoch": 7.3008474576271185, "grad_norm": 2.8748879432678223, "learning_rate": 6.349708686440679e-06, "loss": 1.0888, "mean_token_accuracy": 0.735251359641552, "num_tokens": 22181395.0, "step": 27568 }, { "epoch": 7.3013771186440675, "grad_norm": 2.420083522796631, "learning_rate": 6.349443855932204e-06, "loss": 1.1662, "mean_token_accuracy": 0.713231198489666, "num_tokens": 22182862.0, "step": 27570 }, { "epoch": 7.301906779661017, "grad_norm": 3.0748462677001953, "learning_rate": 6.34917902542373e-06, "loss": 1.589, "mean_token_accuracy": 0.6501793190836906, "num_tokens": 22184607.0, "step": 27572 }, { "epoch": 7.302436440677966, "grad_norm": 2.152963638305664, "learning_rate": 6.348914194915255e-06, "loss": 0.8434, "mean_token_accuracy": 0.7820108160376549, "num_tokens": 22186290.0, "step": 27574 }, { "epoch": 7.302966101694915, "grad_norm": 3.1212480068206787, "learning_rate": 6.34864936440678e-06, "loss": 1.3698, "mean_token_accuracy": 0.685204766690731, "num_tokens": 22187794.0, "step": 27576 }, { "epoch": 7.303495762711864, "grad_norm": 2.9225924015045166, "learning_rate": 6.348384533898305e-06, "loss": 1.9402, "mean_token_accuracy": 0.5713943913578987, "num_tokens": 22189553.0, "step": 27578 }, { "epoch": 7.304025423728813, "grad_norm": 2.3553407192230225, "learning_rate": 6.348119703389831e-06, "loss": 0.917, "mean_token_accuracy": 0.7594696208834648, "num_tokens": 22191113.0, "step": 27580 }, { "epoch": 7.304555084745763, "grad_norm": 2.918537139892578, "learning_rate": 6.347854872881356e-06, "loss": 1.6614, "mean_token_accuracy": 0.6582042202353477, "num_tokens": 22192685.0, "step": 27582 }, { "epoch": 7.305084745762712, "grad_norm": 2.8689396381378174, "learning_rate": 6.347590042372882e-06, "loss": 1.5073, "mean_token_accuracy": 0.672215461730957, "num_tokens": 22194307.0, "step": 27584 }, { "epoch": 7.305614406779661, "grad_norm": 2.455465793609619, "learning_rate": 6.347325211864407e-06, "loss": 0.9413, "mean_token_accuracy": 0.7627626061439514, "num_tokens": 22196171.0, "step": 27586 }, { "epoch": 7.30614406779661, "grad_norm": 2.754054546356201, "learning_rate": 6.3470603813559325e-06, "loss": 1.7327, "mean_token_accuracy": 0.63930644094944, "num_tokens": 22197784.0, "step": 27588 }, { "epoch": 7.30667372881356, "grad_norm": 2.52943754196167, "learning_rate": 6.346795550847457e-06, "loss": 1.3627, "mean_token_accuracy": 0.7141416221857071, "num_tokens": 22199362.0, "step": 27590 }, { "epoch": 7.307203389830509, "grad_norm": 2.561797618865967, "learning_rate": 6.346530720338984e-06, "loss": 1.3401, "mean_token_accuracy": 0.6975513473153114, "num_tokens": 22200857.0, "step": 27592 }, { "epoch": 7.307733050847458, "grad_norm": 2.9845385551452637, "learning_rate": 6.346265889830508e-06, "loss": 1.0376, "mean_token_accuracy": 0.7615564465522766, "num_tokens": 22202510.0, "step": 27594 }, { "epoch": 7.308262711864407, "grad_norm": 1.7263081073760986, "learning_rate": 6.346001059322035e-06, "loss": 0.8679, "mean_token_accuracy": 0.8066568672657013, "num_tokens": 22204193.0, "step": 27596 }, { "epoch": 7.3087923728813555, "grad_norm": 2.686760663986206, "learning_rate": 6.34573622881356e-06, "loss": 0.9973, "mean_token_accuracy": 0.7577509209513664, "num_tokens": 22205949.0, "step": 27598 }, { "epoch": 7.309322033898305, "grad_norm": 2.8409881591796875, "learning_rate": 6.3454713983050855e-06, "loss": 1.0571, "mean_token_accuracy": 0.7314026057720184, "num_tokens": 22207724.0, "step": 27600 }, { "epoch": 7.309851694915254, "grad_norm": 2.2819840908050537, "learning_rate": 6.34520656779661e-06, "loss": 1.3521, "mean_token_accuracy": 0.6813852861523628, "num_tokens": 22209550.0, "step": 27602 }, { "epoch": 7.310381355932203, "grad_norm": 2.9124491214752197, "learning_rate": 6.344941737288136e-06, "loss": 1.3487, "mean_token_accuracy": 0.683343093842268, "num_tokens": 22211188.0, "step": 27604 }, { "epoch": 7.310911016949152, "grad_norm": 2.5253255367279053, "learning_rate": 6.344676906779662e-06, "loss": 1.4578, "mean_token_accuracy": 0.6569262966513634, "num_tokens": 22212911.0, "step": 27606 }, { "epoch": 7.311440677966102, "grad_norm": 2.4082164764404297, "learning_rate": 6.344412076271187e-06, "loss": 1.3121, "mean_token_accuracy": 0.7230099067091942, "num_tokens": 22214444.0, "step": 27608 }, { "epoch": 7.311970338983051, "grad_norm": 2.8982717990875244, "learning_rate": 6.344147245762713e-06, "loss": 1.8266, "mean_token_accuracy": 0.6025206744670868, "num_tokens": 22216218.0, "step": 27610 }, { "epoch": 7.3125, "grad_norm": 2.595628023147583, "learning_rate": 6.3438824152542376e-06, "loss": 1.3804, "mean_token_accuracy": 0.6854591891169548, "num_tokens": 22217807.0, "step": 27612 }, { "epoch": 7.313029661016949, "grad_norm": 2.649353265762329, "learning_rate": 6.343617584745763e-06, "loss": 0.9025, "mean_token_accuracy": 0.7655102238059044, "num_tokens": 22219639.0, "step": 27614 }, { "epoch": 7.313559322033898, "grad_norm": 2.8803799152374268, "learning_rate": 6.343352754237288e-06, "loss": 1.3542, "mean_token_accuracy": 0.6820192039012909, "num_tokens": 22221568.0, "step": 27616 }, { "epoch": 7.314088983050848, "grad_norm": 2.1842846870422363, "learning_rate": 6.343087923728815e-06, "loss": 1.3083, "mean_token_accuracy": 0.6971524506807327, "num_tokens": 22223610.0, "step": 27618 }, { "epoch": 7.314618644067797, "grad_norm": 2.877338171005249, "learning_rate": 6.342823093220339e-06, "loss": 1.2281, "mean_token_accuracy": 0.7161726206541061, "num_tokens": 22225311.0, "step": 27620 }, { "epoch": 7.315148305084746, "grad_norm": 2.5987255573272705, "learning_rate": 6.342558262711866e-06, "loss": 0.9767, "mean_token_accuracy": 0.7672359645366669, "num_tokens": 22226810.0, "step": 27622 }, { "epoch": 7.315677966101695, "grad_norm": 3.378164052963257, "learning_rate": 6.3422934322033905e-06, "loss": 1.145, "mean_token_accuracy": 0.7112538069486618, "num_tokens": 22228476.0, "step": 27624 }, { "epoch": 7.3162076271186445, "grad_norm": 2.8650989532470703, "learning_rate": 6.342028601694916e-06, "loss": 1.0453, "mean_token_accuracy": 0.7402955368161201, "num_tokens": 22230036.0, "step": 27626 }, { "epoch": 7.316737288135593, "grad_norm": 3.041917085647583, "learning_rate": 6.341763771186441e-06, "loss": 1.2438, "mean_token_accuracy": 0.7119405344128609, "num_tokens": 22231736.0, "step": 27628 }, { "epoch": 7.317266949152542, "grad_norm": 2.34055757522583, "learning_rate": 6.341498940677967e-06, "loss": 1.2659, "mean_token_accuracy": 0.6871560290455818, "num_tokens": 22233528.0, "step": 27630 }, { "epoch": 7.317796610169491, "grad_norm": 2.982250213623047, "learning_rate": 6.341234110169492e-06, "loss": 1.2808, "mean_token_accuracy": 0.7020213156938553, "num_tokens": 22236263.0, "step": 27632 }, { "epoch": 7.31832627118644, "grad_norm": 2.8577609062194824, "learning_rate": 6.340969279661018e-06, "loss": 1.3481, "mean_token_accuracy": 0.6801131367683411, "num_tokens": 22237926.0, "step": 27634 }, { "epoch": 7.31885593220339, "grad_norm": 2.32747220993042, "learning_rate": 6.340704449152543e-06, "loss": 1.1826, "mean_token_accuracy": 0.7278234884142876, "num_tokens": 22239955.0, "step": 27636 }, { "epoch": 7.319385593220339, "grad_norm": 2.906841516494751, "learning_rate": 6.3404396186440684e-06, "loss": 1.1928, "mean_token_accuracy": 0.7258834391832352, "num_tokens": 22241839.0, "step": 27638 }, { "epoch": 7.319915254237288, "grad_norm": 2.3059918880462646, "learning_rate": 6.340174788135593e-06, "loss": 1.2553, "mean_token_accuracy": 0.7285369336605072, "num_tokens": 22243768.0, "step": 27640 }, { "epoch": 7.320444915254237, "grad_norm": 2.327181339263916, "learning_rate": 6.339909957627119e-06, "loss": 1.0979, "mean_token_accuracy": 0.7328582182526588, "num_tokens": 22245626.0, "step": 27642 }, { "epoch": 7.320974576271187, "grad_norm": 3.4470877647399902, "learning_rate": 6.339645127118644e-06, "loss": 1.6186, "mean_token_accuracy": 0.6566818952560425, "num_tokens": 22247054.0, "step": 27644 }, { "epoch": 7.321504237288136, "grad_norm": 2.656735897064209, "learning_rate": 6.339380296610171e-06, "loss": 1.5319, "mean_token_accuracy": 0.6692043021321297, "num_tokens": 22248643.0, "step": 27646 }, { "epoch": 7.322033898305085, "grad_norm": 2.6167447566986084, "learning_rate": 6.339115466101695e-06, "loss": 1.5276, "mean_token_accuracy": 0.654021754860878, "num_tokens": 22250238.0, "step": 27648 }, { "epoch": 7.322563559322034, "grad_norm": 3.0963897705078125, "learning_rate": 6.338850635593221e-06, "loss": 1.2455, "mean_token_accuracy": 0.7327922135591507, "num_tokens": 22251658.0, "step": 27650 }, { "epoch": 7.323093220338983, "grad_norm": 2.9706263542175293, "learning_rate": 6.338585805084746e-06, "loss": 1.7683, "mean_token_accuracy": 0.6095814928412437, "num_tokens": 22253247.0, "step": 27652 }, { "epoch": 7.3236228813559325, "grad_norm": 2.6163830757141113, "learning_rate": 6.338320974576272e-06, "loss": 0.9701, "mean_token_accuracy": 0.7618570998311043, "num_tokens": 22254892.0, "step": 27654 }, { "epoch": 7.3241525423728815, "grad_norm": 1.861011028289795, "learning_rate": 6.338056144067797e-06, "loss": 1.0375, "mean_token_accuracy": 0.737295851111412, "num_tokens": 22256883.0, "step": 27656 }, { "epoch": 7.3246822033898304, "grad_norm": 2.2281930446624756, "learning_rate": 6.337791313559323e-06, "loss": 1.3147, "mean_token_accuracy": 0.6803358718752861, "num_tokens": 22258745.0, "step": 27658 }, { "epoch": 7.325211864406779, "grad_norm": 3.173892021179199, "learning_rate": 6.337526483050848e-06, "loss": 1.4434, "mean_token_accuracy": 0.6794343069195747, "num_tokens": 22260452.0, "step": 27660 }, { "epoch": 7.325741525423728, "grad_norm": 2.613776445388794, "learning_rate": 6.3372616525423735e-06, "loss": 1.1204, "mean_token_accuracy": 0.757389772683382, "num_tokens": 22261806.0, "step": 27662 }, { "epoch": 7.326271186440678, "grad_norm": 3.4400064945220947, "learning_rate": 6.3369968220338984e-06, "loss": 1.2455, "mean_token_accuracy": 0.717123955488205, "num_tokens": 22263232.0, "step": 27664 }, { "epoch": 7.326800847457627, "grad_norm": 2.0282697677612305, "learning_rate": 6.336731991525424e-06, "loss": 0.8542, "mean_token_accuracy": 0.7979394719004631, "num_tokens": 22264687.0, "step": 27666 }, { "epoch": 7.327330508474576, "grad_norm": 2.832280158996582, "learning_rate": 6.336467161016949e-06, "loss": 1.1256, "mean_token_accuracy": 0.7319172024726868, "num_tokens": 22266109.0, "step": 27668 }, { "epoch": 7.327860169491525, "grad_norm": 2.634077548980713, "learning_rate": 6.336202330508475e-06, "loss": 1.2387, "mean_token_accuracy": 0.7075159326195717, "num_tokens": 22267944.0, "step": 27670 }, { "epoch": 7.328389830508475, "grad_norm": 2.160444974899292, "learning_rate": 6.3359375e-06, "loss": 1.0276, "mean_token_accuracy": 0.7561782523989677, "num_tokens": 22269788.0, "step": 27672 }, { "epoch": 7.328919491525424, "grad_norm": 2.567147970199585, "learning_rate": 6.335672669491526e-06, "loss": 1.2717, "mean_token_accuracy": 0.7032406702637672, "num_tokens": 22271406.0, "step": 27674 }, { "epoch": 7.329449152542373, "grad_norm": 2.8917250633239746, "learning_rate": 6.3354078389830506e-06, "loss": 1.1859, "mean_token_accuracy": 0.7277095541357994, "num_tokens": 22273077.0, "step": 27676 }, { "epoch": 7.329978813559322, "grad_norm": 2.4582767486572266, "learning_rate": 6.335143008474577e-06, "loss": 1.1919, "mean_token_accuracy": 0.7331730201840401, "num_tokens": 22274533.0, "step": 27678 }, { "epoch": 7.330508474576272, "grad_norm": 2.676622152328491, "learning_rate": 6.334878177966102e-06, "loss": 1.4657, "mean_token_accuracy": 0.6690487563610077, "num_tokens": 22276123.0, "step": 27680 }, { "epoch": 7.331038135593221, "grad_norm": 3.6521081924438477, "learning_rate": 6.334613347457628e-06, "loss": 1.2204, "mean_token_accuracy": 0.7148040682077408, "num_tokens": 22277593.0, "step": 27682 }, { "epoch": 7.3315677966101696, "grad_norm": 2.225160837173462, "learning_rate": 6.334348516949153e-06, "loss": 0.986, "mean_token_accuracy": 0.7503638565540314, "num_tokens": 22279200.0, "step": 27684 }, { "epoch": 7.3320974576271185, "grad_norm": 2.4469263553619385, "learning_rate": 6.334083686440679e-06, "loss": 1.3291, "mean_token_accuracy": 0.7071046456694603, "num_tokens": 22280835.0, "step": 27686 }, { "epoch": 7.3326271186440675, "grad_norm": 2.6526358127593994, "learning_rate": 6.3338188559322035e-06, "loss": 1.1285, "mean_token_accuracy": 0.7285580039024353, "num_tokens": 22282238.0, "step": 27688 }, { "epoch": 7.333156779661017, "grad_norm": 2.0839929580688477, "learning_rate": 6.333554025423729e-06, "loss": 0.7821, "mean_token_accuracy": 0.8016134947538376, "num_tokens": 22283714.0, "step": 27690 }, { "epoch": 7.333686440677966, "grad_norm": 2.625213146209717, "learning_rate": 6.333289194915255e-06, "loss": 1.4716, "mean_token_accuracy": 0.6581221148371696, "num_tokens": 22285298.0, "step": 27692 }, { "epoch": 7.334216101694915, "grad_norm": 3.025878429412842, "learning_rate": 6.33302436440678e-06, "loss": 1.3051, "mean_token_accuracy": 0.7070126384496689, "num_tokens": 22287012.0, "step": 27694 }, { "epoch": 7.334745762711864, "grad_norm": 2.4833168983459473, "learning_rate": 6.332759533898306e-06, "loss": 1.1414, "mean_token_accuracy": 0.7232707515358925, "num_tokens": 22288542.0, "step": 27696 }, { "epoch": 7.335275423728813, "grad_norm": 2.6680164337158203, "learning_rate": 6.332494703389831e-06, "loss": 1.1235, "mean_token_accuracy": 0.7263383492827415, "num_tokens": 22290205.0, "step": 27698 }, { "epoch": 7.335805084745763, "grad_norm": 2.489187240600586, "learning_rate": 6.332229872881357e-06, "loss": 0.8536, "mean_token_accuracy": 0.7930762618780136, "num_tokens": 22291620.0, "step": 27700 }, { "epoch": 7.336334745762712, "grad_norm": 3.915931463241577, "learning_rate": 6.331965042372881e-06, "loss": 1.7101, "mean_token_accuracy": 0.6299413852393627, "num_tokens": 22293455.0, "step": 27702 }, { "epoch": 7.336864406779661, "grad_norm": 2.438397169113159, "learning_rate": 6.331700211864408e-06, "loss": 1.0503, "mean_token_accuracy": 0.740377277135849, "num_tokens": 22294851.0, "step": 27704 }, { "epoch": 7.33739406779661, "grad_norm": 2.6080899238586426, "learning_rate": 6.331435381355933e-06, "loss": 1.0835, "mean_token_accuracy": 0.7585303261876106, "num_tokens": 22296134.0, "step": 27706 }, { "epoch": 7.33792372881356, "grad_norm": 2.7095348834991455, "learning_rate": 6.331170550847459e-06, "loss": 1.4934, "mean_token_accuracy": 0.6745353937149048, "num_tokens": 22297567.0, "step": 27708 }, { "epoch": 7.338453389830509, "grad_norm": 2.4601869583129883, "learning_rate": 6.330905720338984e-06, "loss": 1.0176, "mean_token_accuracy": 0.7761547639966011, "num_tokens": 22298925.0, "step": 27710 }, { "epoch": 7.338983050847458, "grad_norm": 2.33866286277771, "learning_rate": 6.3306408898305094e-06, "loss": 1.1344, "mean_token_accuracy": 0.7433035001158714, "num_tokens": 22300795.0, "step": 27712 }, { "epoch": 7.339512711864407, "grad_norm": 1.956795334815979, "learning_rate": 6.330376059322034e-06, "loss": 0.7102, "mean_token_accuracy": 0.8135772421956062, "num_tokens": 22302639.0, "step": 27714 }, { "epoch": 7.3400423728813555, "grad_norm": 2.425746440887451, "learning_rate": 6.33011122881356e-06, "loss": 1.5434, "mean_token_accuracy": 0.6472152769565582, "num_tokens": 22304332.0, "step": 27716 }, { "epoch": 7.340572033898305, "grad_norm": 2.6695621013641357, "learning_rate": 6.329846398305085e-06, "loss": 1.2118, "mean_token_accuracy": 0.7158867493271828, "num_tokens": 22305888.0, "step": 27718 }, { "epoch": 7.341101694915254, "grad_norm": 2.3464467525482178, "learning_rate": 6.329581567796611e-06, "loss": 1.235, "mean_token_accuracy": 0.7098114863038063, "num_tokens": 22307548.0, "step": 27720 }, { "epoch": 7.341631355932203, "grad_norm": 3.566197156906128, "learning_rate": 6.329316737288136e-06, "loss": 1.2821, "mean_token_accuracy": 0.7098830044269562, "num_tokens": 22308851.0, "step": 27722 }, { "epoch": 7.342161016949152, "grad_norm": 2.3502330780029297, "learning_rate": 6.3290519067796616e-06, "loss": 0.9908, "mean_token_accuracy": 0.7540590688586235, "num_tokens": 22310526.0, "step": 27724 }, { "epoch": 7.342690677966102, "grad_norm": 2.7093088626861572, "learning_rate": 6.3287870762711865e-06, "loss": 0.9756, "mean_token_accuracy": 0.746274009346962, "num_tokens": 22312304.0, "step": 27726 }, { "epoch": 7.343220338983051, "grad_norm": 2.924138307571411, "learning_rate": 6.328522245762712e-06, "loss": 1.2638, "mean_token_accuracy": 0.7203178629279137, "num_tokens": 22313895.0, "step": 27728 }, { "epoch": 7.34375, "grad_norm": 2.865079164505005, "learning_rate": 6.328257415254237e-06, "loss": 1.5066, "mean_token_accuracy": 0.6559976115822792, "num_tokens": 22315518.0, "step": 27730 }, { "epoch": 7.344279661016949, "grad_norm": 2.1366472244262695, "learning_rate": 6.327992584745764e-06, "loss": 1.245, "mean_token_accuracy": 0.6931669898331165, "num_tokens": 22317419.0, "step": 27732 }, { "epoch": 7.344809322033898, "grad_norm": 1.923306941986084, "learning_rate": 6.327727754237289e-06, "loss": 1.3213, "mean_token_accuracy": 0.719658762216568, "num_tokens": 22319679.0, "step": 27734 }, { "epoch": 7.345338983050848, "grad_norm": 2.6955299377441406, "learning_rate": 6.3274629237288145e-06, "loss": 1.4272, "mean_token_accuracy": 0.6941109150648117, "num_tokens": 22321294.0, "step": 27736 }, { "epoch": 7.345868644067797, "grad_norm": 2.240490674972534, "learning_rate": 6.3271980932203395e-06, "loss": 1.1173, "mean_token_accuracy": 0.7276050597429276, "num_tokens": 22322850.0, "step": 27738 }, { "epoch": 7.346398305084746, "grad_norm": 2.7346980571746826, "learning_rate": 6.326933262711865e-06, "loss": 1.1942, "mean_token_accuracy": 0.7307435721158981, "num_tokens": 22324366.0, "step": 27740 }, { "epoch": 7.346927966101695, "grad_norm": 2.3662610054016113, "learning_rate": 6.32666843220339e-06, "loss": 1.3764, "mean_token_accuracy": 0.7148122377693653, "num_tokens": 22326852.0, "step": 27742 }, { "epoch": 7.3474576271186445, "grad_norm": 2.4019532203674316, "learning_rate": 6.326403601694916e-06, "loss": 1.182, "mean_token_accuracy": 0.6947592943906784, "num_tokens": 22328329.0, "step": 27744 }, { "epoch": 7.347987288135593, "grad_norm": 2.5758213996887207, "learning_rate": 6.326138771186441e-06, "loss": 1.1396, "mean_token_accuracy": 0.7343032136559486, "num_tokens": 22330007.0, "step": 27746 }, { "epoch": 7.348516949152542, "grad_norm": 2.662701368331909, "learning_rate": 6.325873940677967e-06, "loss": 1.3842, "mean_token_accuracy": 0.6861434802412987, "num_tokens": 22331722.0, "step": 27748 }, { "epoch": 7.349046610169491, "grad_norm": 2.943389415740967, "learning_rate": 6.3256091101694916e-06, "loss": 1.4617, "step": 27750 }, { "epoch": 7.349046610169491, "eval_loss": 1.3390628099441528, "eval_mean_token_accuracy": 0.6984659980063315, "eval_num_tokens": 22333219.0, "eval_runtime": 48.7468, "eval_samples_per_second": 6.318, "eval_steps_per_second": 6.318, "step": 27750 }, { "epoch": 7.34957627118644, "grad_norm": 3.113736152648926, "learning_rate": 6.325344279661017e-06, "loss": 1.283, "mean_token_accuracy": 0.6834988072514534, "num_tokens": 22334785.0, "step": 27752 }, { "epoch": 7.35010593220339, "grad_norm": 2.655116319656372, "learning_rate": 6.325079449152542e-06, "loss": 1.1963, "mean_token_accuracy": 0.720709428191185, "num_tokens": 22336347.0, "step": 27754 }, { "epoch": 7.350635593220339, "grad_norm": 2.549783706665039, "learning_rate": 6.324814618644068e-06, "loss": 1.4598, "mean_token_accuracy": 0.672037560492754, "num_tokens": 22338011.0, "step": 27756 }, { "epoch": 7.351165254237288, "grad_norm": 2.4671809673309326, "learning_rate": 6.324549788135593e-06, "loss": 1.1449, "mean_token_accuracy": 0.7234860509634018, "num_tokens": 22339581.0, "step": 27758 }, { "epoch": 7.351694915254237, "grad_norm": 2.6986184120178223, "learning_rate": 6.32428495762712e-06, "loss": 1.4937, "mean_token_accuracy": 0.6961285099387169, "num_tokens": 22341024.0, "step": 27760 }, { "epoch": 7.352224576271187, "grad_norm": 2.407125473022461, "learning_rate": 6.324020127118644e-06, "loss": 1.3532, "mean_token_accuracy": 0.6749768108129501, "num_tokens": 22342986.0, "step": 27762 }, { "epoch": 7.352754237288136, "grad_norm": 2.6673548221588135, "learning_rate": 6.32375529661017e-06, "loss": 1.4045, "mean_token_accuracy": 0.7074059322476387, "num_tokens": 22344528.0, "step": 27764 }, { "epoch": 7.353283898305085, "grad_norm": 2.4840445518493652, "learning_rate": 6.323490466101695e-06, "loss": 1.2, "mean_token_accuracy": 0.7155241742730141, "num_tokens": 22345944.0, "step": 27766 }, { "epoch": 7.353813559322034, "grad_norm": 2.6026573181152344, "learning_rate": 6.323225635593221e-06, "loss": 1.3886, "mean_token_accuracy": 0.7018070816993713, "num_tokens": 22347639.0, "step": 27768 }, { "epoch": 7.354343220338983, "grad_norm": 2.745767831802368, "learning_rate": 6.322960805084746e-06, "loss": 1.1332, "mean_token_accuracy": 0.7253957390785217, "num_tokens": 22349046.0, "step": 27770 }, { "epoch": 7.3548728813559325, "grad_norm": 2.4910330772399902, "learning_rate": 6.322695974576272e-06, "loss": 0.9329, "mean_token_accuracy": 0.753692701458931, "num_tokens": 22350684.0, "step": 27772 }, { "epoch": 7.3554025423728815, "grad_norm": 3.076676368713379, "learning_rate": 6.3224311440677975e-06, "loss": 1.6486, "mean_token_accuracy": 0.654259130358696, "num_tokens": 22352034.0, "step": 27774 }, { "epoch": 7.3559322033898304, "grad_norm": 2.715444564819336, "learning_rate": 6.322166313559322e-06, "loss": 1.1715, "mean_token_accuracy": 0.7404468506574631, "num_tokens": 22353672.0, "step": 27776 }, { "epoch": 7.356461864406779, "grad_norm": 3.1617562770843506, "learning_rate": 6.321901483050848e-06, "loss": 0.8792, "mean_token_accuracy": 0.7809945791959763, "num_tokens": 22355109.0, "step": 27778 }, { "epoch": 7.356991525423728, "grad_norm": 2.654155969619751, "learning_rate": 6.321636652542373e-06, "loss": 1.1382, "mean_token_accuracy": 0.7478366792201996, "num_tokens": 22356501.0, "step": 27780 }, { "epoch": 7.357521186440678, "grad_norm": 2.4675211906433105, "learning_rate": 6.321371822033899e-06, "loss": 1.3311, "mean_token_accuracy": 0.7093342766165733, "num_tokens": 22358166.0, "step": 27782 }, { "epoch": 7.358050847457627, "grad_norm": 3.3284761905670166, "learning_rate": 6.321106991525424e-06, "loss": 1.2637, "mean_token_accuracy": 0.724668487906456, "num_tokens": 22359543.0, "step": 27784 }, { "epoch": 7.358580508474576, "grad_norm": 2.355168104171753, "learning_rate": 6.3208421610169505e-06, "loss": 0.8801, "mean_token_accuracy": 0.777142658829689, "num_tokens": 22361251.0, "step": 27786 }, { "epoch": 7.359110169491525, "grad_norm": 2.4408936500549316, "learning_rate": 6.320577330508475e-06, "loss": 1.0835, "mean_token_accuracy": 0.73551344871521, "num_tokens": 22363007.0, "step": 27788 }, { "epoch": 7.359639830508475, "grad_norm": 2.3468210697174072, "learning_rate": 6.320312500000001e-06, "loss": 1.0447, "mean_token_accuracy": 0.7500650882720947, "num_tokens": 22364668.0, "step": 27790 }, { "epoch": 7.360169491525424, "grad_norm": 3.281229019165039, "learning_rate": 6.320047669491526e-06, "loss": 0.8679, "mean_token_accuracy": 0.7966472283005714, "num_tokens": 22366017.0, "step": 27792 }, { "epoch": 7.360699152542373, "grad_norm": 2.261659622192383, "learning_rate": 6.319782838983052e-06, "loss": 1.3064, "mean_token_accuracy": 0.7251667827367783, "num_tokens": 22367420.0, "step": 27794 }, { "epoch": 7.361228813559322, "grad_norm": 2.3242056369781494, "learning_rate": 6.319518008474577e-06, "loss": 1.1284, "mean_token_accuracy": 0.7124462425708771, "num_tokens": 22369470.0, "step": 27796 }, { "epoch": 7.361758474576272, "grad_norm": 2.4589483737945557, "learning_rate": 6.3192531779661026e-06, "loss": 0.9839, "mean_token_accuracy": 0.7385262250900269, "num_tokens": 22371265.0, "step": 27798 }, { "epoch": 7.362288135593221, "grad_norm": 2.434826374053955, "learning_rate": 6.3189883474576275e-06, "loss": 1.3604, "mean_token_accuracy": 0.6696173399686813, "num_tokens": 22372988.0, "step": 27800 }, { "epoch": 7.3628177966101696, "grad_norm": 2.882612466812134, "learning_rate": 6.318723516949153e-06, "loss": 1.5361, "mean_token_accuracy": 0.6677693352103233, "num_tokens": 22374691.0, "step": 27802 }, { "epoch": 7.3633474576271185, "grad_norm": 3.309734344482422, "learning_rate": 6.318458686440678e-06, "loss": 1.057, "mean_token_accuracy": 0.7372601106762886, "num_tokens": 22376056.0, "step": 27804 }, { "epoch": 7.3638771186440675, "grad_norm": 2.9204351902008057, "learning_rate": 6.318193855932204e-06, "loss": 1.181, "mean_token_accuracy": 0.7183077335357666, "num_tokens": 22377603.0, "step": 27806 }, { "epoch": 7.364406779661017, "grad_norm": 3.246514081954956, "learning_rate": 6.317929025423729e-06, "loss": 1.0756, "mean_token_accuracy": 0.7397569715976715, "num_tokens": 22379181.0, "step": 27808 }, { "epoch": 7.364936440677966, "grad_norm": 3.2717702388763428, "learning_rate": 6.317664194915255e-06, "loss": 1.3079, "mean_token_accuracy": 0.7069755271077156, "num_tokens": 22380592.0, "step": 27810 }, { "epoch": 7.365466101694915, "grad_norm": 2.612309455871582, "learning_rate": 6.31739936440678e-06, "loss": 1.0618, "mean_token_accuracy": 0.7182447016239166, "num_tokens": 22382216.0, "step": 27812 }, { "epoch": 7.365995762711864, "grad_norm": 2.336597204208374, "learning_rate": 6.317134533898306e-06, "loss": 1.2904, "mean_token_accuracy": 0.6722175776958466, "num_tokens": 22384412.0, "step": 27814 }, { "epoch": 7.366525423728813, "grad_norm": 3.0543770790100098, "learning_rate": 6.31686970338983e-06, "loss": 1.3777, "mean_token_accuracy": 0.6983271986246109, "num_tokens": 22385978.0, "step": 27816 }, { "epoch": 7.367055084745763, "grad_norm": 2.9149155616760254, "learning_rate": 6.316604872881357e-06, "loss": 1.0307, "mean_token_accuracy": 0.7463594377040863, "num_tokens": 22387415.0, "step": 27818 }, { "epoch": 7.367584745762712, "grad_norm": 2.6946377754211426, "learning_rate": 6.316340042372882e-06, "loss": 1.3944, "mean_token_accuracy": 0.690433070063591, "num_tokens": 22389166.0, "step": 27820 }, { "epoch": 7.368114406779661, "grad_norm": 3.0502805709838867, "learning_rate": 6.316075211864408e-06, "loss": 1.3285, "mean_token_accuracy": 0.6968480870127678, "num_tokens": 22390599.0, "step": 27822 }, { "epoch": 7.36864406779661, "grad_norm": 2.283914089202881, "learning_rate": 6.315810381355933e-06, "loss": 0.7715, "mean_token_accuracy": 0.7916594743728638, "num_tokens": 22392307.0, "step": 27824 }, { "epoch": 7.36917372881356, "grad_norm": 2.6678342819213867, "learning_rate": 6.315545550847458e-06, "loss": 1.094, "mean_token_accuracy": 0.7419043108820915, "num_tokens": 22394074.0, "step": 27826 }, { "epoch": 7.369703389830509, "grad_norm": 2.3794260025024414, "learning_rate": 6.315280720338983e-06, "loss": 1.0276, "mean_token_accuracy": 0.7275078743696213, "num_tokens": 22395879.0, "step": 27828 }, { "epoch": 7.370233050847458, "grad_norm": 2.3943305015563965, "learning_rate": 6.315015889830509e-06, "loss": 1.2524, "mean_token_accuracy": 0.7080614045262337, "num_tokens": 22397537.0, "step": 27830 }, { "epoch": 7.370762711864407, "grad_norm": 3.0402352809906006, "learning_rate": 6.314751059322034e-06, "loss": 0.7931, "mean_token_accuracy": 0.8072178065776825, "num_tokens": 22399202.0, "step": 27832 }, { "epoch": 7.3712923728813555, "grad_norm": 3.283968210220337, "learning_rate": 6.31448622881356e-06, "loss": 1.4304, "mean_token_accuracy": 0.6693228334188461, "num_tokens": 22400920.0, "step": 27834 }, { "epoch": 7.371822033898305, "grad_norm": 2.5960001945495605, "learning_rate": 6.314221398305085e-06, "loss": 1.3318, "mean_token_accuracy": 0.6828830614686012, "num_tokens": 22402583.0, "step": 27836 }, { "epoch": 7.372351694915254, "grad_norm": 2.374983549118042, "learning_rate": 6.3139565677966105e-06, "loss": 1.0414, "mean_token_accuracy": 0.7822369635105133, "num_tokens": 22404116.0, "step": 27838 }, { "epoch": 7.372881355932203, "grad_norm": 2.7566471099853516, "learning_rate": 6.313691737288135e-06, "loss": 1.5449, "mean_token_accuracy": 0.6433251202106476, "num_tokens": 22405857.0, "step": 27840 }, { "epoch": 7.373411016949152, "grad_norm": 2.820439338684082, "learning_rate": 6.313426906779662e-06, "loss": 0.8697, "mean_token_accuracy": 0.774383045732975, "num_tokens": 22407331.0, "step": 27842 }, { "epoch": 7.373940677966102, "grad_norm": 2.548133611679077, "learning_rate": 6.313162076271186e-06, "loss": 1.1572, "mean_token_accuracy": 0.7347201928496361, "num_tokens": 22408699.0, "step": 27844 }, { "epoch": 7.374470338983051, "grad_norm": 2.486037492752075, "learning_rate": 6.312897245762713e-06, "loss": 1.1181, "mean_token_accuracy": 0.7509767636656761, "num_tokens": 22410248.0, "step": 27846 }, { "epoch": 7.375, "grad_norm": 2.365631103515625, "learning_rate": 6.312632415254238e-06, "loss": 1.0611, "mean_token_accuracy": 0.7543940767645836, "num_tokens": 22411932.0, "step": 27848 }, { "epoch": 7.375529661016949, "grad_norm": 2.598562717437744, "learning_rate": 6.3123675847457634e-06, "loss": 1.0307, "mean_token_accuracy": 0.751666434109211, "num_tokens": 22413645.0, "step": 27850 }, { "epoch": 7.376059322033898, "grad_norm": 3.3704423904418945, "learning_rate": 6.312102754237288e-06, "loss": 1.8359, "mean_token_accuracy": 0.6222923472523689, "num_tokens": 22415149.0, "step": 27852 }, { "epoch": 7.376588983050848, "grad_norm": 2.7104873657226562, "learning_rate": 6.311837923728814e-06, "loss": 0.8596, "mean_token_accuracy": 0.7816247418522835, "num_tokens": 22416492.0, "step": 27854 }, { "epoch": 7.377118644067797, "grad_norm": 2.8548851013183594, "learning_rate": 6.311573093220339e-06, "loss": 1.5407, "mean_token_accuracy": 0.6647722199559212, "num_tokens": 22417980.0, "step": 27856 }, { "epoch": 7.377648305084746, "grad_norm": 3.184384346008301, "learning_rate": 6.311308262711865e-06, "loss": 1.6001, "mean_token_accuracy": 0.6800318285822868, "num_tokens": 22419620.0, "step": 27858 }, { "epoch": 7.378177966101695, "grad_norm": 2.9082868099212646, "learning_rate": 6.311043432203391e-06, "loss": 1.4641, "mean_token_accuracy": 0.6838476210832596, "num_tokens": 22421502.0, "step": 27860 }, { "epoch": 7.3787076271186445, "grad_norm": 3.119364023208618, "learning_rate": 6.3107786016949156e-06, "loss": 0.985, "mean_token_accuracy": 0.7639620155096054, "num_tokens": 22423103.0, "step": 27862 }, { "epoch": 7.379237288135593, "grad_norm": 2.2744009494781494, "learning_rate": 6.310513771186441e-06, "loss": 1.1189, "mean_token_accuracy": 0.7580434456467628, "num_tokens": 22424623.0, "step": 27864 }, { "epoch": 7.379766949152542, "grad_norm": 2.6837100982666016, "learning_rate": 6.310248940677966e-06, "loss": 1.0962, "mean_token_accuracy": 0.7507282048463821, "num_tokens": 22426275.0, "step": 27866 }, { "epoch": 7.380296610169491, "grad_norm": 2.604717969894409, "learning_rate": 6.309984110169493e-06, "loss": 0.8803, "mean_token_accuracy": 0.7880947440862656, "num_tokens": 22427936.0, "step": 27868 }, { "epoch": 7.38082627118644, "grad_norm": 2.796738386154175, "learning_rate": 6.309719279661017e-06, "loss": 1.1357, "mean_token_accuracy": 0.734879195690155, "num_tokens": 22429457.0, "step": 27870 }, { "epoch": 7.38135593220339, "grad_norm": 2.8117787837982178, "learning_rate": 6.309454449152544e-06, "loss": 1.3729, "mean_token_accuracy": 0.6997414529323578, "num_tokens": 22430844.0, "step": 27872 }, { "epoch": 7.381885593220339, "grad_norm": 3.326085090637207, "learning_rate": 6.3091896186440685e-06, "loss": 1.7577, "mean_token_accuracy": 0.648857019841671, "num_tokens": 22432413.0, "step": 27874 }, { "epoch": 7.382415254237288, "grad_norm": 3.2157444953918457, "learning_rate": 6.308924788135594e-06, "loss": 1.7077, "mean_token_accuracy": 0.642901636660099, "num_tokens": 22433878.0, "step": 27876 }, { "epoch": 7.382944915254237, "grad_norm": 2.8477840423583984, "learning_rate": 6.308659957627119e-06, "loss": 1.5977, "mean_token_accuracy": 0.6802913695573807, "num_tokens": 22435552.0, "step": 27878 }, { "epoch": 7.383474576271187, "grad_norm": 2.5744123458862305, "learning_rate": 6.308395127118645e-06, "loss": 0.8071, "mean_token_accuracy": 0.8007592037320137, "num_tokens": 22437147.0, "step": 27880 }, { "epoch": 7.384004237288136, "grad_norm": 2.990858793258667, "learning_rate": 6.30813029661017e-06, "loss": 1.3499, "mean_token_accuracy": 0.6918786391615868, "num_tokens": 22438744.0, "step": 27882 }, { "epoch": 7.384533898305085, "grad_norm": 2.958329677581787, "learning_rate": 6.307865466101696e-06, "loss": 1.5075, "mean_token_accuracy": 0.6973594725131989, "num_tokens": 22440313.0, "step": 27884 }, { "epoch": 7.385063559322034, "grad_norm": 4.816290378570557, "learning_rate": 6.307600635593221e-06, "loss": 1.1704, "mean_token_accuracy": 0.7639663517475128, "num_tokens": 22442536.0, "step": 27886 }, { "epoch": 7.385593220338983, "grad_norm": 2.0680112838745117, "learning_rate": 6.307335805084746e-06, "loss": 1.2221, "mean_token_accuracy": 0.7300314977765083, "num_tokens": 22444385.0, "step": 27888 }, { "epoch": 7.3861228813559325, "grad_norm": 2.640942335128784, "learning_rate": 6.307070974576271e-06, "loss": 1.4838, "mean_token_accuracy": 0.667701244354248, "num_tokens": 22446323.0, "step": 27890 }, { "epoch": 7.3866525423728815, "grad_norm": 2.254216194152832, "learning_rate": 6.306806144067797e-06, "loss": 1.0276, "mean_token_accuracy": 0.759814977645874, "num_tokens": 22448064.0, "step": 27892 }, { "epoch": 7.3871822033898304, "grad_norm": 2.4453060626983643, "learning_rate": 6.306541313559322e-06, "loss": 1.4513, "mean_token_accuracy": 0.6835054010152817, "num_tokens": 22449823.0, "step": 27894 }, { "epoch": 7.387711864406779, "grad_norm": 2.5032286643981934, "learning_rate": 6.306276483050849e-06, "loss": 1.0735, "mean_token_accuracy": 0.751397468149662, "num_tokens": 22451383.0, "step": 27896 }, { "epoch": 7.388241525423728, "grad_norm": 2.673795223236084, "learning_rate": 6.306011652542373e-06, "loss": 1.0854, "mean_token_accuracy": 0.7382313683629036, "num_tokens": 22453058.0, "step": 27898 }, { "epoch": 7.388771186440678, "grad_norm": 3.0175561904907227, "learning_rate": 6.305746822033899e-06, "loss": 1.6438, "mean_token_accuracy": 0.6627509295940399, "num_tokens": 22454957.0, "step": 27900 }, { "epoch": 7.389300847457627, "grad_norm": 2.77128005027771, "learning_rate": 6.305481991525424e-06, "loss": 0.8609, "mean_token_accuracy": 0.7861144691705704, "num_tokens": 22456366.0, "step": 27902 }, { "epoch": 7.389830508474576, "grad_norm": 2.1194982528686523, "learning_rate": 6.30521716101695e-06, "loss": 0.9346, "mean_token_accuracy": 0.7817027717828751, "num_tokens": 22458305.0, "step": 27904 }, { "epoch": 7.390360169491525, "grad_norm": 2.7394919395446777, "learning_rate": 6.304952330508475e-06, "loss": 1.2016, "mean_token_accuracy": 0.7290158495306969, "num_tokens": 22459884.0, "step": 27906 }, { "epoch": 7.390889830508475, "grad_norm": 2.4001874923706055, "learning_rate": 6.304687500000001e-06, "loss": 1.6087, "mean_token_accuracy": 0.6295105963945389, "num_tokens": 22461536.0, "step": 27908 }, { "epoch": 7.391419491525424, "grad_norm": 2.1408960819244385, "learning_rate": 6.304422669491526e-06, "loss": 0.7376, "mean_token_accuracy": 0.8170875087380409, "num_tokens": 22463278.0, "step": 27910 }, { "epoch": 7.391949152542373, "grad_norm": 2.646928548812866, "learning_rate": 6.3041578389830515e-06, "loss": 1.548, "mean_token_accuracy": 0.6674401015043259, "num_tokens": 22464906.0, "step": 27912 }, { "epoch": 7.392478813559322, "grad_norm": 2.479379177093506, "learning_rate": 6.303893008474576e-06, "loss": 1.1578, "mean_token_accuracy": 0.7464823201298714, "num_tokens": 22466644.0, "step": 27914 }, { "epoch": 7.393008474576272, "grad_norm": 2.633223295211792, "learning_rate": 6.303628177966102e-06, "loss": 1.0874, "mean_token_accuracy": 0.7470950037240982, "num_tokens": 22468064.0, "step": 27916 }, { "epoch": 7.393538135593221, "grad_norm": 2.598916530609131, "learning_rate": 6.303363347457627e-06, "loss": 1.5093, "mean_token_accuracy": 0.6795461475849152, "num_tokens": 22469704.0, "step": 27918 }, { "epoch": 7.3940677966101696, "grad_norm": 3.363640546798706, "learning_rate": 6.303098516949153e-06, "loss": 1.7231, "mean_token_accuracy": 0.6562559828162193, "num_tokens": 22471128.0, "step": 27920 }, { "epoch": 7.3945974576271185, "grad_norm": 2.5661675930023193, "learning_rate": 6.302833686440678e-06, "loss": 0.8997, "mean_token_accuracy": 0.7703178077936172, "num_tokens": 22472586.0, "step": 27922 }, { "epoch": 7.3951271186440675, "grad_norm": 2.616926670074463, "learning_rate": 6.302568855932204e-06, "loss": 1.2892, "mean_token_accuracy": 0.6958207413554192, "num_tokens": 22474186.0, "step": 27924 }, { "epoch": 7.395656779661017, "grad_norm": 3.3062636852264404, "learning_rate": 6.3023040254237285e-06, "loss": 0.7944, "mean_token_accuracy": 0.7864220440387726, "num_tokens": 22475819.0, "step": 27926 }, { "epoch": 7.396186440677966, "grad_norm": 3.7178919315338135, "learning_rate": 6.302039194915255e-06, "loss": 1.4392, "mean_token_accuracy": 0.6777041181921959, "num_tokens": 22477734.0, "step": 27928 }, { "epoch": 7.396716101694915, "grad_norm": 2.5059704780578613, "learning_rate": 6.30177436440678e-06, "loss": 1.1048, "mean_token_accuracy": 0.7367812544107437, "num_tokens": 22479262.0, "step": 27930 }, { "epoch": 7.397245762711864, "grad_norm": 6.5454020500183105, "learning_rate": 6.301509533898306e-06, "loss": 1.419, "mean_token_accuracy": 0.6596273705363274, "num_tokens": 22480929.0, "step": 27932 }, { "epoch": 7.397775423728813, "grad_norm": 2.79351806640625, "learning_rate": 6.301244703389831e-06, "loss": 1.5441, "mean_token_accuracy": 0.6436311230063438, "num_tokens": 22482707.0, "step": 27934 }, { "epoch": 7.398305084745763, "grad_norm": 2.611457347869873, "learning_rate": 6.3009798728813566e-06, "loss": 1.1001, "mean_token_accuracy": 0.7236946150660515, "num_tokens": 22484300.0, "step": 27936 }, { "epoch": 7.398834745762712, "grad_norm": 2.2184152603149414, "learning_rate": 6.3007150423728815e-06, "loss": 1.2643, "mean_token_accuracy": 0.6980486139655113, "num_tokens": 22485957.0, "step": 27938 }, { "epoch": 7.399364406779661, "grad_norm": 2.7429258823394775, "learning_rate": 6.300450211864407e-06, "loss": 1.4862, "mean_token_accuracy": 0.6690027713775635, "num_tokens": 22487667.0, "step": 27940 }, { "epoch": 7.39989406779661, "grad_norm": 2.1287057399749756, "learning_rate": 6.300185381355933e-06, "loss": 1.1063, "mean_token_accuracy": 0.7371661961078644, "num_tokens": 22488994.0, "step": 27942 }, { "epoch": 7.40042372881356, "grad_norm": 3.40160870552063, "learning_rate": 6.299920550847458e-06, "loss": 1.2832, "mean_token_accuracy": 0.6948209628462791, "num_tokens": 22490565.0, "step": 27944 }, { "epoch": 7.400953389830509, "grad_norm": 2.3856589794158936, "learning_rate": 6.299655720338984e-06, "loss": 1.1515, "mean_token_accuracy": 0.7367017865180969, "num_tokens": 22491939.0, "step": 27946 }, { "epoch": 7.401483050847458, "grad_norm": 2.3559324741363525, "learning_rate": 6.299390889830509e-06, "loss": 0.5204, "mean_token_accuracy": 0.861418329179287, "num_tokens": 22493580.0, "step": 27948 }, { "epoch": 7.402012711864407, "grad_norm": 2.3736817836761475, "learning_rate": 6.299126059322035e-06, "loss": 1.1692, "mean_token_accuracy": 0.7129894345998764, "num_tokens": 22495378.0, "step": 27950 }, { "epoch": 7.4025423728813555, "grad_norm": 2.786188840866089, "learning_rate": 6.298861228813559e-06, "loss": 1.551, "mean_token_accuracy": 0.6766326203942299, "num_tokens": 22497106.0, "step": 27952 }, { "epoch": 7.403072033898305, "grad_norm": 3.595048666000366, "learning_rate": 6.298596398305086e-06, "loss": 1.1857, "mean_token_accuracy": 0.7571794390678406, "num_tokens": 22498456.0, "step": 27954 }, { "epoch": 7.403601694915254, "grad_norm": 2.5283281803131104, "learning_rate": 6.298331567796611e-06, "loss": 1.3176, "mean_token_accuracy": 0.6757283136248589, "num_tokens": 22500076.0, "step": 27956 }, { "epoch": 7.404131355932203, "grad_norm": 2.3197503089904785, "learning_rate": 6.298066737288137e-06, "loss": 1.3824, "mean_token_accuracy": 0.698245957493782, "num_tokens": 22501615.0, "step": 27958 }, { "epoch": 7.404661016949152, "grad_norm": 2.6058387756347656, "learning_rate": 6.297801906779662e-06, "loss": 0.9977, "mean_token_accuracy": 0.7543464377522469, "num_tokens": 22502941.0, "step": 27960 }, { "epoch": 7.405190677966102, "grad_norm": 2.746279239654541, "learning_rate": 6.297537076271187e-06, "loss": 1.1927, "mean_token_accuracy": 0.7155818194150925, "num_tokens": 22504555.0, "step": 27962 }, { "epoch": 7.405720338983051, "grad_norm": 2.1472158432006836, "learning_rate": 6.297272245762712e-06, "loss": 0.7579, "mean_token_accuracy": 0.7949414178729057, "num_tokens": 22506907.0, "step": 27964 }, { "epoch": 7.40625, "grad_norm": 2.6004767417907715, "learning_rate": 6.297007415254238e-06, "loss": 1.2061, "mean_token_accuracy": 0.7445061877369881, "num_tokens": 22508281.0, "step": 27966 }, { "epoch": 7.406779661016949, "grad_norm": 2.6212711334228516, "learning_rate": 6.296742584745763e-06, "loss": 1.1195, "mean_token_accuracy": 0.7697264775633812, "num_tokens": 22510180.0, "step": 27968 }, { "epoch": 7.407309322033898, "grad_norm": 2.709413766860962, "learning_rate": 6.296477754237289e-06, "loss": 1.0257, "mean_token_accuracy": 0.76775112003088, "num_tokens": 22511691.0, "step": 27970 }, { "epoch": 7.407838983050848, "grad_norm": 2.3560686111450195, "learning_rate": 6.296212923728814e-06, "loss": 1.7817, "mean_token_accuracy": 0.6118699684739113, "num_tokens": 22513975.0, "step": 27972 }, { "epoch": 7.408368644067797, "grad_norm": 2.781404972076416, "learning_rate": 6.2959480932203395e-06, "loss": 1.1835, "mean_token_accuracy": 0.7314208000898361, "num_tokens": 22515626.0, "step": 27974 }, { "epoch": 7.408898305084746, "grad_norm": 2.0767080783843994, "learning_rate": 6.2956832627118645e-06, "loss": 1.0753, "mean_token_accuracy": 0.7379799336194992, "num_tokens": 22517317.0, "step": 27976 }, { "epoch": 7.409427966101695, "grad_norm": 2.5925400257110596, "learning_rate": 6.29541843220339e-06, "loss": 1.1928, "mean_token_accuracy": 0.7433243095874786, "num_tokens": 22518785.0, "step": 27978 }, { "epoch": 7.4099576271186445, "grad_norm": 2.1721556186676025, "learning_rate": 6.295153601694915e-06, "loss": 1.0781, "mean_token_accuracy": 0.729293018579483, "num_tokens": 22520528.0, "step": 27980 }, { "epoch": 7.410487288135593, "grad_norm": 2.880338668823242, "learning_rate": 6.294888771186442e-06, "loss": 1.1643, "mean_token_accuracy": 0.7131859362125397, "num_tokens": 22522236.0, "step": 27982 }, { "epoch": 7.411016949152542, "grad_norm": 2.9672279357910156, "learning_rate": 6.294623940677967e-06, "loss": 1.0825, "mean_token_accuracy": 0.7274327501654625, "num_tokens": 22523882.0, "step": 27984 }, { "epoch": 7.411546610169491, "grad_norm": 3.597933530807495, "learning_rate": 6.2943591101694925e-06, "loss": 1.2914, "mean_token_accuracy": 0.6886232495307922, "num_tokens": 22525384.0, "step": 27986 }, { "epoch": 7.41207627118644, "grad_norm": 2.332110643386841, "learning_rate": 6.2940942796610174e-06, "loss": 0.9886, "mean_token_accuracy": 0.7716721072793007, "num_tokens": 22526920.0, "step": 27988 }, { "epoch": 7.41260593220339, "grad_norm": 2.521341562271118, "learning_rate": 6.293829449152543e-06, "loss": 1.0677, "mean_token_accuracy": 0.7597881704568863, "num_tokens": 22528560.0, "step": 27990 }, { "epoch": 7.413135593220339, "grad_norm": 2.7637181282043457, "learning_rate": 6.293564618644068e-06, "loss": 1.0999, "mean_token_accuracy": 0.7410281673073769, "num_tokens": 22531293.0, "step": 27992 }, { "epoch": 7.413665254237288, "grad_norm": 2.5389187335968018, "learning_rate": 6.293299788135594e-06, "loss": 0.9839, "mean_token_accuracy": 0.7570179179310799, "num_tokens": 22532992.0, "step": 27994 }, { "epoch": 7.414194915254237, "grad_norm": 3.3080432415008545, "learning_rate": 6.293034957627119e-06, "loss": 1.1956, "mean_token_accuracy": 0.7536359056830406, "num_tokens": 22534389.0, "step": 27996 }, { "epoch": 7.414724576271187, "grad_norm": 2.546072244644165, "learning_rate": 6.292770127118645e-06, "loss": 1.1867, "mean_token_accuracy": 0.7197400294244289, "num_tokens": 22535996.0, "step": 27998 }, { "epoch": 7.415254237288136, "grad_norm": 2.872525453567505, "learning_rate": 6.2925052966101695e-06, "loss": 1.5876, "step": 28000 }, { "epoch": 7.415254237288136, "eval_loss": 1.3365979194641113, "eval_mean_token_accuracy": 0.6993645719893566, "eval_num_tokens": 22537661.0, "eval_runtime": 48.446, "eval_samples_per_second": 6.358, "eval_steps_per_second": 6.358, "step": 28000 }, { "epoch": 7.415783898305085, "grad_norm": 2.9375524520874023, "learning_rate": 6.292240466101695e-06, "loss": 0.9822, "mean_token_accuracy": 0.7004925012588501, "num_tokens": 22539033.0, "step": 28002 }, { "epoch": 7.416313559322034, "grad_norm": 2.6491177082061768, "learning_rate": 6.29197563559322e-06, "loss": 1.3308, "mean_token_accuracy": 0.7177750319242477, "num_tokens": 22540480.0, "step": 28004 }, { "epoch": 7.416843220338983, "grad_norm": 1.9599623680114746, "learning_rate": 6.291710805084746e-06, "loss": 0.7117, "mean_token_accuracy": 0.7943894937634468, "num_tokens": 22542317.0, "step": 28006 }, { "epoch": 7.4173728813559325, "grad_norm": 2.1863350868225098, "learning_rate": 6.291445974576271e-06, "loss": 0.9536, "mean_token_accuracy": 0.7745930477976799, "num_tokens": 22543820.0, "step": 28008 }, { "epoch": 7.4179025423728815, "grad_norm": 2.7830848693847656, "learning_rate": 6.291181144067798e-06, "loss": 1.1556, "mean_token_accuracy": 0.7110878452658653, "num_tokens": 22545388.0, "step": 28010 }, { "epoch": 7.4184322033898304, "grad_norm": 2.5708298683166504, "learning_rate": 6.290916313559322e-06, "loss": 1.3466, "mean_token_accuracy": 0.6762874349951744, "num_tokens": 22547008.0, "step": 28012 }, { "epoch": 7.418961864406779, "grad_norm": 2.4924051761627197, "learning_rate": 6.290651483050848e-06, "loss": 1.1807, "mean_token_accuracy": 0.7150436788797379, "num_tokens": 22548791.0, "step": 28014 }, { "epoch": 7.419491525423728, "grad_norm": 2.824875593185425, "learning_rate": 6.290386652542373e-06, "loss": 1.14, "mean_token_accuracy": 0.7343410700559616, "num_tokens": 22550076.0, "step": 28016 }, { "epoch": 7.420021186440678, "grad_norm": 2.7764806747436523, "learning_rate": 6.290121822033899e-06, "loss": 1.4232, "mean_token_accuracy": 0.6787010803818703, "num_tokens": 22551747.0, "step": 28018 }, { "epoch": 7.420550847457627, "grad_norm": 2.4446818828582764, "learning_rate": 6.289856991525424e-06, "loss": 0.834, "mean_token_accuracy": 0.7838174253702164, "num_tokens": 22553240.0, "step": 28020 }, { "epoch": 7.421080508474576, "grad_norm": 2.5033388137817383, "learning_rate": 6.28959216101695e-06, "loss": 0.9727, "mean_token_accuracy": 0.7543392330408096, "num_tokens": 22555581.0, "step": 28022 }, { "epoch": 7.421610169491525, "grad_norm": 3.19936203956604, "learning_rate": 6.289327330508475e-06, "loss": 1.4502, "mean_token_accuracy": 0.7105177789926529, "num_tokens": 22557487.0, "step": 28024 }, { "epoch": 7.422139830508475, "grad_norm": 2.467040538787842, "learning_rate": 6.2890625e-06, "loss": 0.9725, "mean_token_accuracy": 0.7611937820911407, "num_tokens": 22559180.0, "step": 28026 }, { "epoch": 7.422669491525424, "grad_norm": 2.756103754043579, "learning_rate": 6.288797669491526e-06, "loss": 1.6128, "mean_token_accuracy": 0.6346626207232475, "num_tokens": 22561042.0, "step": 28028 }, { "epoch": 7.423199152542373, "grad_norm": 2.117771625518799, "learning_rate": 6.288532838983051e-06, "loss": 0.8852, "mean_token_accuracy": 0.7733186557888985, "num_tokens": 22562564.0, "step": 28030 }, { "epoch": 7.423728813559322, "grad_norm": 3.1410796642303467, "learning_rate": 6.288268008474577e-06, "loss": 1.1405, "mean_token_accuracy": 0.7460992708802223, "num_tokens": 22563900.0, "step": 28032 }, { "epoch": 7.424258474576272, "grad_norm": 2.6091115474700928, "learning_rate": 6.288003177966102e-06, "loss": 0.8095, "mean_token_accuracy": 0.784801572561264, "num_tokens": 22565428.0, "step": 28034 }, { "epoch": 7.424788135593221, "grad_norm": 3.10400390625, "learning_rate": 6.2877383474576284e-06, "loss": 1.2139, "mean_token_accuracy": 0.7016113772988319, "num_tokens": 22566961.0, "step": 28036 }, { "epoch": 7.4253177966101696, "grad_norm": 2.0020546913146973, "learning_rate": 6.287473516949153e-06, "loss": 1.2257, "mean_token_accuracy": 0.6921442300081253, "num_tokens": 22568833.0, "step": 28038 }, { "epoch": 7.4258474576271185, "grad_norm": 2.2490172386169434, "learning_rate": 6.287208686440679e-06, "loss": 1.0807, "mean_token_accuracy": 0.7344628348946571, "num_tokens": 22570472.0, "step": 28040 }, { "epoch": 7.4263771186440675, "grad_norm": 2.859872341156006, "learning_rate": 6.286943855932204e-06, "loss": 1.2052, "mean_token_accuracy": 0.7184906750917435, "num_tokens": 22572339.0, "step": 28042 }, { "epoch": 7.426906779661017, "grad_norm": 2.1321165561676025, "learning_rate": 6.28667902542373e-06, "loss": 1.0815, "mean_token_accuracy": 0.7401029169559479, "num_tokens": 22574044.0, "step": 28044 }, { "epoch": 7.427436440677966, "grad_norm": 2.359079360961914, "learning_rate": 6.286414194915255e-06, "loss": 1.1733, "mean_token_accuracy": 0.7349618822336197, "num_tokens": 22575674.0, "step": 28046 }, { "epoch": 7.427966101694915, "grad_norm": 3.050804376602173, "learning_rate": 6.2861493644067806e-06, "loss": 1.665, "mean_token_accuracy": 0.6227579340338707, "num_tokens": 22577371.0, "step": 28048 }, { "epoch": 7.428495762711864, "grad_norm": 2.653468370437622, "learning_rate": 6.2858845338983055e-06, "loss": 1.0651, "mean_token_accuracy": 0.7708334028720856, "num_tokens": 22578792.0, "step": 28050 }, { "epoch": 7.429025423728813, "grad_norm": 2.2437868118286133, "learning_rate": 6.285619703389831e-06, "loss": 1.1619, "mean_token_accuracy": 0.7208708822727203, "num_tokens": 22580357.0, "step": 28052 }, { "epoch": 7.429555084745763, "grad_norm": 2.383730411529541, "learning_rate": 6.285354872881356e-06, "loss": 1.0251, "mean_token_accuracy": 0.7398514002561569, "num_tokens": 22581819.0, "step": 28054 }, { "epoch": 7.430084745762712, "grad_norm": 2.5216407775878906, "learning_rate": 6.285090042372882e-06, "loss": 1.5343, "mean_token_accuracy": 0.6468188464641571, "num_tokens": 22583620.0, "step": 28056 }, { "epoch": 7.430614406779661, "grad_norm": 2.458594799041748, "learning_rate": 6.284825211864407e-06, "loss": 0.9197, "mean_token_accuracy": 0.7766465842723846, "num_tokens": 22585090.0, "step": 28058 }, { "epoch": 7.43114406779661, "grad_norm": 2.61966609954834, "learning_rate": 6.284560381355933e-06, "loss": 1.2425, "mean_token_accuracy": 0.7229128256440163, "num_tokens": 22586744.0, "step": 28060 }, { "epoch": 7.43167372881356, "grad_norm": 2.146604299545288, "learning_rate": 6.284295550847458e-06, "loss": 0.7654, "mean_token_accuracy": 0.7892730608582497, "num_tokens": 22588157.0, "step": 28062 }, { "epoch": 7.432203389830509, "grad_norm": 2.579071521759033, "learning_rate": 6.284030720338984e-06, "loss": 0.9327, "mean_token_accuracy": 0.7716857939958572, "num_tokens": 22589655.0, "step": 28064 }, { "epoch": 7.432733050847458, "grad_norm": 2.4746665954589844, "learning_rate": 6.283765889830508e-06, "loss": 1.314, "mean_token_accuracy": 0.7014094963669777, "num_tokens": 22591257.0, "step": 28066 }, { "epoch": 7.433262711864407, "grad_norm": 2.4369561672210693, "learning_rate": 6.283501059322035e-06, "loss": 0.8601, "mean_token_accuracy": 0.7935044318437576, "num_tokens": 22593125.0, "step": 28068 }, { "epoch": 7.4337923728813555, "grad_norm": 2.7084920406341553, "learning_rate": 6.28323622881356e-06, "loss": 1.0262, "mean_token_accuracy": 0.7451158091425896, "num_tokens": 22594817.0, "step": 28070 }, { "epoch": 7.434322033898305, "grad_norm": 2.421372652053833, "learning_rate": 6.282971398305086e-06, "loss": 0.8255, "mean_token_accuracy": 0.8081689551472664, "num_tokens": 22596418.0, "step": 28072 }, { "epoch": 7.434851694915254, "grad_norm": 2.6325387954711914, "learning_rate": 6.2827065677966106e-06, "loss": 1.2887, "mean_token_accuracy": 0.719113752245903, "num_tokens": 22597894.0, "step": 28074 }, { "epoch": 7.435381355932203, "grad_norm": 2.7970101833343506, "learning_rate": 6.282441737288136e-06, "loss": 0.7766, "mean_token_accuracy": 0.792475625872612, "num_tokens": 22599108.0, "step": 28076 }, { "epoch": 7.435911016949152, "grad_norm": 2.8636510372161865, "learning_rate": 6.282176906779661e-06, "loss": 1.5137, "mean_token_accuracy": 0.6665945500135422, "num_tokens": 22600789.0, "step": 28078 }, { "epoch": 7.436440677966102, "grad_norm": 2.4864659309387207, "learning_rate": 6.281912076271187e-06, "loss": 0.9287, "mean_token_accuracy": 0.7555140405893326, "num_tokens": 22602391.0, "step": 28080 }, { "epoch": 7.436970338983051, "grad_norm": 2.121917963027954, "learning_rate": 6.281647245762712e-06, "loss": 1.0228, "mean_token_accuracy": 0.7595337703824043, "num_tokens": 22604180.0, "step": 28082 }, { "epoch": 7.4375, "grad_norm": 3.1787490844726562, "learning_rate": 6.281382415254238e-06, "loss": 1.4836, "mean_token_accuracy": 0.6493448838591576, "num_tokens": 22605724.0, "step": 28084 }, { "epoch": 7.438029661016949, "grad_norm": 2.5494675636291504, "learning_rate": 6.281117584745763e-06, "loss": 0.8914, "mean_token_accuracy": 0.7967565134167671, "num_tokens": 22607187.0, "step": 28086 }, { "epoch": 7.438559322033898, "grad_norm": 2.891420841217041, "learning_rate": 6.2808527542372885e-06, "loss": 0.9381, "mean_token_accuracy": 0.7657515108585358, "num_tokens": 22608686.0, "step": 28088 }, { "epoch": 7.439088983050848, "grad_norm": 2.5600745677948, "learning_rate": 6.280587923728813e-06, "loss": 1.1577, "mean_token_accuracy": 0.7255166321992874, "num_tokens": 22610298.0, "step": 28090 }, { "epoch": 7.439618644067797, "grad_norm": 2.4433436393737793, "learning_rate": 6.28032309322034e-06, "loss": 0.9111, "mean_token_accuracy": 0.7500429265201092, "num_tokens": 22611949.0, "step": 28092 }, { "epoch": 7.440148305084746, "grad_norm": 2.3932416439056396, "learning_rate": 6.280058262711864e-06, "loss": 1.3211, "mean_token_accuracy": 0.6915291622281075, "num_tokens": 22613699.0, "step": 28094 }, { "epoch": 7.440677966101695, "grad_norm": 2.9408891201019287, "learning_rate": 6.279793432203391e-06, "loss": 1.4348, "mean_token_accuracy": 0.6796543896198273, "num_tokens": 22615311.0, "step": 28096 }, { "epoch": 7.4412076271186445, "grad_norm": 2.3746399879455566, "learning_rate": 6.279528601694916e-06, "loss": 1.3425, "mean_token_accuracy": 0.6951533257961273, "num_tokens": 22617121.0, "step": 28098 }, { "epoch": 7.441737288135593, "grad_norm": 2.777179002761841, "learning_rate": 6.279263771186441e-06, "loss": 1.3344, "mean_token_accuracy": 0.6812326312065125, "num_tokens": 22618850.0, "step": 28100 }, { "epoch": 7.442266949152542, "grad_norm": 2.912889242172241, "learning_rate": 6.278998940677966e-06, "loss": 1.5452, "mean_token_accuracy": 0.6504956409335136, "num_tokens": 22620553.0, "step": 28102 }, { "epoch": 7.442796610169491, "grad_norm": 2.646230459213257, "learning_rate": 6.278734110169492e-06, "loss": 1.0995, "mean_token_accuracy": 0.7193618938326836, "num_tokens": 22622423.0, "step": 28104 }, { "epoch": 7.44332627118644, "grad_norm": 2.4472551345825195, "learning_rate": 6.278469279661017e-06, "loss": 1.0184, "mean_token_accuracy": 0.7494736313819885, "num_tokens": 22624048.0, "step": 28106 }, { "epoch": 7.44385593220339, "grad_norm": 2.714277982711792, "learning_rate": 6.278204449152543e-06, "loss": 1.0706, "mean_token_accuracy": 0.7338246032595634, "num_tokens": 22625582.0, "step": 28108 }, { "epoch": 7.444385593220339, "grad_norm": 3.091137647628784, "learning_rate": 6.277939618644068e-06, "loss": 1.3372, "mean_token_accuracy": 0.6817753985524178, "num_tokens": 22627109.0, "step": 28110 }, { "epoch": 7.444915254237288, "grad_norm": 2.56290340423584, "learning_rate": 6.2776747881355935e-06, "loss": 1.3955, "mean_token_accuracy": 0.6906018927693367, "num_tokens": 22628924.0, "step": 28112 }, { "epoch": 7.445444915254237, "grad_norm": 3.330996513366699, "learning_rate": 6.277409957627119e-06, "loss": 1.0814, "mean_token_accuracy": 0.7708374112844467, "num_tokens": 22630165.0, "step": 28114 }, { "epoch": 7.445974576271187, "grad_norm": 4.82291316986084, "learning_rate": 6.277145127118644e-06, "loss": 1.2401, "mean_token_accuracy": 0.7224748432636261, "num_tokens": 22631678.0, "step": 28116 }, { "epoch": 7.446504237288136, "grad_norm": 2.9838225841522217, "learning_rate": 6.276880296610171e-06, "loss": 1.1373, "mean_token_accuracy": 0.7352084070444107, "num_tokens": 22633319.0, "step": 28118 }, { "epoch": 7.447033898305085, "grad_norm": 2.4325239658355713, "learning_rate": 6.276615466101695e-06, "loss": 0.9164, "mean_token_accuracy": 0.7566298916935921, "num_tokens": 22634900.0, "step": 28120 }, { "epoch": 7.447563559322034, "grad_norm": 2.8901774883270264, "learning_rate": 6.2763506355932216e-06, "loss": 1.343, "mean_token_accuracy": 0.6895202398300171, "num_tokens": 22636415.0, "step": 28122 }, { "epoch": 7.448093220338983, "grad_norm": 2.1225335597991943, "learning_rate": 6.2760858050847465e-06, "loss": 0.7985, "mean_token_accuracy": 0.7791986986994743, "num_tokens": 22638424.0, "step": 28124 }, { "epoch": 7.4486228813559325, "grad_norm": 3.3994812965393066, "learning_rate": 6.275820974576272e-06, "loss": 1.502, "mean_token_accuracy": 0.6652545519173145, "num_tokens": 22639937.0, "step": 28126 }, { "epoch": 7.4491525423728815, "grad_norm": 3.046489715576172, "learning_rate": 6.275556144067797e-06, "loss": 1.0616, "mean_token_accuracy": 0.7323333844542503, "num_tokens": 22641423.0, "step": 28128 }, { "epoch": 7.4496822033898304, "grad_norm": 3.3291749954223633, "learning_rate": 6.275291313559323e-06, "loss": 0.9717, "mean_token_accuracy": 0.7445976212620735, "num_tokens": 22642871.0, "step": 28130 }, { "epoch": 7.450211864406779, "grad_norm": 2.7014122009277344, "learning_rate": 6.275026483050848e-06, "loss": 1.4258, "mean_token_accuracy": 0.7036510109901428, "num_tokens": 22644350.0, "step": 28132 }, { "epoch": 7.450741525423728, "grad_norm": 2.5288712978363037, "learning_rate": 6.274761652542374e-06, "loss": 0.942, "mean_token_accuracy": 0.7403031587600708, "num_tokens": 22645997.0, "step": 28134 }, { "epoch": 7.451271186440678, "grad_norm": 2.9512288570404053, "learning_rate": 6.274496822033899e-06, "loss": 1.0959, "mean_token_accuracy": 0.7430037781596184, "num_tokens": 22648071.0, "step": 28136 }, { "epoch": 7.451800847457627, "grad_norm": 2.3151261806488037, "learning_rate": 6.274231991525424e-06, "loss": 1.1269, "mean_token_accuracy": 0.7399450838565826, "num_tokens": 22649557.0, "step": 28138 }, { "epoch": 7.452330508474576, "grad_norm": 2.437962532043457, "learning_rate": 6.273967161016949e-06, "loss": 1.1797, "mean_token_accuracy": 0.7146999388933182, "num_tokens": 22651315.0, "step": 28140 }, { "epoch": 7.452860169491525, "grad_norm": 2.653649091720581, "learning_rate": 6.273702330508475e-06, "loss": 1.4035, "mean_token_accuracy": 0.6985767707228661, "num_tokens": 22652944.0, "step": 28142 }, { "epoch": 7.453389830508475, "grad_norm": 2.3197007179260254, "learning_rate": 6.2734375e-06, "loss": 1.2736, "mean_token_accuracy": 0.7058835998177528, "num_tokens": 22654501.0, "step": 28144 }, { "epoch": 7.453919491525424, "grad_norm": 2.815913438796997, "learning_rate": 6.273172669491527e-06, "loss": 1.371, "mean_token_accuracy": 0.6867531314492226, "num_tokens": 22656180.0, "step": 28146 }, { "epoch": 7.454449152542373, "grad_norm": 2.2838432788848877, "learning_rate": 6.272907838983051e-06, "loss": 0.811, "mean_token_accuracy": 0.7895044460892677, "num_tokens": 22658064.0, "step": 28148 }, { "epoch": 7.454978813559322, "grad_norm": 2.5548300743103027, "learning_rate": 6.272643008474577e-06, "loss": 1.1342, "mean_token_accuracy": 0.7405809760093689, "num_tokens": 22659939.0, "step": 28150 }, { "epoch": 7.455508474576272, "grad_norm": 1.797255277633667, "learning_rate": 6.272378177966102e-06, "loss": 1.0952, "mean_token_accuracy": 0.7418995499610901, "num_tokens": 22662453.0, "step": 28152 }, { "epoch": 7.456038135593221, "grad_norm": 2.4558610916137695, "learning_rate": 6.272113347457628e-06, "loss": 0.97, "mean_token_accuracy": 0.7593518644571304, "num_tokens": 22663900.0, "step": 28154 }, { "epoch": 7.4565677966101696, "grad_norm": 2.642517566680908, "learning_rate": 6.271848516949153e-06, "loss": 1.2131, "mean_token_accuracy": 0.6995778828859329, "num_tokens": 22665461.0, "step": 28156 }, { "epoch": 7.4570974576271185, "grad_norm": 2.5280027389526367, "learning_rate": 6.271583686440679e-06, "loss": 1.4148, "mean_token_accuracy": 0.6863928437232971, "num_tokens": 22667049.0, "step": 28158 }, { "epoch": 7.4576271186440675, "grad_norm": 2.19503116607666, "learning_rate": 6.271318855932204e-06, "loss": 0.8958, "mean_token_accuracy": 0.7629182934761047, "num_tokens": 22668719.0, "step": 28160 }, { "epoch": 7.458156779661017, "grad_norm": 2.166294813156128, "learning_rate": 6.2710540254237295e-06, "loss": 0.9972, "mean_token_accuracy": 0.7448386177420616, "num_tokens": 22670539.0, "step": 28162 }, { "epoch": 7.458686440677966, "grad_norm": 2.1454665660858154, "learning_rate": 6.270789194915254e-06, "loss": 0.9098, "mean_token_accuracy": 0.772582933306694, "num_tokens": 22672364.0, "step": 28164 }, { "epoch": 7.459216101694915, "grad_norm": 3.001417398452759, "learning_rate": 6.27052436440678e-06, "loss": 1.7103, "mean_token_accuracy": 0.6419242024421692, "num_tokens": 22674143.0, "step": 28166 }, { "epoch": 7.459745762711864, "grad_norm": 2.7637579441070557, "learning_rate": 6.270259533898305e-06, "loss": 1.2415, "mean_token_accuracy": 0.6945426985621452, "num_tokens": 22675703.0, "step": 28168 }, { "epoch": 7.460275423728813, "grad_norm": 3.37247371673584, "learning_rate": 6.269994703389831e-06, "loss": 0.925, "mean_token_accuracy": 0.7878284752368927, "num_tokens": 22677205.0, "step": 28170 }, { "epoch": 7.460805084745763, "grad_norm": 2.486873149871826, "learning_rate": 6.269729872881356e-06, "loss": 1.1962, "mean_token_accuracy": 0.7170274183154106, "num_tokens": 22678875.0, "step": 28172 }, { "epoch": 7.461334745762712, "grad_norm": 3.1400580406188965, "learning_rate": 6.269465042372882e-06, "loss": 1.3623, "mean_token_accuracy": 0.7060023695230484, "num_tokens": 22680328.0, "step": 28174 }, { "epoch": 7.461864406779661, "grad_norm": 2.46618914604187, "learning_rate": 6.2692002118644065e-06, "loss": 1.3452, "mean_token_accuracy": 0.6762179061770439, "num_tokens": 22682054.0, "step": 28176 }, { "epoch": 7.46239406779661, "grad_norm": 2.7068216800689697, "learning_rate": 6.268935381355933e-06, "loss": 1.351, "mean_token_accuracy": 0.7102606445550919, "num_tokens": 22684019.0, "step": 28178 }, { "epoch": 7.46292372881356, "grad_norm": 3.224924087524414, "learning_rate": 6.268670550847458e-06, "loss": 1.1777, "mean_token_accuracy": 0.7133968323469162, "num_tokens": 22685342.0, "step": 28180 }, { "epoch": 7.463453389830509, "grad_norm": 2.575098752975464, "learning_rate": 6.268405720338984e-06, "loss": 1.2483, "mean_token_accuracy": 0.7212185561656952, "num_tokens": 22686906.0, "step": 28182 }, { "epoch": 7.463983050847458, "grad_norm": 2.242967128753662, "learning_rate": 6.268140889830509e-06, "loss": 0.9352, "mean_token_accuracy": 0.7918405905365944, "num_tokens": 22688544.0, "step": 28184 }, { "epoch": 7.464512711864407, "grad_norm": 2.5748188495635986, "learning_rate": 6.2678760593220345e-06, "loss": 1.4199, "mean_token_accuracy": 0.6784768402576447, "num_tokens": 22690349.0, "step": 28186 }, { "epoch": 7.4650423728813555, "grad_norm": 2.62528133392334, "learning_rate": 6.2676112288135595e-06, "loss": 1.0709, "mean_token_accuracy": 0.7643670365214348, "num_tokens": 22692069.0, "step": 28188 }, { "epoch": 7.465572033898305, "grad_norm": 2.401296615600586, "learning_rate": 6.267346398305085e-06, "loss": 0.9559, "mean_token_accuracy": 0.7819783017039299, "num_tokens": 22693602.0, "step": 28190 }, { "epoch": 7.466101694915254, "grad_norm": 2.7472543716430664, "learning_rate": 6.26708156779661e-06, "loss": 1.1787, "mean_token_accuracy": 0.7404987141489983, "num_tokens": 22695345.0, "step": 28192 }, { "epoch": 7.466631355932203, "grad_norm": 2.397831916809082, "learning_rate": 6.266816737288136e-06, "loss": 1.3056, "mean_token_accuracy": 0.7081091403961182, "num_tokens": 22696996.0, "step": 28194 }, { "epoch": 7.467161016949152, "grad_norm": 2.432288885116577, "learning_rate": 6.266551906779662e-06, "loss": 1.1437, "mean_token_accuracy": 0.7191993519663811, "num_tokens": 22698669.0, "step": 28196 }, { "epoch": 7.467690677966102, "grad_norm": 2.5690343379974365, "learning_rate": 6.266287076271187e-06, "loss": 1.0933, "mean_token_accuracy": 0.7396071031689644, "num_tokens": 22700264.0, "step": 28198 }, { "epoch": 7.468220338983051, "grad_norm": 2.8762714862823486, "learning_rate": 6.266022245762713e-06, "loss": 0.9925, "mean_token_accuracy": 0.7709626480937004, "num_tokens": 22701643.0, "step": 28200 }, { "epoch": 7.46875, "grad_norm": 2.8942337036132812, "learning_rate": 6.265757415254237e-06, "loss": 1.2617, "mean_token_accuracy": 0.712882362306118, "num_tokens": 22703154.0, "step": 28202 }, { "epoch": 7.469279661016949, "grad_norm": 3.064296007156372, "learning_rate": 6.265492584745764e-06, "loss": 1.2981, "mean_token_accuracy": 0.7223534807562828, "num_tokens": 22704831.0, "step": 28204 }, { "epoch": 7.469809322033898, "grad_norm": 2.7574856281280518, "learning_rate": 6.265227754237289e-06, "loss": 1.0827, "mean_token_accuracy": 0.7503119856119156, "num_tokens": 22706556.0, "step": 28206 }, { "epoch": 7.470338983050848, "grad_norm": 2.2585489749908447, "learning_rate": 6.264962923728815e-06, "loss": 0.7745, "mean_token_accuracy": 0.8006626293063164, "num_tokens": 22708128.0, "step": 28208 }, { "epoch": 7.470868644067797, "grad_norm": 2.3243117332458496, "learning_rate": 6.26469809322034e-06, "loss": 0.9392, "mean_token_accuracy": 0.7788860350847244, "num_tokens": 22709800.0, "step": 28210 }, { "epoch": 7.471398305084746, "grad_norm": 2.949073553085327, "learning_rate": 6.264433262711865e-06, "loss": 1.2658, "mean_token_accuracy": 0.7112098410725594, "num_tokens": 22711432.0, "step": 28212 }, { "epoch": 7.471927966101695, "grad_norm": 2.867607593536377, "learning_rate": 6.26416843220339e-06, "loss": 1.4115, "mean_token_accuracy": 0.6772773638367653, "num_tokens": 22713236.0, "step": 28214 }, { "epoch": 7.4724576271186445, "grad_norm": 2.5705623626708984, "learning_rate": 6.263903601694916e-06, "loss": 1.0944, "mean_token_accuracy": 0.730459026992321, "num_tokens": 22714791.0, "step": 28216 }, { "epoch": 7.472987288135593, "grad_norm": 2.9266135692596436, "learning_rate": 6.263638771186441e-06, "loss": 1.3446, "mean_token_accuracy": 0.6812973544001579, "num_tokens": 22716423.0, "step": 28218 }, { "epoch": 7.473516949152542, "grad_norm": 2.842552423477173, "learning_rate": 6.263373940677967e-06, "loss": 1.2401, "mean_token_accuracy": 0.699172168970108, "num_tokens": 22718143.0, "step": 28220 }, { "epoch": 7.474046610169491, "grad_norm": 2.755614757537842, "learning_rate": 6.263109110169492e-06, "loss": 1.0722, "mean_token_accuracy": 0.7412138804793358, "num_tokens": 22719831.0, "step": 28222 }, { "epoch": 7.47457627118644, "grad_norm": 2.862870216369629, "learning_rate": 6.2628442796610175e-06, "loss": 1.2142, "mean_token_accuracy": 0.7347593083977699, "num_tokens": 22721270.0, "step": 28224 }, { "epoch": 7.47510593220339, "grad_norm": 3.1450297832489014, "learning_rate": 6.2625794491525424e-06, "loss": 1.718, "mean_token_accuracy": 0.6460006982088089, "num_tokens": 22722915.0, "step": 28226 }, { "epoch": 7.475635593220339, "grad_norm": 2.7028911113739014, "learning_rate": 6.262314618644068e-06, "loss": 1.0903, "mean_token_accuracy": 0.7398832440376282, "num_tokens": 22724531.0, "step": 28228 }, { "epoch": 7.476165254237288, "grad_norm": 2.693190574645996, "learning_rate": 6.262049788135593e-06, "loss": 1.1915, "mean_token_accuracy": 0.7213738188147545, "num_tokens": 22726202.0, "step": 28230 }, { "epoch": 7.476694915254237, "grad_norm": 2.840139627456665, "learning_rate": 6.26178495762712e-06, "loss": 1.5574, "mean_token_accuracy": 0.6521936506032944, "num_tokens": 22727898.0, "step": 28232 }, { "epoch": 7.477224576271187, "grad_norm": 2.771152973175049, "learning_rate": 6.261520127118645e-06, "loss": 1.0895, "mean_token_accuracy": 0.7705805450677872, "num_tokens": 22729353.0, "step": 28234 }, { "epoch": 7.477754237288136, "grad_norm": 2.0548202991485596, "learning_rate": 6.2612552966101705e-06, "loss": 0.9332, "mean_token_accuracy": 0.7748825326561928, "num_tokens": 22731067.0, "step": 28236 }, { "epoch": 7.478283898305085, "grad_norm": 2.64278244972229, "learning_rate": 6.260990466101695e-06, "loss": 1.218, "mean_token_accuracy": 0.7045706510543823, "num_tokens": 22732667.0, "step": 28238 }, { "epoch": 7.478813559322034, "grad_norm": 2.780881643295288, "learning_rate": 6.260725635593221e-06, "loss": 1.1846, "mean_token_accuracy": 0.7151187658309937, "num_tokens": 22734369.0, "step": 28240 }, { "epoch": 7.479343220338983, "grad_norm": 2.8477942943573, "learning_rate": 6.260460805084746e-06, "loss": 1.3242, "mean_token_accuracy": 0.6944971606135368, "num_tokens": 22736251.0, "step": 28242 }, { "epoch": 7.4798728813559325, "grad_norm": 2.83537220954895, "learning_rate": 6.260195974576272e-06, "loss": 1.2423, "mean_token_accuracy": 0.6956971511244774, "num_tokens": 22737709.0, "step": 28244 }, { "epoch": 7.4804025423728815, "grad_norm": 2.881640911102295, "learning_rate": 6.259931144067797e-06, "loss": 1.2941, "mean_token_accuracy": 0.6956572309136391, "num_tokens": 22739255.0, "step": 28246 }, { "epoch": 7.4809322033898304, "grad_norm": 3.26973819732666, "learning_rate": 6.259666313559323e-06, "loss": 0.967, "mean_token_accuracy": 0.7818073332309723, "num_tokens": 22740758.0, "step": 28248 }, { "epoch": 7.481461864406779, "grad_norm": 2.5318260192871094, "learning_rate": 6.2594014830508475e-06, "loss": 1.0645, "step": 28250 }, { "epoch": 7.481461864406779, "eval_loss": 1.3394627571105957, "eval_mean_token_accuracy": 0.6991979941338687, "eval_num_tokens": 22742222.0, "eval_runtime": 48.2946, "eval_samples_per_second": 6.378, "eval_steps_per_second": 6.378, "step": 28250 }, { "epoch": 7.481991525423728, "grad_norm": 2.3026621341705322, "learning_rate": 6.259136652542373e-06, "loss": 0.9866, "mean_token_accuracy": 0.7561705932021141, "num_tokens": 22743711.0, "step": 28252 }, { "epoch": 7.482521186440678, "grad_norm": 2.4758005142211914, "learning_rate": 6.258871822033898e-06, "loss": 1.2054, "mean_token_accuracy": 0.7245851010084152, "num_tokens": 22745378.0, "step": 28254 }, { "epoch": 7.483050847457627, "grad_norm": 2.477463483810425, "learning_rate": 6.258606991525424e-06, "loss": 1.1147, "mean_token_accuracy": 0.7512884140014648, "num_tokens": 22747143.0, "step": 28256 }, { "epoch": 7.483580508474576, "grad_norm": 3.084066867828369, "learning_rate": 6.258342161016949e-06, "loss": 1.1864, "mean_token_accuracy": 0.7388006448745728, "num_tokens": 22748661.0, "step": 28258 }, { "epoch": 7.484110169491525, "grad_norm": 2.2209482192993164, "learning_rate": 6.2580773305084756e-06, "loss": 1.0033, "mean_token_accuracy": 0.7773260325193405, "num_tokens": 22750467.0, "step": 28260 }, { "epoch": 7.484639830508475, "grad_norm": 3.080933094024658, "learning_rate": 6.2578125000000005e-06, "loss": 1.5685, "mean_token_accuracy": 0.6750494092702866, "num_tokens": 22752145.0, "step": 28262 }, { "epoch": 7.485169491525424, "grad_norm": 3.1209213733673096, "learning_rate": 6.257547669491526e-06, "loss": 1.2252, "mean_token_accuracy": 0.7133627235889435, "num_tokens": 22753686.0, "step": 28264 }, { "epoch": 7.485699152542373, "grad_norm": 2.6883862018585205, "learning_rate": 6.257282838983051e-06, "loss": 1.1518, "mean_token_accuracy": 0.7497794181108475, "num_tokens": 22755324.0, "step": 28266 }, { "epoch": 7.486228813559322, "grad_norm": 3.1697311401367188, "learning_rate": 6.257018008474577e-06, "loss": 0.9981, "mean_token_accuracy": 0.7571454867720604, "num_tokens": 22756822.0, "step": 28268 }, { "epoch": 7.486758474576272, "grad_norm": 2.5477240085601807, "learning_rate": 6.256753177966102e-06, "loss": 1.5545, "mean_token_accuracy": 0.6399405524134636, "num_tokens": 22758547.0, "step": 28270 }, { "epoch": 7.487288135593221, "grad_norm": 3.13393235206604, "learning_rate": 6.256488347457628e-06, "loss": 1.2207, "mean_token_accuracy": 0.7280430644750595, "num_tokens": 22760060.0, "step": 28272 }, { "epoch": 7.4878177966101696, "grad_norm": 2.9180357456207275, "learning_rate": 6.256223516949153e-06, "loss": 1.0287, "mean_token_accuracy": 0.7247338742017746, "num_tokens": 22762181.0, "step": 28274 }, { "epoch": 7.4883474576271185, "grad_norm": 2.3407273292541504, "learning_rate": 6.255958686440678e-06, "loss": 1.3283, "mean_token_accuracy": 0.6851882562041283, "num_tokens": 22763824.0, "step": 28276 }, { "epoch": 7.4888771186440675, "grad_norm": 2.594021797180176, "learning_rate": 6.255693855932203e-06, "loss": 1.5761, "mean_token_accuracy": 0.6656948029994965, "num_tokens": 22765276.0, "step": 28278 }, { "epoch": 7.489406779661017, "grad_norm": 2.655740261077881, "learning_rate": 6.255429025423729e-06, "loss": 1.3536, "mean_token_accuracy": 0.6932266727089882, "num_tokens": 22766992.0, "step": 28280 }, { "epoch": 7.489936440677966, "grad_norm": 2.3288369178771973, "learning_rate": 6.255164194915255e-06, "loss": 1.0311, "mean_token_accuracy": 0.7560652270913124, "num_tokens": 22768542.0, "step": 28282 }, { "epoch": 7.490466101694915, "grad_norm": 2.7227699756622314, "learning_rate": 6.25489936440678e-06, "loss": 1.1914, "mean_token_accuracy": 0.7351401075720787, "num_tokens": 22770051.0, "step": 28284 }, { "epoch": 7.490995762711864, "grad_norm": 2.560899019241333, "learning_rate": 6.254634533898306e-06, "loss": 1.2497, "mean_token_accuracy": 0.7112729549407959, "num_tokens": 22771700.0, "step": 28286 }, { "epoch": 7.491525423728813, "grad_norm": 2.380079746246338, "learning_rate": 6.254369703389831e-06, "loss": 0.9056, "mean_token_accuracy": 0.7689857482910156, "num_tokens": 22773530.0, "step": 28288 }, { "epoch": 7.492055084745763, "grad_norm": 2.185216188430786, "learning_rate": 6.254104872881357e-06, "loss": 1.1433, "mean_token_accuracy": 0.6923967972397804, "num_tokens": 22775757.0, "step": 28290 }, { "epoch": 7.492584745762712, "grad_norm": 2.394721508026123, "learning_rate": 6.253840042372882e-06, "loss": 0.9791, "mean_token_accuracy": 0.7609254494309425, "num_tokens": 22777292.0, "step": 28292 }, { "epoch": 7.493114406779661, "grad_norm": 3.3801794052124023, "learning_rate": 6.253575211864408e-06, "loss": 1.266, "mean_token_accuracy": 0.6948565505445004, "num_tokens": 22778883.0, "step": 28294 }, { "epoch": 7.49364406779661, "grad_norm": 2.808980941772461, "learning_rate": 6.253310381355933e-06, "loss": 1.2961, "mean_token_accuracy": 0.7108441293239594, "num_tokens": 22780531.0, "step": 28296 }, { "epoch": 7.49417372881356, "grad_norm": 2.9028496742248535, "learning_rate": 6.2530455508474585e-06, "loss": 1.1808, "mean_token_accuracy": 0.7410922273993492, "num_tokens": 22781945.0, "step": 28298 }, { "epoch": 7.494703389830509, "grad_norm": 2.538013219833374, "learning_rate": 6.2527807203389835e-06, "loss": 1.13, "mean_token_accuracy": 0.7124759778380394, "num_tokens": 22783555.0, "step": 28300 }, { "epoch": 7.495233050847458, "grad_norm": 2.740870475769043, "learning_rate": 6.252515889830509e-06, "loss": 1.0935, "mean_token_accuracy": 0.7554588243365288, "num_tokens": 22785216.0, "step": 28302 }, { "epoch": 7.495762711864407, "grad_norm": 2.245788812637329, "learning_rate": 6.252251059322034e-06, "loss": 1.1371, "mean_token_accuracy": 0.7487985342741013, "num_tokens": 22786904.0, "step": 28304 }, { "epoch": 7.4962923728813555, "grad_norm": 2.4826745986938477, "learning_rate": 6.25198622881356e-06, "loss": 1.3809, "mean_token_accuracy": 0.6916472688317299, "num_tokens": 22788620.0, "step": 28306 }, { "epoch": 7.496822033898305, "grad_norm": 2.849738597869873, "learning_rate": 6.251721398305085e-06, "loss": 0.9807, "mean_token_accuracy": 0.7452125549316406, "num_tokens": 22789973.0, "step": 28308 }, { "epoch": 7.497351694915254, "grad_norm": 2.633610725402832, "learning_rate": 6.251456567796611e-06, "loss": 1.3497, "mean_token_accuracy": 0.6959575414657593, "num_tokens": 22791541.0, "step": 28310 }, { "epoch": 7.497881355932203, "grad_norm": 3.1571011543273926, "learning_rate": 6.251191737288136e-06, "loss": 1.3524, "mean_token_accuracy": 0.6861020773649216, "num_tokens": 22793063.0, "step": 28312 }, { "epoch": 7.498411016949152, "grad_norm": 3.400012731552124, "learning_rate": 6.250926906779662e-06, "loss": 0.8566, "mean_token_accuracy": 0.8026233315467834, "num_tokens": 22794439.0, "step": 28314 }, { "epoch": 7.498940677966102, "grad_norm": 3.253268241882324, "learning_rate": 6.250662076271187e-06, "loss": 1.5089, "mean_token_accuracy": 0.658043697476387, "num_tokens": 22795870.0, "step": 28316 }, { "epoch": 7.499470338983051, "grad_norm": 2.9784791469573975, "learning_rate": 6.250397245762713e-06, "loss": 1.2416, "mean_token_accuracy": 0.704057402908802, "num_tokens": 22797476.0, "step": 28318 }, { "epoch": 7.5, "grad_norm": 2.4977457523345947, "learning_rate": 6.250132415254238e-06, "loss": 0.9882, "mean_token_accuracy": 0.7522767335176468, "num_tokens": 22799113.0, "step": 28320 }, { "epoch": 7.500529661016949, "grad_norm": 2.3553476333618164, "learning_rate": 6.249867584745764e-06, "loss": 0.9024, "mean_token_accuracy": 0.7923505306243896, "num_tokens": 22800573.0, "step": 28322 }, { "epoch": 7.501059322033898, "grad_norm": 2.4130165576934814, "learning_rate": 6.2496027542372885e-06, "loss": 1.0229, "mean_token_accuracy": 0.7371485605835915, "num_tokens": 22802147.0, "step": 28324 }, { "epoch": 7.501588983050848, "grad_norm": 2.514511823654175, "learning_rate": 6.249337923728814e-06, "loss": 0.6605, "mean_token_accuracy": 0.8079967722296715, "num_tokens": 22803638.0, "step": 28326 }, { "epoch": 7.502118644067797, "grad_norm": 2.928851366043091, "learning_rate": 6.249073093220339e-06, "loss": 1.5451, "mean_token_accuracy": 0.6428031548857689, "num_tokens": 22805339.0, "step": 28328 }, { "epoch": 7.502648305084746, "grad_norm": 3.8936281204223633, "learning_rate": 6.248808262711865e-06, "loss": 0.7882, "mean_token_accuracy": 0.7898042947053909, "num_tokens": 22806749.0, "step": 28330 }, { "epoch": 7.503177966101695, "grad_norm": 2.562429904937744, "learning_rate": 6.24854343220339e-06, "loss": 1.1355, "mean_token_accuracy": 0.7374394610524178, "num_tokens": 22808265.0, "step": 28332 }, { "epoch": 7.503707627118644, "grad_norm": 3.1093945503234863, "learning_rate": 6.248278601694916e-06, "loss": 1.4417, "mean_token_accuracy": 0.6640488877892494, "num_tokens": 22809764.0, "step": 28334 }, { "epoch": 7.504237288135593, "grad_norm": 2.688664674758911, "learning_rate": 6.248013771186441e-06, "loss": 1.1527, "mean_token_accuracy": 0.7236545085906982, "num_tokens": 22811393.0, "step": 28336 }, { "epoch": 7.504766949152542, "grad_norm": 2.735931396484375, "learning_rate": 6.2477489406779664e-06, "loss": 0.9517, "mean_token_accuracy": 0.7518416568636894, "num_tokens": 22812823.0, "step": 28338 }, { "epoch": 7.505296610169491, "grad_norm": 2.5793001651763916, "learning_rate": 6.247484110169491e-06, "loss": 1.0802, "mean_token_accuracy": 0.7442661970853806, "num_tokens": 22814256.0, "step": 28340 }, { "epoch": 7.50582627118644, "grad_norm": 1.989856481552124, "learning_rate": 6.247219279661018e-06, "loss": 0.9285, "mean_token_accuracy": 0.7728413641452789, "num_tokens": 22816362.0, "step": 28342 }, { "epoch": 7.50635593220339, "grad_norm": 3.4057517051696777, "learning_rate": 6.246954449152542e-06, "loss": 1.0114, "mean_token_accuracy": 0.7459565624594688, "num_tokens": 22817751.0, "step": 28344 }, { "epoch": 7.506885593220339, "grad_norm": 2.099717855453491, "learning_rate": 6.246689618644069e-06, "loss": 0.8267, "mean_token_accuracy": 0.7756300419569016, "num_tokens": 22819393.0, "step": 28346 }, { "epoch": 7.507415254237288, "grad_norm": 2.729701519012451, "learning_rate": 6.246424788135594e-06, "loss": 1.2177, "mean_token_accuracy": 0.7104421034455299, "num_tokens": 22821007.0, "step": 28348 }, { "epoch": 7.507944915254237, "grad_norm": 2.3386616706848145, "learning_rate": 6.246159957627119e-06, "loss": 1.227, "mean_token_accuracy": 0.7061777114868164, "num_tokens": 22822668.0, "step": 28350 }, { "epoch": 7.508474576271187, "grad_norm": 2.8026604652404785, "learning_rate": 6.245895127118644e-06, "loss": 1.4996, "mean_token_accuracy": 0.6638266034424305, "num_tokens": 22824004.0, "step": 28352 }, { "epoch": 7.509004237288136, "grad_norm": 3.0838353633880615, "learning_rate": 6.24563029661017e-06, "loss": 1.1147, "mean_token_accuracy": 0.7348205074667931, "num_tokens": 22825595.0, "step": 28354 }, { "epoch": 7.509533898305085, "grad_norm": 3.1252400875091553, "learning_rate": 6.245365466101695e-06, "loss": 1.0541, "mean_token_accuracy": 0.746577575802803, "num_tokens": 22826822.0, "step": 28356 }, { "epoch": 7.510063559322034, "grad_norm": 2.9112637042999268, "learning_rate": 6.245100635593221e-06, "loss": 1.25, "mean_token_accuracy": 0.6815609112381935, "num_tokens": 22828466.0, "step": 28358 }, { "epoch": 7.510593220338983, "grad_norm": 2.6058619022369385, "learning_rate": 6.244835805084746e-06, "loss": 1.1413, "mean_token_accuracy": 0.7253810167312622, "num_tokens": 22830384.0, "step": 28360 }, { "epoch": 7.5111228813559325, "grad_norm": 3.1237452030181885, "learning_rate": 6.2445709745762715e-06, "loss": 1.2767, "mean_token_accuracy": 0.7100831940770149, "num_tokens": 22832134.0, "step": 28362 }, { "epoch": 7.5116525423728815, "grad_norm": 2.653235673904419, "learning_rate": 6.244306144067797e-06, "loss": 1.1093, "mean_token_accuracy": 0.7401356175541878, "num_tokens": 22833425.0, "step": 28364 }, { "epoch": 7.5121822033898304, "grad_norm": 2.7933080196380615, "learning_rate": 6.244041313559322e-06, "loss": 1.2577, "mean_token_accuracy": 0.6993377506732941, "num_tokens": 22834802.0, "step": 28366 }, { "epoch": 7.512711864406779, "grad_norm": 2.7790424823760986, "learning_rate": 6.243776483050849e-06, "loss": 1.3575, "mean_token_accuracy": 0.7066038846969604, "num_tokens": 22836255.0, "step": 28368 }, { "epoch": 7.513241525423728, "grad_norm": 2.5697014331817627, "learning_rate": 6.243511652542374e-06, "loss": 1.3162, "mean_token_accuracy": 0.7026975154876709, "num_tokens": 22837790.0, "step": 28370 }, { "epoch": 7.513771186440678, "grad_norm": 2.7333076000213623, "learning_rate": 6.2432468220338995e-06, "loss": 1.0362, "mean_token_accuracy": 0.7514017224311829, "num_tokens": 22839125.0, "step": 28372 }, { "epoch": 7.514300847457627, "grad_norm": 2.5130887031555176, "learning_rate": 6.2429819915254245e-06, "loss": 1.0601, "mean_token_accuracy": 0.7140849679708481, "num_tokens": 22840741.0, "step": 28374 }, { "epoch": 7.514830508474576, "grad_norm": 2.5073795318603516, "learning_rate": 6.24271716101695e-06, "loss": 1.0704, "mean_token_accuracy": 0.755203053355217, "num_tokens": 22842310.0, "step": 28376 }, { "epoch": 7.515360169491525, "grad_norm": 3.491905450820923, "learning_rate": 6.242452330508475e-06, "loss": 1.3506, "mean_token_accuracy": 0.6890264227986336, "num_tokens": 22843741.0, "step": 28378 }, { "epoch": 7.515889830508475, "grad_norm": 2.503967523574829, "learning_rate": 6.242187500000001e-06, "loss": 1.3159, "mean_token_accuracy": 0.6783608049154282, "num_tokens": 22845542.0, "step": 28380 }, { "epoch": 7.516419491525424, "grad_norm": 3.2333948612213135, "learning_rate": 6.241922669491526e-06, "loss": 1.4924, "mean_token_accuracy": 0.6762522235512733, "num_tokens": 22847259.0, "step": 28382 }, { "epoch": 7.516949152542373, "grad_norm": 2.719158887863159, "learning_rate": 6.241657838983052e-06, "loss": 1.6701, "mean_token_accuracy": 0.6329618468880653, "num_tokens": 22848921.0, "step": 28384 }, { "epoch": 7.517478813559322, "grad_norm": 2.990888833999634, "learning_rate": 6.241393008474577e-06, "loss": 1.1419, "mean_token_accuracy": 0.7514528259634972, "num_tokens": 22850260.0, "step": 28386 }, { "epoch": 7.518008474576272, "grad_norm": 2.2982826232910156, "learning_rate": 6.241128177966102e-06, "loss": 1.2018, "mean_token_accuracy": 0.7056038081645966, "num_tokens": 22852080.0, "step": 28388 }, { "epoch": 7.518538135593221, "grad_norm": 2.578324556350708, "learning_rate": 6.240863347457627e-06, "loss": 1.082, "mean_token_accuracy": 0.7373176217079163, "num_tokens": 22853619.0, "step": 28390 }, { "epoch": 7.5190677966101696, "grad_norm": 2.700301170349121, "learning_rate": 6.240598516949153e-06, "loss": 1.199, "mean_token_accuracy": 0.7310111299157143, "num_tokens": 22855228.0, "step": 28392 }, { "epoch": 7.5195974576271185, "grad_norm": 2.9468886852264404, "learning_rate": 6.240333686440678e-06, "loss": 1.1854, "mean_token_accuracy": 0.7399677559733391, "num_tokens": 22856714.0, "step": 28394 }, { "epoch": 7.5201271186440675, "grad_norm": 2.5937423706054688, "learning_rate": 6.240068855932205e-06, "loss": 1.2312, "mean_token_accuracy": 0.7351573333144188, "num_tokens": 22858370.0, "step": 28396 }, { "epoch": 7.520656779661017, "grad_norm": 2.1397483348846436, "learning_rate": 6.239804025423729e-06, "loss": 1.3049, "mean_token_accuracy": 0.6966352835297585, "num_tokens": 22860195.0, "step": 28398 }, { "epoch": 7.521186440677966, "grad_norm": 2.8875465393066406, "learning_rate": 6.239539194915255e-06, "loss": 1.2347, "mean_token_accuracy": 0.7248450517654419, "num_tokens": 22861733.0, "step": 28400 }, { "epoch": 7.521716101694915, "grad_norm": 2.8237736225128174, "learning_rate": 6.23927436440678e-06, "loss": 1.3886, "mean_token_accuracy": 0.6918802335858345, "num_tokens": 22863452.0, "step": 28402 }, { "epoch": 7.522245762711864, "grad_norm": 2.03775691986084, "learning_rate": 6.239009533898306e-06, "loss": 0.9236, "mean_token_accuracy": 0.7712095677852631, "num_tokens": 22865078.0, "step": 28404 }, { "epoch": 7.522775423728813, "grad_norm": 2.623105525970459, "learning_rate": 6.238744703389831e-06, "loss": 1.2164, "mean_token_accuracy": 0.7452848628163338, "num_tokens": 22866949.0, "step": 28406 }, { "epoch": 7.523305084745763, "grad_norm": 2.7018935680389404, "learning_rate": 6.238479872881357e-06, "loss": 1.3118, "mean_token_accuracy": 0.6941705048084259, "num_tokens": 22868636.0, "step": 28408 }, { "epoch": 7.523834745762712, "grad_norm": 3.210298538208008, "learning_rate": 6.238215042372882e-06, "loss": 1.1683, "mean_token_accuracy": 0.7165716961026192, "num_tokens": 22870223.0, "step": 28410 }, { "epoch": 7.524364406779661, "grad_norm": 2.301880359649658, "learning_rate": 6.2379502118644074e-06, "loss": 0.8081, "mean_token_accuracy": 0.784988284111023, "num_tokens": 22871996.0, "step": 28412 }, { "epoch": 7.52489406779661, "grad_norm": 2.611072540283203, "learning_rate": 6.237685381355932e-06, "loss": 1.3828, "mean_token_accuracy": 0.6890134438872337, "num_tokens": 22873591.0, "step": 28414 }, { "epoch": 7.52542372881356, "grad_norm": 2.3751134872436523, "learning_rate": 6.237420550847458e-06, "loss": 1.1646, "mean_token_accuracy": 0.7127250880002975, "num_tokens": 22875095.0, "step": 28416 }, { "epoch": 7.525953389830509, "grad_norm": 3.0893819332122803, "learning_rate": 6.237155720338983e-06, "loss": 1.4688, "mean_token_accuracy": 0.6659365072846413, "num_tokens": 22876753.0, "step": 28418 }, { "epoch": 7.526483050847458, "grad_norm": 3.3876378536224365, "learning_rate": 6.236890889830509e-06, "loss": 0.9854, "mean_token_accuracy": 0.7739760354161263, "num_tokens": 22878180.0, "step": 28420 }, { "epoch": 7.527012711864407, "grad_norm": 3.4928176403045654, "learning_rate": 6.236626059322034e-06, "loss": 1.2065, "mean_token_accuracy": 0.712058775126934, "num_tokens": 22879783.0, "step": 28422 }, { "epoch": 7.527542372881356, "grad_norm": 2.3728227615356445, "learning_rate": 6.23636122881356e-06, "loss": 1.3746, "mean_token_accuracy": 0.6802220866084099, "num_tokens": 22881388.0, "step": 28424 }, { "epoch": 7.528072033898305, "grad_norm": 2.8043582439422607, "learning_rate": 6.2360963983050845e-06, "loss": 1.3707, "mean_token_accuracy": 0.66630669683218, "num_tokens": 22882840.0, "step": 28426 }, { "epoch": 7.528601694915254, "grad_norm": 2.9385364055633545, "learning_rate": 6.235831567796611e-06, "loss": 1.317, "mean_token_accuracy": 0.6885616257786751, "num_tokens": 22884550.0, "step": 28428 }, { "epoch": 7.529131355932203, "grad_norm": 3.1673033237457275, "learning_rate": 6.235566737288136e-06, "loss": 1.2103, "mean_token_accuracy": 0.7030776664614677, "num_tokens": 22886176.0, "step": 28430 }, { "epoch": 7.529661016949152, "grad_norm": 2.6247477531433105, "learning_rate": 6.235301906779662e-06, "loss": 0.7917, "mean_token_accuracy": 0.8037781417369843, "num_tokens": 22887863.0, "step": 28432 }, { "epoch": 7.530190677966102, "grad_norm": 2.7332651615142822, "learning_rate": 6.235037076271187e-06, "loss": 1.458, "mean_token_accuracy": 0.6765586584806442, "num_tokens": 22889917.0, "step": 28434 }, { "epoch": 7.530720338983051, "grad_norm": 3.490403175354004, "learning_rate": 6.2347722457627125e-06, "loss": 1.2626, "mean_token_accuracy": 0.7159283235669136, "num_tokens": 22891408.0, "step": 28436 }, { "epoch": 7.53125, "grad_norm": 2.3564579486846924, "learning_rate": 6.2345074152542375e-06, "loss": 0.9657, "mean_token_accuracy": 0.7483692839741707, "num_tokens": 22892979.0, "step": 28438 }, { "epoch": 7.531779661016949, "grad_norm": 2.478283405303955, "learning_rate": 6.234242584745763e-06, "loss": 0.9264, "mean_token_accuracy": 0.7829174771904945, "num_tokens": 22894380.0, "step": 28440 }, { "epoch": 7.532309322033898, "grad_norm": 2.9000778198242188, "learning_rate": 6.233977754237288e-06, "loss": 1.2447, "mean_token_accuracy": 0.6987102851271629, "num_tokens": 22896122.0, "step": 28442 }, { "epoch": 7.532838983050848, "grad_norm": 2.910127639770508, "learning_rate": 6.233712923728814e-06, "loss": 1.0184, "mean_token_accuracy": 0.7498920410871506, "num_tokens": 22897602.0, "step": 28444 }, { "epoch": 7.533368644067797, "grad_norm": 2.6039693355560303, "learning_rate": 6.233448093220339e-06, "loss": 1.3139, "mean_token_accuracy": 0.6858590170741081, "num_tokens": 22899369.0, "step": 28446 }, { "epoch": 7.533898305084746, "grad_norm": 2.548560857772827, "learning_rate": 6.233183262711865e-06, "loss": 0.9606, "mean_token_accuracy": 0.7664188519120216, "num_tokens": 22900736.0, "step": 28448 }, { "epoch": 7.534427966101695, "grad_norm": 2.7174220085144043, "learning_rate": 6.232918432203391e-06, "loss": 1.1413, "mean_token_accuracy": 0.7359709143638611, "num_tokens": 22902144.0, "step": 28450 }, { "epoch": 7.534957627118644, "grad_norm": 2.7557971477508545, "learning_rate": 6.232653601694915e-06, "loss": 1.2547, "mean_token_accuracy": 0.722784973680973, "num_tokens": 22903573.0, "step": 28452 }, { "epoch": 7.535487288135593, "grad_norm": 2.4463796615600586, "learning_rate": 6.232388771186442e-06, "loss": 1.0238, "mean_token_accuracy": 0.7543482333421707, "num_tokens": 22905116.0, "step": 28454 }, { "epoch": 7.536016949152542, "grad_norm": 2.6254220008850098, "learning_rate": 6.232123940677967e-06, "loss": 1.3784, "mean_token_accuracy": 0.6897186115384102, "num_tokens": 22906993.0, "step": 28456 }, { "epoch": 7.536546610169491, "grad_norm": 2.8357062339782715, "learning_rate": 6.231859110169493e-06, "loss": 1.227, "mean_token_accuracy": 0.7063195705413818, "num_tokens": 22908511.0, "step": 28458 }, { "epoch": 7.53707627118644, "grad_norm": 2.7252378463745117, "learning_rate": 6.231594279661018e-06, "loss": 0.8491, "mean_token_accuracy": 0.7960145324468613, "num_tokens": 22909706.0, "step": 28460 }, { "epoch": 7.53760593220339, "grad_norm": 3.2812464237213135, "learning_rate": 6.231329449152543e-06, "loss": 1.3371, "mean_token_accuracy": 0.7002869620919228, "num_tokens": 22911282.0, "step": 28462 }, { "epoch": 7.538135593220339, "grad_norm": 2.563479423522949, "learning_rate": 6.231064618644068e-06, "loss": 1.125, "mean_token_accuracy": 0.7597640976309776, "num_tokens": 22912735.0, "step": 28464 }, { "epoch": 7.538665254237288, "grad_norm": 2.299438953399658, "learning_rate": 6.230799788135594e-06, "loss": 1.4042, "mean_token_accuracy": 0.6818584054708481, "num_tokens": 22914561.0, "step": 28466 }, { "epoch": 7.539194915254237, "grad_norm": 2.471116304397583, "learning_rate": 6.230534957627119e-06, "loss": 1.3544, "mean_token_accuracy": 0.7333692982792854, "num_tokens": 22916336.0, "step": 28468 }, { "epoch": 7.539724576271187, "grad_norm": 2.6026952266693115, "learning_rate": 6.230270127118645e-06, "loss": 0.8253, "mean_token_accuracy": 0.7911527827382088, "num_tokens": 22917854.0, "step": 28470 }, { "epoch": 7.540254237288136, "grad_norm": 2.98850154876709, "learning_rate": 6.23000529661017e-06, "loss": 1.4633, "mean_token_accuracy": 0.6694325432181358, "num_tokens": 22919495.0, "step": 28472 }, { "epoch": 7.540783898305085, "grad_norm": 2.897071361541748, "learning_rate": 6.2297404661016955e-06, "loss": 1.5075, "mean_token_accuracy": 0.6512602008879185, "num_tokens": 22921483.0, "step": 28474 }, { "epoch": 7.541313559322034, "grad_norm": 2.9177732467651367, "learning_rate": 6.2294756355932204e-06, "loss": 0.9939, "mean_token_accuracy": 0.7630199268460274, "num_tokens": 22922871.0, "step": 28476 }, { "epoch": 7.541843220338983, "grad_norm": 2.375004768371582, "learning_rate": 6.229210805084746e-06, "loss": 1.1674, "mean_token_accuracy": 0.7164661586284637, "num_tokens": 22924740.0, "step": 28478 }, { "epoch": 7.5423728813559325, "grad_norm": 2.6330771446228027, "learning_rate": 6.228945974576271e-06, "loss": 1.2922, "mean_token_accuracy": 0.7292592227458954, "num_tokens": 22926488.0, "step": 28480 }, { "epoch": 7.5429025423728815, "grad_norm": 1.6860408782958984, "learning_rate": 6.228681144067798e-06, "loss": 0.8879, "mean_token_accuracy": 0.7569508701562881, "num_tokens": 22929063.0, "step": 28482 }, { "epoch": 7.5434322033898304, "grad_norm": 2.62532901763916, "learning_rate": 6.228416313559323e-06, "loss": 1.2576, "mean_token_accuracy": 0.7086337730288506, "num_tokens": 22930624.0, "step": 28484 }, { "epoch": 7.543961864406779, "grad_norm": 2.5707383155822754, "learning_rate": 6.2281514830508485e-06, "loss": 0.9767, "mean_token_accuracy": 0.7517991364002228, "num_tokens": 22932196.0, "step": 28486 }, { "epoch": 7.544491525423728, "grad_norm": 2.5804014205932617, "learning_rate": 6.227886652542373e-06, "loss": 1.2489, "mean_token_accuracy": 0.7090959474444389, "num_tokens": 22933644.0, "step": 28488 }, { "epoch": 7.545021186440678, "grad_norm": 3.3257925510406494, "learning_rate": 6.227621822033899e-06, "loss": 1.2525, "mean_token_accuracy": 0.6958632320165634, "num_tokens": 22935425.0, "step": 28490 }, { "epoch": 7.545550847457627, "grad_norm": 2.5153369903564453, "learning_rate": 6.227356991525424e-06, "loss": 1.2128, "mean_token_accuracy": 0.7262676805257797, "num_tokens": 22936941.0, "step": 28492 }, { "epoch": 7.546080508474576, "grad_norm": 3.320732593536377, "learning_rate": 6.22709216101695e-06, "loss": 1.5866, "mean_token_accuracy": 0.6569247096776962, "num_tokens": 22938468.0, "step": 28494 }, { "epoch": 7.546610169491525, "grad_norm": 2.745399236679077, "learning_rate": 6.226827330508475e-06, "loss": 1.0975, "mean_token_accuracy": 0.7482407614588737, "num_tokens": 22939867.0, "step": 28496 }, { "epoch": 7.547139830508475, "grad_norm": 2.924124240875244, "learning_rate": 6.226562500000001e-06, "loss": 1.3471, "mean_token_accuracy": 0.6851365827023983, "num_tokens": 22941862.0, "step": 28498 }, { "epoch": 7.547669491525424, "grad_norm": 2.6412529945373535, "learning_rate": 6.2262976694915255e-06, "loss": 1.1243, "step": 28500 }, { "epoch": 7.547669491525424, "eval_loss": 1.337050199508667, "eval_mean_token_accuracy": 0.6989652915047361, "eval_num_tokens": 22943410.0, "eval_runtime": 48.2956, "eval_samples_per_second": 6.377, "eval_steps_per_second": 6.377, "step": 28500 }, { "epoch": 7.548199152542373, "grad_norm": 2.337733745574951, "learning_rate": 6.226032838983051e-06, "loss": 1.3097, "mean_token_accuracy": 0.7177058681845665, "num_tokens": 22945191.0, "step": 28502 }, { "epoch": 7.548728813559322, "grad_norm": 2.9723379611968994, "learning_rate": 6.225768008474576e-06, "loss": 0.9548, "mean_token_accuracy": 0.7910578697919846, "num_tokens": 22946670.0, "step": 28504 }, { "epoch": 7.549258474576272, "grad_norm": 2.8654773235321045, "learning_rate": 6.225503177966102e-06, "loss": 1.207, "mean_token_accuracy": 0.7040880396962166, "num_tokens": 22949331.0, "step": 28506 }, { "epoch": 7.549788135593221, "grad_norm": 2.6779589653015137, "learning_rate": 6.225238347457627e-06, "loss": 1.534, "mean_token_accuracy": 0.6535671129822731, "num_tokens": 22950894.0, "step": 28508 }, { "epoch": 7.5503177966101696, "grad_norm": 2.599296808242798, "learning_rate": 6.2249735169491535e-06, "loss": 1.2067, "mean_token_accuracy": 0.7215663641691208, "num_tokens": 22952491.0, "step": 28510 }, { "epoch": 7.5508474576271185, "grad_norm": 2.274984359741211, "learning_rate": 6.2247086864406785e-06, "loss": 0.7472, "mean_token_accuracy": 0.8183404579758644, "num_tokens": 22954383.0, "step": 28512 }, { "epoch": 7.5513771186440675, "grad_norm": 2.842404842376709, "learning_rate": 6.224443855932204e-06, "loss": 1.2286, "mean_token_accuracy": 0.720105804502964, "num_tokens": 22955783.0, "step": 28514 }, { "epoch": 7.551906779661017, "grad_norm": 3.2190067768096924, "learning_rate": 6.224179025423729e-06, "loss": 1.4117, "mean_token_accuracy": 0.7025902047753334, "num_tokens": 22957392.0, "step": 28516 }, { "epoch": 7.552436440677966, "grad_norm": 2.7203454971313477, "learning_rate": 6.223914194915255e-06, "loss": 1.0099, "mean_token_accuracy": 0.7345245778560638, "num_tokens": 22958823.0, "step": 28518 }, { "epoch": 7.552966101694915, "grad_norm": 2.2889487743377686, "learning_rate": 6.22364936440678e-06, "loss": 1.207, "mean_token_accuracy": 0.7365457490086555, "num_tokens": 22960379.0, "step": 28520 }, { "epoch": 7.553495762711864, "grad_norm": 2.1553666591644287, "learning_rate": 6.223384533898306e-06, "loss": 1.0034, "mean_token_accuracy": 0.7370342388749123, "num_tokens": 22962096.0, "step": 28522 }, { "epoch": 7.554025423728813, "grad_norm": 2.6373167037963867, "learning_rate": 6.223119703389831e-06, "loss": 0.8021, "mean_token_accuracy": 0.8071837425231934, "num_tokens": 22963646.0, "step": 28524 }, { "epoch": 7.554555084745763, "grad_norm": 2.7174665927886963, "learning_rate": 6.222854872881356e-06, "loss": 1.4401, "mean_token_accuracy": 0.6640900447964668, "num_tokens": 22965520.0, "step": 28526 }, { "epoch": 7.555084745762712, "grad_norm": 2.2393388748168945, "learning_rate": 6.222590042372881e-06, "loss": 0.8133, "mean_token_accuracy": 0.7932971715927124, "num_tokens": 22967292.0, "step": 28528 }, { "epoch": 7.555614406779661, "grad_norm": 2.548518180847168, "learning_rate": 6.222325211864407e-06, "loss": 1.0134, "mean_token_accuracy": 0.7238415852189064, "num_tokens": 22968865.0, "step": 28530 }, { "epoch": 7.55614406779661, "grad_norm": 2.509795904159546, "learning_rate": 6.222060381355933e-06, "loss": 1.2986, "mean_token_accuracy": 0.7334286794066429, "num_tokens": 22970442.0, "step": 28532 }, { "epoch": 7.55667372881356, "grad_norm": 2.358427047729492, "learning_rate": 6.221795550847458e-06, "loss": 1.2556, "mean_token_accuracy": 0.730266198515892, "num_tokens": 22971821.0, "step": 28534 }, { "epoch": 7.557203389830509, "grad_norm": 3.1372437477111816, "learning_rate": 6.221530720338984e-06, "loss": 1.5144, "mean_token_accuracy": 0.6601473614573479, "num_tokens": 22973772.0, "step": 28536 }, { "epoch": 7.557733050847458, "grad_norm": 2.6137564182281494, "learning_rate": 6.221265889830509e-06, "loss": 1.0256, "mean_token_accuracy": 0.753065325319767, "num_tokens": 22975234.0, "step": 28538 }, { "epoch": 7.558262711864407, "grad_norm": 2.3905351161956787, "learning_rate": 6.221001059322035e-06, "loss": 0.9084, "mean_token_accuracy": 0.7913429662585258, "num_tokens": 22976663.0, "step": 28540 }, { "epoch": 7.558792372881356, "grad_norm": 2.665466547012329, "learning_rate": 6.22073622881356e-06, "loss": 1.0847, "mean_token_accuracy": 0.7510837018489838, "num_tokens": 22978099.0, "step": 28542 }, { "epoch": 7.559322033898305, "grad_norm": 2.2767014503479004, "learning_rate": 6.220471398305086e-06, "loss": 0.8972, "mean_token_accuracy": 0.7764066159725189, "num_tokens": 22979665.0, "step": 28544 }, { "epoch": 7.559851694915254, "grad_norm": 2.2600772380828857, "learning_rate": 6.220206567796611e-06, "loss": 0.8955, "mean_token_accuracy": 0.7634903267025948, "num_tokens": 22981061.0, "step": 28546 }, { "epoch": 7.560381355932203, "grad_norm": 2.986999034881592, "learning_rate": 6.2199417372881365e-06, "loss": 1.1447, "mean_token_accuracy": 0.7341133058071136, "num_tokens": 22982258.0, "step": 28548 }, { "epoch": 7.560911016949152, "grad_norm": 2.8903896808624268, "learning_rate": 6.2196769067796614e-06, "loss": 1.3936, "mean_token_accuracy": 0.6700734794139862, "num_tokens": 22983887.0, "step": 28550 }, { "epoch": 7.561440677966102, "grad_norm": 2.2482693195343018, "learning_rate": 6.219412076271187e-06, "loss": 1.3229, "mean_token_accuracy": 0.6997174955904484, "num_tokens": 22985675.0, "step": 28552 }, { "epoch": 7.561970338983051, "grad_norm": 2.9786765575408936, "learning_rate": 6.219147245762712e-06, "loss": 1.2647, "mean_token_accuracy": 0.7020441479980946, "num_tokens": 22986963.0, "step": 28554 }, { "epoch": 7.5625, "grad_norm": 2.9005932807922363, "learning_rate": 6.218882415254238e-06, "loss": 1.2964, "mean_token_accuracy": 0.7017180174589157, "num_tokens": 22988370.0, "step": 28556 }, { "epoch": 7.563029661016949, "grad_norm": 2.2727556228637695, "learning_rate": 6.218617584745763e-06, "loss": 1.2887, "mean_token_accuracy": 0.7208585068583488, "num_tokens": 22990556.0, "step": 28558 }, { "epoch": 7.563559322033898, "grad_norm": 2.5852508544921875, "learning_rate": 6.218352754237289e-06, "loss": 1.0889, "mean_token_accuracy": 0.7250567376613617, "num_tokens": 22991982.0, "step": 28560 }, { "epoch": 7.564088983050848, "grad_norm": 2.8000235557556152, "learning_rate": 6.2180879237288136e-06, "loss": 1.402, "mean_token_accuracy": 0.691822562366724, "num_tokens": 22993438.0, "step": 28562 }, { "epoch": 7.564618644067797, "grad_norm": 2.2958550453186035, "learning_rate": 6.21782309322034e-06, "loss": 1.2541, "mean_token_accuracy": 0.6934717372059822, "num_tokens": 22995417.0, "step": 28564 }, { "epoch": 7.565148305084746, "grad_norm": 2.7098453044891357, "learning_rate": 6.217558262711865e-06, "loss": 0.9942, "mean_token_accuracy": 0.7703171521425247, "num_tokens": 22996964.0, "step": 28566 }, { "epoch": 7.565677966101695, "grad_norm": 2.2275431156158447, "learning_rate": 6.217293432203391e-06, "loss": 0.9231, "mean_token_accuracy": 0.7570513039827347, "num_tokens": 22998578.0, "step": 28568 }, { "epoch": 7.566207627118644, "grad_norm": 3.257279872894287, "learning_rate": 6.217028601694916e-06, "loss": 1.1384, "mean_token_accuracy": 0.761290580034256, "num_tokens": 22999826.0, "step": 28570 }, { "epoch": 7.566737288135593, "grad_norm": 2.5824079513549805, "learning_rate": 6.216763771186442e-06, "loss": 1.3307, "mean_token_accuracy": 0.6988473460078239, "num_tokens": 23001365.0, "step": 28572 }, { "epoch": 7.567266949152542, "grad_norm": 2.181962013244629, "learning_rate": 6.2164989406779665e-06, "loss": 1.0299, "mean_token_accuracy": 0.762511819601059, "num_tokens": 23002922.0, "step": 28574 }, { "epoch": 7.567796610169491, "grad_norm": 5.682751655578613, "learning_rate": 6.216234110169492e-06, "loss": 1.1657, "mean_token_accuracy": 0.7346581742167473, "num_tokens": 23004534.0, "step": 28576 }, { "epoch": 7.56832627118644, "grad_norm": 2.867884635925293, "learning_rate": 6.215969279661017e-06, "loss": 1.4444, "mean_token_accuracy": 0.6577868685126305, "num_tokens": 23006167.0, "step": 28578 }, { "epoch": 7.56885593220339, "grad_norm": 2.5800187587738037, "learning_rate": 6.215704449152543e-06, "loss": 1.3745, "mean_token_accuracy": 0.6934428513050079, "num_tokens": 23007941.0, "step": 28580 }, { "epoch": 7.569385593220339, "grad_norm": 2.8597970008850098, "learning_rate": 6.215439618644068e-06, "loss": 1.2059, "mean_token_accuracy": 0.7184940278530121, "num_tokens": 23009417.0, "step": 28582 }, { "epoch": 7.569915254237288, "grad_norm": 1.5007272958755493, "learning_rate": 6.215174788135594e-06, "loss": 0.755, "mean_token_accuracy": 0.8219963610172272, "num_tokens": 23011758.0, "step": 28584 }, { "epoch": 7.570444915254237, "grad_norm": 2.3415253162384033, "learning_rate": 6.214909957627119e-06, "loss": 1.2048, "mean_token_accuracy": 0.7007321417331696, "num_tokens": 23013798.0, "step": 28586 }, { "epoch": 7.570974576271187, "grad_norm": 2.3259029388427734, "learning_rate": 6.214645127118644e-06, "loss": 1.1098, "mean_token_accuracy": 0.7482909858226776, "num_tokens": 23015234.0, "step": 28588 }, { "epoch": 7.571504237288136, "grad_norm": 2.8251445293426514, "learning_rate": 6.214380296610169e-06, "loss": 1.124, "mean_token_accuracy": 0.7220113500952721, "num_tokens": 23016646.0, "step": 28590 }, { "epoch": 7.572033898305085, "grad_norm": 2.5001845359802246, "learning_rate": 6.214115466101696e-06, "loss": 1.4428, "mean_token_accuracy": 0.688028134405613, "num_tokens": 23018312.0, "step": 28592 }, { "epoch": 7.572563559322034, "grad_norm": 2.8200221061706543, "learning_rate": 6.21385063559322e-06, "loss": 1.4226, "mean_token_accuracy": 0.6745922043919563, "num_tokens": 23019785.0, "step": 28594 }, { "epoch": 7.573093220338983, "grad_norm": 2.323390007019043, "learning_rate": 6.213585805084747e-06, "loss": 0.8951, "mean_token_accuracy": 0.7836525440216064, "num_tokens": 23021205.0, "step": 28596 }, { "epoch": 7.5736228813559325, "grad_norm": 2.840390682220459, "learning_rate": 6.213320974576272e-06, "loss": 1.1976, "mean_token_accuracy": 0.7268722355365753, "num_tokens": 23022522.0, "step": 28598 }, { "epoch": 7.5741525423728815, "grad_norm": 2.33467435836792, "learning_rate": 6.213056144067797e-06, "loss": 0.8561, "mean_token_accuracy": 0.7801102623343468, "num_tokens": 23024095.0, "step": 28600 }, { "epoch": 7.5746822033898304, "grad_norm": 2.4792792797088623, "learning_rate": 6.212791313559322e-06, "loss": 0.6227, "mean_token_accuracy": 0.8299639672040939, "num_tokens": 23025710.0, "step": 28602 }, { "epoch": 7.575211864406779, "grad_norm": 2.6368465423583984, "learning_rate": 6.212526483050848e-06, "loss": 1.0276, "mean_token_accuracy": 0.7315548807382584, "num_tokens": 23027196.0, "step": 28604 }, { "epoch": 7.575741525423728, "grad_norm": 2.3861124515533447, "learning_rate": 6.212261652542373e-06, "loss": 0.9143, "mean_token_accuracy": 0.7491362169384956, "num_tokens": 23028781.0, "step": 28606 }, { "epoch": 7.576271186440678, "grad_norm": 3.219194173812866, "learning_rate": 6.211996822033899e-06, "loss": 1.4919, "mean_token_accuracy": 0.6681220009922981, "num_tokens": 23030256.0, "step": 28608 }, { "epoch": 7.576800847457627, "grad_norm": 2.4017691612243652, "learning_rate": 6.211731991525424e-06, "loss": 1.2975, "mean_token_accuracy": 0.6732756197452545, "num_tokens": 23031909.0, "step": 28610 }, { "epoch": 7.577330508474576, "grad_norm": 2.6546542644500732, "learning_rate": 6.2114671610169495e-06, "loss": 1.4244, "mean_token_accuracy": 0.6746628992259502, "num_tokens": 23033506.0, "step": 28612 }, { "epoch": 7.577860169491525, "grad_norm": 2.849076986312866, "learning_rate": 6.211202330508474e-06, "loss": 1.2908, "mean_token_accuracy": 0.7123292461037636, "num_tokens": 23035343.0, "step": 28614 }, { "epoch": 7.578389830508475, "grad_norm": 3.167559862136841, "learning_rate": 6.2109375e-06, "loss": 1.4649, "mean_token_accuracy": 0.6831313893198967, "num_tokens": 23036881.0, "step": 28616 }, { "epoch": 7.578919491525424, "grad_norm": 2.8620336055755615, "learning_rate": 6.210672669491527e-06, "loss": 1.5005, "mean_token_accuracy": 0.6454959809780121, "num_tokens": 23038186.0, "step": 28618 }, { "epoch": 7.579449152542373, "grad_norm": 2.0332911014556885, "learning_rate": 6.210407838983052e-06, "loss": 1.0172, "mean_token_accuracy": 0.7507378235459328, "num_tokens": 23039558.0, "step": 28620 }, { "epoch": 7.579978813559322, "grad_norm": 1.9921163320541382, "learning_rate": 6.2101430084745775e-06, "loss": 0.7775, "mean_token_accuracy": 0.7813841104507446, "num_tokens": 23041265.0, "step": 28622 }, { "epoch": 7.580508474576272, "grad_norm": 2.1785895824432373, "learning_rate": 6.2098781779661025e-06, "loss": 1.1138, "mean_token_accuracy": 0.7381158322095871, "num_tokens": 23043111.0, "step": 28624 }, { "epoch": 7.581038135593221, "grad_norm": 2.6539483070373535, "learning_rate": 6.209613347457628e-06, "loss": 1.4319, "mean_token_accuracy": 0.6800551638007164, "num_tokens": 23044770.0, "step": 28626 }, { "epoch": 7.5815677966101696, "grad_norm": 2.938894748687744, "learning_rate": 6.209348516949153e-06, "loss": 1.1757, "mean_token_accuracy": 0.6995508894324303, "num_tokens": 23046383.0, "step": 28628 }, { "epoch": 7.5820974576271185, "grad_norm": 2.5918920040130615, "learning_rate": 6.209083686440679e-06, "loss": 1.0547, "mean_token_accuracy": 0.7445913627743721, "num_tokens": 23048036.0, "step": 28630 }, { "epoch": 7.5826271186440675, "grad_norm": 2.336745262145996, "learning_rate": 6.208818855932204e-06, "loss": 1.0301, "mean_token_accuracy": 0.7686675265431404, "num_tokens": 23049686.0, "step": 28632 }, { "epoch": 7.583156779661017, "grad_norm": 2.4054160118103027, "learning_rate": 6.20855402542373e-06, "loss": 1.2354, "mean_token_accuracy": 0.6928936615586281, "num_tokens": 23051508.0, "step": 28634 }, { "epoch": 7.583686440677966, "grad_norm": 2.359879493713379, "learning_rate": 6.2082891949152546e-06, "loss": 1.2563, "mean_token_accuracy": 0.6979847177863121, "num_tokens": 23053150.0, "step": 28636 }, { "epoch": 7.584216101694915, "grad_norm": 3.7877933979034424, "learning_rate": 6.20802436440678e-06, "loss": 1.5115, "mean_token_accuracy": 0.6589865684509277, "num_tokens": 23054625.0, "step": 28638 }, { "epoch": 7.584745762711864, "grad_norm": 2.5065910816192627, "learning_rate": 6.207759533898305e-06, "loss": 0.7423, "mean_token_accuracy": 0.7891513854265213, "num_tokens": 23056125.0, "step": 28640 }, { "epoch": 7.585275423728813, "grad_norm": 2.836930990219116, "learning_rate": 6.207494703389831e-06, "loss": 1.3138, "mean_token_accuracy": 0.692231573164463, "num_tokens": 23057842.0, "step": 28642 }, { "epoch": 7.585805084745763, "grad_norm": 2.626889705657959, "learning_rate": 6.207229872881356e-06, "loss": 1.2103, "mean_token_accuracy": 0.7259850427508354, "num_tokens": 23059532.0, "step": 28644 }, { "epoch": 7.586334745762712, "grad_norm": 2.6206471920013428, "learning_rate": 6.206965042372883e-06, "loss": 1.2182, "mean_token_accuracy": 0.7259177938103676, "num_tokens": 23061195.0, "step": 28646 }, { "epoch": 7.586864406779661, "grad_norm": 3.022749900817871, "learning_rate": 6.206700211864407e-06, "loss": 1.8812, "mean_token_accuracy": 0.6017995662987232, "num_tokens": 23063073.0, "step": 28648 }, { "epoch": 7.58739406779661, "grad_norm": 2.929018020629883, "learning_rate": 6.206435381355933e-06, "loss": 0.9211, "mean_token_accuracy": 0.7527078166604042, "num_tokens": 23064484.0, "step": 28650 }, { "epoch": 7.58792372881356, "grad_norm": 2.6619765758514404, "learning_rate": 6.206170550847458e-06, "loss": 0.9065, "mean_token_accuracy": 0.7709757834672928, "num_tokens": 23065806.0, "step": 28652 }, { "epoch": 7.588453389830509, "grad_norm": 3.202012538909912, "learning_rate": 6.205905720338984e-06, "loss": 1.2012, "mean_token_accuracy": 0.7258397713303566, "num_tokens": 23067212.0, "step": 28654 }, { "epoch": 7.588983050847458, "grad_norm": 2.965120315551758, "learning_rate": 6.205640889830509e-06, "loss": 1.2349, "mean_token_accuracy": 0.7182354852557182, "num_tokens": 23068806.0, "step": 28656 }, { "epoch": 7.589512711864407, "grad_norm": 2.1466786861419678, "learning_rate": 6.205376059322035e-06, "loss": 0.654, "mean_token_accuracy": 0.8349296227097511, "num_tokens": 23070555.0, "step": 28658 }, { "epoch": 7.590042372881356, "grad_norm": 2.428699254989624, "learning_rate": 6.20511122881356e-06, "loss": 1.3887, "mean_token_accuracy": 0.6690312102437019, "num_tokens": 23072096.0, "step": 28660 }, { "epoch": 7.590572033898305, "grad_norm": 2.6812262535095215, "learning_rate": 6.2048463983050854e-06, "loss": 1.0339, "mean_token_accuracy": 0.7318368181586266, "num_tokens": 23073373.0, "step": 28662 }, { "epoch": 7.591101694915254, "grad_norm": 2.9241390228271484, "learning_rate": 6.20458156779661e-06, "loss": 1.1392, "mean_token_accuracy": 0.7408505007624626, "num_tokens": 23074876.0, "step": 28664 }, { "epoch": 7.591631355932203, "grad_norm": 3.2289671897888184, "learning_rate": 6.204316737288136e-06, "loss": 1.7802, "mean_token_accuracy": 0.6010838896036148, "num_tokens": 23076571.0, "step": 28666 }, { "epoch": 7.592161016949152, "grad_norm": 2.717733144760132, "learning_rate": 6.204051906779661e-06, "loss": 1.4916, "mean_token_accuracy": 0.6790854781866074, "num_tokens": 23078395.0, "step": 28668 }, { "epoch": 7.592690677966102, "grad_norm": 3.046370267868042, "learning_rate": 6.203787076271187e-06, "loss": 1.1979, "mean_token_accuracy": 0.7392072081565857, "num_tokens": 23079748.0, "step": 28670 }, { "epoch": 7.593220338983051, "grad_norm": 2.9447414875030518, "learning_rate": 6.203522245762712e-06, "loss": 1.1541, "mean_token_accuracy": 0.7170230448246002, "num_tokens": 23081167.0, "step": 28672 }, { "epoch": 7.59375, "grad_norm": 3.322838068008423, "learning_rate": 6.203257415254238e-06, "loss": 1.3331, "mean_token_accuracy": 0.689442552626133, "num_tokens": 23082692.0, "step": 28674 }, { "epoch": 7.594279661016949, "grad_norm": 2.7764997482299805, "learning_rate": 6.2029925847457625e-06, "loss": 1.1678, "mean_token_accuracy": 0.736045591533184, "num_tokens": 23084237.0, "step": 28676 }, { "epoch": 7.594809322033898, "grad_norm": 2.789398431777954, "learning_rate": 6.202727754237289e-06, "loss": 1.2446, "mean_token_accuracy": 0.7240805327892303, "num_tokens": 23085753.0, "step": 28678 }, { "epoch": 7.595338983050848, "grad_norm": 2.5568761825561523, "learning_rate": 6.202462923728814e-06, "loss": 1.2626, "mean_token_accuracy": 0.7064447999000549, "num_tokens": 23087469.0, "step": 28680 }, { "epoch": 7.595868644067797, "grad_norm": 2.7939364910125732, "learning_rate": 6.20219809322034e-06, "loss": 1.4791, "mean_token_accuracy": 0.6630422174930573, "num_tokens": 23089255.0, "step": 28682 }, { "epoch": 7.596398305084746, "grad_norm": 2.7944254875183105, "learning_rate": 6.201933262711865e-06, "loss": 1.0763, "mean_token_accuracy": 0.7386712655425072, "num_tokens": 23090572.0, "step": 28684 }, { "epoch": 7.596927966101695, "grad_norm": 3.0460379123687744, "learning_rate": 6.2016684322033905e-06, "loss": 1.269, "mean_token_accuracy": 0.694147564470768, "num_tokens": 23092010.0, "step": 28686 }, { "epoch": 7.597457627118644, "grad_norm": 2.731248617172241, "learning_rate": 6.2014036016949154e-06, "loss": 1.6509, "mean_token_accuracy": 0.6247376203536987, "num_tokens": 23093787.0, "step": 28688 }, { "epoch": 7.597987288135593, "grad_norm": 2.7637124061584473, "learning_rate": 6.201138771186441e-06, "loss": 1.0488, "mean_token_accuracy": 0.7478572502732277, "num_tokens": 23095355.0, "step": 28690 }, { "epoch": 7.598516949152542, "grad_norm": 2.9321305751800537, "learning_rate": 6.200873940677966e-06, "loss": 1.0822, "mean_token_accuracy": 0.7349454909563065, "num_tokens": 23096730.0, "step": 28692 }, { "epoch": 7.599046610169491, "grad_norm": 1.951811671257019, "learning_rate": 6.200609110169492e-06, "loss": 0.9191, "mean_token_accuracy": 0.7755628004670143, "num_tokens": 23098486.0, "step": 28694 }, { "epoch": 7.59957627118644, "grad_norm": 3.0483992099761963, "learning_rate": 6.200344279661017e-06, "loss": 1.2103, "mean_token_accuracy": 0.72055534273386, "num_tokens": 23100024.0, "step": 28696 }, { "epoch": 7.60010593220339, "grad_norm": 2.645625352859497, "learning_rate": 6.200079449152543e-06, "loss": 1.6488, "mean_token_accuracy": 0.6347468048334122, "num_tokens": 23101829.0, "step": 28698 }, { "epoch": 7.600635593220339, "grad_norm": 2.86206316947937, "learning_rate": 6.1998146186440676e-06, "loss": 1.4071, "mean_token_accuracy": 0.68096923828125, "num_tokens": 23103410.0, "step": 28700 }, { "epoch": 7.601165254237288, "grad_norm": 2.6966519355773926, "learning_rate": 6.199549788135593e-06, "loss": 1.0832, "mean_token_accuracy": 0.746501088142395, "num_tokens": 23104927.0, "step": 28702 }, { "epoch": 7.601694915254237, "grad_norm": 2.6640591621398926, "learning_rate": 6.19928495762712e-06, "loss": 0.8732, "mean_token_accuracy": 0.7909225448966026, "num_tokens": 23106445.0, "step": 28704 }, { "epoch": 7.602224576271187, "grad_norm": 2.0405046939849854, "learning_rate": 6.199020127118645e-06, "loss": 0.9949, "mean_token_accuracy": 0.7709813714027405, "num_tokens": 23108175.0, "step": 28706 }, { "epoch": 7.602754237288136, "grad_norm": 2.5281739234924316, "learning_rate": 6.198755296610171e-06, "loss": 1.3064, "mean_token_accuracy": 0.6909221485257149, "num_tokens": 23110015.0, "step": 28708 }, { "epoch": 7.603283898305085, "grad_norm": 3.460510730743408, "learning_rate": 6.198490466101696e-06, "loss": 1.0115, "mean_token_accuracy": 0.769820012152195, "num_tokens": 23111838.0, "step": 28710 }, { "epoch": 7.603813559322034, "grad_norm": 3.333601713180542, "learning_rate": 6.198225635593221e-06, "loss": 1.475, "mean_token_accuracy": 0.6735633015632629, "num_tokens": 23113465.0, "step": 28712 }, { "epoch": 7.604343220338983, "grad_norm": 3.1617178916931152, "learning_rate": 6.197960805084746e-06, "loss": 1.4401, "mean_token_accuracy": 0.6803333386778831, "num_tokens": 23115089.0, "step": 28714 }, { "epoch": 7.6048728813559325, "grad_norm": 4.0233869552612305, "learning_rate": 6.197695974576272e-06, "loss": 1.094, "mean_token_accuracy": 0.7454447969794273, "num_tokens": 23116651.0, "step": 28716 }, { "epoch": 7.6054025423728815, "grad_norm": 1.8719233274459839, "learning_rate": 6.197431144067797e-06, "loss": 1.0556, "mean_token_accuracy": 0.745360940694809, "num_tokens": 23118562.0, "step": 28718 }, { "epoch": 7.6059322033898304, "grad_norm": 2.6710996627807617, "learning_rate": 6.197166313559323e-06, "loss": 0.9883, "mean_token_accuracy": 0.7225740179419518, "num_tokens": 23120375.0, "step": 28720 }, { "epoch": 7.606461864406779, "grad_norm": 3.0548717975616455, "learning_rate": 6.196901483050848e-06, "loss": 1.3421, "mean_token_accuracy": 0.6884158700704575, "num_tokens": 23121774.0, "step": 28722 }, { "epoch": 7.606991525423728, "grad_norm": 2.3866751194000244, "learning_rate": 6.1966366525423735e-06, "loss": 1.1393, "mean_token_accuracy": 0.7073479741811752, "num_tokens": 23123810.0, "step": 28724 }, { "epoch": 7.607521186440678, "grad_norm": 2.948490858078003, "learning_rate": 6.196371822033898e-06, "loss": 1.2642, "mean_token_accuracy": 0.6925201714038849, "num_tokens": 23125552.0, "step": 28726 }, { "epoch": 7.608050847457627, "grad_norm": 2.6003148555755615, "learning_rate": 6.196106991525425e-06, "loss": 1.2148, "mean_token_accuracy": 0.7227680012583733, "num_tokens": 23127071.0, "step": 28728 }, { "epoch": 7.608580508474576, "grad_norm": 2.8898491859436035, "learning_rate": 6.195842161016949e-06, "loss": 1.446, "mean_token_accuracy": 0.6753829568624496, "num_tokens": 23128622.0, "step": 28730 }, { "epoch": 7.609110169491525, "grad_norm": 2.7673659324645996, "learning_rate": 6.195577330508476e-06, "loss": 1.1655, "mean_token_accuracy": 0.7331407517194748, "num_tokens": 23129911.0, "step": 28732 }, { "epoch": 7.609639830508475, "grad_norm": 2.703489065170288, "learning_rate": 6.195312500000001e-06, "loss": 1.1702, "mean_token_accuracy": 0.7441919520497322, "num_tokens": 23131599.0, "step": 28734 }, { "epoch": 7.610169491525424, "grad_norm": 2.7703137397766113, "learning_rate": 6.1950476694915264e-06, "loss": 1.172, "mean_token_accuracy": 0.7319499179720879, "num_tokens": 23133183.0, "step": 28736 }, { "epoch": 7.610699152542373, "grad_norm": 2.6286072731018066, "learning_rate": 6.194782838983051e-06, "loss": 1.142, "mean_token_accuracy": 0.7330624014139175, "num_tokens": 23134723.0, "step": 28738 }, { "epoch": 7.611228813559322, "grad_norm": 2.723698854446411, "learning_rate": 6.194518008474577e-06, "loss": 1.3175, "mean_token_accuracy": 0.6921240091323853, "num_tokens": 23136175.0, "step": 28740 }, { "epoch": 7.611758474576272, "grad_norm": 2.4956438541412354, "learning_rate": 6.194253177966102e-06, "loss": 1.1888, "mean_token_accuracy": 0.7250310033559799, "num_tokens": 23137752.0, "step": 28742 }, { "epoch": 7.612288135593221, "grad_norm": 2.2312378883361816, "learning_rate": 6.193988347457628e-06, "loss": 1.1936, "mean_token_accuracy": 0.7219408378005028, "num_tokens": 23139270.0, "step": 28744 }, { "epoch": 7.6128177966101696, "grad_norm": 2.227999448776245, "learning_rate": 6.193723516949153e-06, "loss": 1.1339, "mean_token_accuracy": 0.7292050644755363, "num_tokens": 23140912.0, "step": 28746 }, { "epoch": 7.6133474576271185, "grad_norm": 2.7289814949035645, "learning_rate": 6.1934586864406786e-06, "loss": 1.2571, "mean_token_accuracy": 0.7225552573800087, "num_tokens": 23142356.0, "step": 28748 }, { "epoch": 7.6138771186440675, "grad_norm": 2.1911611557006836, "learning_rate": 6.1931938559322035e-06, "loss": 1.0827, "step": 28750 }, { "epoch": 7.6138771186440675, "eval_loss": 1.3384778499603271, "eval_mean_token_accuracy": 0.6991076561344134, "eval_num_tokens": 23143932.0, "eval_runtime": 48.918, "eval_samples_per_second": 6.296, "eval_steps_per_second": 6.296, "step": 28750 }, { "epoch": 7.614406779661017, "grad_norm": 2.2263991832733154, "learning_rate": 6.192929025423729e-06, "loss": 0.8079, "mean_token_accuracy": 0.7718689814209938, "num_tokens": 23145543.0, "step": 28752 }, { "epoch": 7.614936440677966, "grad_norm": 2.4795687198638916, "learning_rate": 6.192664194915254e-06, "loss": 1.0129, "mean_token_accuracy": 0.7657807990908623, "num_tokens": 23147285.0, "step": 28754 }, { "epoch": 7.615466101694915, "grad_norm": 2.449899196624756, "learning_rate": 6.19239936440678e-06, "loss": 1.1423, "mean_token_accuracy": 0.7351089268922806, "num_tokens": 23148829.0, "step": 28756 }, { "epoch": 7.615995762711864, "grad_norm": 3.144918203353882, "learning_rate": 6.192134533898305e-06, "loss": 1.1593, "mean_token_accuracy": 0.718065083026886, "num_tokens": 23150313.0, "step": 28758 }, { "epoch": 7.616525423728813, "grad_norm": 2.42075514793396, "learning_rate": 6.1918697033898315e-06, "loss": 0.9688, "mean_token_accuracy": 0.7805681973695755, "num_tokens": 23151587.0, "step": 28760 }, { "epoch": 7.617055084745763, "grad_norm": 2.2524378299713135, "learning_rate": 6.1916048728813564e-06, "loss": 1.1271, "mean_token_accuracy": 0.7709258794784546, "num_tokens": 23153120.0, "step": 28762 }, { "epoch": 7.617584745762712, "grad_norm": 2.721356153488159, "learning_rate": 6.191340042372882e-06, "loss": 1.064, "mean_token_accuracy": 0.7326133921742439, "num_tokens": 23154856.0, "step": 28764 }, { "epoch": 7.618114406779661, "grad_norm": 2.8593873977661133, "learning_rate": 6.191075211864407e-06, "loss": 1.1796, "mean_token_accuracy": 0.7088791355490685, "num_tokens": 23156414.0, "step": 28766 }, { "epoch": 7.61864406779661, "grad_norm": 3.039695978164673, "learning_rate": 6.190810381355933e-06, "loss": 1.147, "mean_token_accuracy": 0.7352884635329247, "num_tokens": 23158049.0, "step": 28768 }, { "epoch": 7.61917372881356, "grad_norm": 2.8332762718200684, "learning_rate": 6.190545550847458e-06, "loss": 0.8529, "mean_token_accuracy": 0.7889354154467583, "num_tokens": 23159621.0, "step": 28770 }, { "epoch": 7.619703389830509, "grad_norm": 2.814647674560547, "learning_rate": 6.190280720338984e-06, "loss": 1.0469, "mean_token_accuracy": 0.7553627863526344, "num_tokens": 23161148.0, "step": 28772 }, { "epoch": 7.620233050847458, "grad_norm": 1.8388091325759888, "learning_rate": 6.1900158898305086e-06, "loss": 0.9221, "mean_token_accuracy": 0.7685612663626671, "num_tokens": 23163686.0, "step": 28774 }, { "epoch": 7.620762711864407, "grad_norm": 2.690999984741211, "learning_rate": 6.189751059322034e-06, "loss": 1.136, "mean_token_accuracy": 0.7297258973121643, "num_tokens": 23165295.0, "step": 28776 }, { "epoch": 7.621292372881356, "grad_norm": 2.212021827697754, "learning_rate": 6.189486228813559e-06, "loss": 0.8412, "mean_token_accuracy": 0.7745092958211899, "num_tokens": 23166679.0, "step": 28778 }, { "epoch": 7.621822033898305, "grad_norm": 2.283292293548584, "learning_rate": 6.189221398305085e-06, "loss": 0.9571, "mean_token_accuracy": 0.7676557824015617, "num_tokens": 23168325.0, "step": 28780 }, { "epoch": 7.622351694915254, "grad_norm": 2.6862542629241943, "learning_rate": 6.18895656779661e-06, "loss": 1.0736, "mean_token_accuracy": 0.7436497658491135, "num_tokens": 23169871.0, "step": 28782 }, { "epoch": 7.622881355932203, "grad_norm": 2.7531580924987793, "learning_rate": 6.188691737288136e-06, "loss": 1.2114, "mean_token_accuracy": 0.7247939631342888, "num_tokens": 23171567.0, "step": 28784 }, { "epoch": 7.623411016949152, "grad_norm": 2.2070960998535156, "learning_rate": 6.188426906779662e-06, "loss": 1.0473, "mean_token_accuracy": 0.782091811299324, "num_tokens": 23173114.0, "step": 28786 }, { "epoch": 7.623940677966102, "grad_norm": 1.9693206548690796, "learning_rate": 6.188162076271187e-06, "loss": 0.7304, "mean_token_accuracy": 0.8086549565196037, "num_tokens": 23175104.0, "step": 28788 }, { "epoch": 7.624470338983051, "grad_norm": 2.838179111480713, "learning_rate": 6.187897245762713e-06, "loss": 0.9901, "mean_token_accuracy": 0.7588059455156326, "num_tokens": 23176439.0, "step": 28790 }, { "epoch": 7.625, "grad_norm": 3.0365357398986816, "learning_rate": 6.187632415254238e-06, "loss": 1.1504, "mean_token_accuracy": 0.7178907468914986, "num_tokens": 23178025.0, "step": 28792 }, { "epoch": 7.625529661016949, "grad_norm": 2.679933786392212, "learning_rate": 6.187367584745764e-06, "loss": 1.2364, "mean_token_accuracy": 0.7110492214560509, "num_tokens": 23179613.0, "step": 28794 }, { "epoch": 7.626059322033898, "grad_norm": 2.8874127864837646, "learning_rate": 6.187102754237289e-06, "loss": 1.612, "mean_token_accuracy": 0.6582372412085533, "num_tokens": 23181368.0, "step": 28796 }, { "epoch": 7.626588983050848, "grad_norm": 2.5192792415618896, "learning_rate": 6.1868379237288145e-06, "loss": 1.1651, "mean_token_accuracy": 0.7263751477003098, "num_tokens": 23183060.0, "step": 28798 }, { "epoch": 7.627118644067797, "grad_norm": 2.6265244483947754, "learning_rate": 6.186573093220339e-06, "loss": 1.0395, "mean_token_accuracy": 0.7491288185119629, "num_tokens": 23184619.0, "step": 28800 }, { "epoch": 7.627648305084746, "grad_norm": 2.2694640159606934, "learning_rate": 6.186308262711865e-06, "loss": 1.0219, "mean_token_accuracy": 0.7581164538860321, "num_tokens": 23185969.0, "step": 28802 }, { "epoch": 7.628177966101695, "grad_norm": 2.3038673400878906, "learning_rate": 6.18604343220339e-06, "loss": 1.0061, "mean_token_accuracy": 0.7611557170748711, "num_tokens": 23187542.0, "step": 28804 }, { "epoch": 7.628707627118644, "grad_norm": 2.8905208110809326, "learning_rate": 6.185778601694916e-06, "loss": 1.4126, "mean_token_accuracy": 0.6864903047680855, "num_tokens": 23189268.0, "step": 28806 }, { "epoch": 7.629237288135593, "grad_norm": 2.8342111110687256, "learning_rate": 6.185513771186441e-06, "loss": 1.2789, "mean_token_accuracy": 0.7167520076036453, "num_tokens": 23190884.0, "step": 28808 }, { "epoch": 7.629766949152542, "grad_norm": 3.2115042209625244, "learning_rate": 6.185248940677967e-06, "loss": 1.3158, "mean_token_accuracy": 0.7005595117807388, "num_tokens": 23192605.0, "step": 28810 }, { "epoch": 7.630296610169491, "grad_norm": 2.6586389541625977, "learning_rate": 6.1849841101694915e-06, "loss": 1.1373, "mean_token_accuracy": 0.729701429605484, "num_tokens": 23194402.0, "step": 28812 }, { "epoch": 7.63082627118644, "grad_norm": 2.755347490310669, "learning_rate": 6.184719279661018e-06, "loss": 1.1363, "mean_token_accuracy": 0.7323609963059425, "num_tokens": 23195828.0, "step": 28814 }, { "epoch": 7.63135593220339, "grad_norm": 2.3352761268615723, "learning_rate": 6.184454449152543e-06, "loss": 0.6659, "mean_token_accuracy": 0.8178574964404106, "num_tokens": 23197258.0, "step": 28816 }, { "epoch": 7.631885593220339, "grad_norm": 2.512976884841919, "learning_rate": 6.184189618644069e-06, "loss": 1.4047, "mean_token_accuracy": 0.6854623407125473, "num_tokens": 23198880.0, "step": 28818 }, { "epoch": 7.632415254237288, "grad_norm": 2.3167874813079834, "learning_rate": 6.183924788135594e-06, "loss": 1.0109, "mean_token_accuracy": 0.7623746320605278, "num_tokens": 23200514.0, "step": 28820 }, { "epoch": 7.632944915254237, "grad_norm": 2.916109561920166, "learning_rate": 6.1836599576271196e-06, "loss": 1.0651, "mean_token_accuracy": 0.7505797520279884, "num_tokens": 23201912.0, "step": 28822 }, { "epoch": 7.633474576271187, "grad_norm": 2.2334535121917725, "learning_rate": 6.1833951271186445e-06, "loss": 1.1935, "mean_token_accuracy": 0.7301368713378906, "num_tokens": 23203627.0, "step": 28824 }, { "epoch": 7.634004237288136, "grad_norm": 2.637617349624634, "learning_rate": 6.18313029661017e-06, "loss": 0.9186, "mean_token_accuracy": 0.7972455620765686, "num_tokens": 23205260.0, "step": 28826 }, { "epoch": 7.634533898305085, "grad_norm": 2.7136380672454834, "learning_rate": 6.182865466101695e-06, "loss": 1.2146, "mean_token_accuracy": 0.7391020655632019, "num_tokens": 23206822.0, "step": 28828 }, { "epoch": 7.635063559322034, "grad_norm": 2.925591230392456, "learning_rate": 6.182600635593221e-06, "loss": 1.2042, "mean_token_accuracy": 0.7581159770488739, "num_tokens": 23208196.0, "step": 28830 }, { "epoch": 7.635593220338983, "grad_norm": 2.348757028579712, "learning_rate": 6.182335805084746e-06, "loss": 1.282, "mean_token_accuracy": 0.7286501303315163, "num_tokens": 23209732.0, "step": 28832 }, { "epoch": 7.6361228813559325, "grad_norm": 2.7381744384765625, "learning_rate": 6.182070974576272e-06, "loss": 1.0319, "mean_token_accuracy": 0.751303069293499, "num_tokens": 23211301.0, "step": 28834 }, { "epoch": 7.6366525423728815, "grad_norm": 2.4732213020324707, "learning_rate": 6.181806144067797e-06, "loss": 1.136, "mean_token_accuracy": 0.7441712394356728, "num_tokens": 23212930.0, "step": 28836 }, { "epoch": 7.6371822033898304, "grad_norm": 2.5555996894836426, "learning_rate": 6.181541313559322e-06, "loss": 1.3286, "mean_token_accuracy": 0.7169641852378845, "num_tokens": 23214746.0, "step": 28838 }, { "epoch": 7.637711864406779, "grad_norm": 2.627791404724121, "learning_rate": 6.181276483050847e-06, "loss": 1.652, "mean_token_accuracy": 0.6249661222100258, "num_tokens": 23216393.0, "step": 28840 }, { "epoch": 7.638241525423728, "grad_norm": 2.595165729522705, "learning_rate": 6.181011652542374e-06, "loss": 1.1424, "mean_token_accuracy": 0.7444952875375748, "num_tokens": 23217903.0, "step": 28842 }, { "epoch": 7.638771186440678, "grad_norm": 2.533723831176758, "learning_rate": 6.180746822033898e-06, "loss": 1.0303, "mean_token_accuracy": 0.7531117498874664, "num_tokens": 23219552.0, "step": 28844 }, { "epoch": 7.639300847457627, "grad_norm": 2.6958069801330566, "learning_rate": 6.180481991525425e-06, "loss": 1.3784, "mean_token_accuracy": 0.6916334442794323, "num_tokens": 23221181.0, "step": 28846 }, { "epoch": 7.639830508474576, "grad_norm": 3.332916736602783, "learning_rate": 6.18021716101695e-06, "loss": 1.3194, "mean_token_accuracy": 0.6851604655385017, "num_tokens": 23222867.0, "step": 28848 }, { "epoch": 7.640360169491525, "grad_norm": 3.2468409538269043, "learning_rate": 6.179952330508475e-06, "loss": 1.0536, "mean_token_accuracy": 0.7442777380347252, "num_tokens": 23224459.0, "step": 28850 }, { "epoch": 7.640889830508475, "grad_norm": 2.805539608001709, "learning_rate": 6.1796875e-06, "loss": 1.1732, "mean_token_accuracy": 0.7204864844679832, "num_tokens": 23226008.0, "step": 28852 }, { "epoch": 7.641419491525424, "grad_norm": 2.864442825317383, "learning_rate": 6.179422669491526e-06, "loss": 1.3607, "mean_token_accuracy": 0.6990146562457085, "num_tokens": 23227620.0, "step": 28854 }, { "epoch": 7.641949152542373, "grad_norm": 2.490384578704834, "learning_rate": 6.179157838983051e-06, "loss": 1.4067, "mean_token_accuracy": 0.7096929997205734, "num_tokens": 23229247.0, "step": 28856 }, { "epoch": 7.642478813559322, "grad_norm": 2.878878593444824, "learning_rate": 6.178893008474577e-06, "loss": 1.3617, "mean_token_accuracy": 0.6862645670771599, "num_tokens": 23231173.0, "step": 28858 }, { "epoch": 7.643008474576272, "grad_norm": 2.5794222354888916, "learning_rate": 6.178628177966102e-06, "loss": 1.1124, "mean_token_accuracy": 0.7277038134634495, "num_tokens": 23233129.0, "step": 28860 }, { "epoch": 7.643538135593221, "grad_norm": 3.420322895050049, "learning_rate": 6.1783633474576275e-06, "loss": 1.5327, "mean_token_accuracy": 0.6714323610067368, "num_tokens": 23234626.0, "step": 28862 }, { "epoch": 7.6440677966101696, "grad_norm": 2.714463949203491, "learning_rate": 6.178098516949152e-06, "loss": 1.0695, "mean_token_accuracy": 0.7212475314736366, "num_tokens": 23236268.0, "step": 28864 }, { "epoch": 7.6445974576271185, "grad_norm": 2.7180440425872803, "learning_rate": 6.177833686440678e-06, "loss": 1.3671, "mean_token_accuracy": 0.6796343773603439, "num_tokens": 23238108.0, "step": 28866 }, { "epoch": 7.6451271186440675, "grad_norm": 2.98521089553833, "learning_rate": 6.177568855932203e-06, "loss": 1.1953, "mean_token_accuracy": 0.7461206614971161, "num_tokens": 23239520.0, "step": 28868 }, { "epoch": 7.645656779661017, "grad_norm": 3.0126404762268066, "learning_rate": 6.17730402542373e-06, "loss": 1.3807, "mean_token_accuracy": 0.6816997826099396, "num_tokens": 23241209.0, "step": 28870 }, { "epoch": 7.646186440677966, "grad_norm": 2.6648874282836914, "learning_rate": 6.1770391949152555e-06, "loss": 1.2135, "mean_token_accuracy": 0.7209179699420929, "num_tokens": 23242587.0, "step": 28872 }, { "epoch": 7.646716101694915, "grad_norm": 2.8257129192352295, "learning_rate": 6.1767743644067804e-06, "loss": 1.3431, "mean_token_accuracy": 0.6985660865902901, "num_tokens": 23244060.0, "step": 28874 }, { "epoch": 7.647245762711864, "grad_norm": 2.522075891494751, "learning_rate": 6.176509533898306e-06, "loss": 1.0075, "mean_token_accuracy": 0.7593702301383018, "num_tokens": 23245503.0, "step": 28876 }, { "epoch": 7.647775423728813, "grad_norm": 2.7578163146972656, "learning_rate": 6.176244703389831e-06, "loss": 1.3209, "mean_token_accuracy": 0.6859518140554428, "num_tokens": 23247032.0, "step": 28878 }, { "epoch": 7.648305084745763, "grad_norm": 2.593048334121704, "learning_rate": 6.175979872881357e-06, "loss": 1.1024, "mean_token_accuracy": 0.740915983915329, "num_tokens": 23248317.0, "step": 28880 }, { "epoch": 7.648834745762712, "grad_norm": 2.444350481033325, "learning_rate": 6.175715042372882e-06, "loss": 0.6694, "mean_token_accuracy": 0.815679520368576, "num_tokens": 23250025.0, "step": 28882 }, { "epoch": 7.649364406779661, "grad_norm": 2.2150461673736572, "learning_rate": 6.175450211864408e-06, "loss": 1.0602, "mean_token_accuracy": 0.7496110498905182, "num_tokens": 23251543.0, "step": 28884 }, { "epoch": 7.64989406779661, "grad_norm": 2.7157089710235596, "learning_rate": 6.1751853813559326e-06, "loss": 0.9345, "mean_token_accuracy": 0.7739507257938385, "num_tokens": 23252965.0, "step": 28886 }, { "epoch": 7.65042372881356, "grad_norm": 2.124499559402466, "learning_rate": 6.174920550847458e-06, "loss": 0.9254, "mean_token_accuracy": 0.742513470351696, "num_tokens": 23255513.0, "step": 28888 }, { "epoch": 7.650953389830509, "grad_norm": 2.781053304672241, "learning_rate": 6.174655720338983e-06, "loss": 1.0801, "mean_token_accuracy": 0.7690970376133919, "num_tokens": 23256972.0, "step": 28890 }, { "epoch": 7.651483050847458, "grad_norm": 3.080352306365967, "learning_rate": 6.174390889830509e-06, "loss": 1.1994, "mean_token_accuracy": 0.7414596527814865, "num_tokens": 23258437.0, "step": 28892 }, { "epoch": 7.652012711864407, "grad_norm": 2.620351552963257, "learning_rate": 6.174126059322034e-06, "loss": 1.2452, "mean_token_accuracy": 0.7154684364795685, "num_tokens": 23260033.0, "step": 28894 }, { "epoch": 7.652542372881356, "grad_norm": 2.857823133468628, "learning_rate": 6.173861228813561e-06, "loss": 1.0611, "mean_token_accuracy": 0.7410604581236839, "num_tokens": 23261536.0, "step": 28896 }, { "epoch": 7.653072033898305, "grad_norm": 2.5117318630218506, "learning_rate": 6.173596398305085e-06, "loss": 1.3403, "mean_token_accuracy": 0.7199429497122765, "num_tokens": 23263058.0, "step": 28898 }, { "epoch": 7.653601694915254, "grad_norm": 2.3897087574005127, "learning_rate": 6.173331567796611e-06, "loss": 1.2109, "mean_token_accuracy": 0.7057674154639244, "num_tokens": 23264565.0, "step": 28900 }, { "epoch": 7.654131355932203, "grad_norm": 3.2720165252685547, "learning_rate": 6.173066737288136e-06, "loss": 0.9386, "mean_token_accuracy": 0.7748789712786674, "num_tokens": 23266173.0, "step": 28902 }, { "epoch": 7.654661016949152, "grad_norm": 2.5166618824005127, "learning_rate": 6.172801906779662e-06, "loss": 1.4598, "mean_token_accuracy": 0.6793062388896942, "num_tokens": 23267776.0, "step": 28904 }, { "epoch": 7.655190677966102, "grad_norm": 2.9505348205566406, "learning_rate": 6.172537076271187e-06, "loss": 1.4097, "mean_token_accuracy": 0.6599445268511772, "num_tokens": 23269195.0, "step": 28906 }, { "epoch": 7.655720338983051, "grad_norm": 2.570249319076538, "learning_rate": 6.172272245762713e-06, "loss": 1.2235, "mean_token_accuracy": 0.6978493332862854, "num_tokens": 23270788.0, "step": 28908 }, { "epoch": 7.65625, "grad_norm": 2.8926308155059814, "learning_rate": 6.172007415254238e-06, "loss": 1.3033, "mean_token_accuracy": 0.7084280997514725, "num_tokens": 23272280.0, "step": 28910 }, { "epoch": 7.656779661016949, "grad_norm": 2.7346832752227783, "learning_rate": 6.171742584745763e-06, "loss": 1.1451, "mean_token_accuracy": 0.7211497947573662, "num_tokens": 23273629.0, "step": 28912 }, { "epoch": 7.657309322033898, "grad_norm": 2.4753854274749756, "learning_rate": 6.171477754237288e-06, "loss": 1.1863, "mean_token_accuracy": 0.718725174665451, "num_tokens": 23275341.0, "step": 28914 }, { "epoch": 7.657838983050848, "grad_norm": 2.373363733291626, "learning_rate": 6.171212923728814e-06, "loss": 0.8568, "mean_token_accuracy": 0.7881125882267952, "num_tokens": 23276800.0, "step": 28916 }, { "epoch": 7.658368644067797, "grad_norm": 3.182140350341797, "learning_rate": 6.170948093220339e-06, "loss": 1.2633, "mean_token_accuracy": 0.6958927884697914, "num_tokens": 23278388.0, "step": 28918 }, { "epoch": 7.658898305084746, "grad_norm": 2.614258050918579, "learning_rate": 6.170683262711865e-06, "loss": 1.3968, "mean_token_accuracy": 0.7115992084145546, "num_tokens": 23279906.0, "step": 28920 }, { "epoch": 7.659427966101695, "grad_norm": 2.3343558311462402, "learning_rate": 6.17041843220339e-06, "loss": 1.4635, "mean_token_accuracy": 0.6645021885633469, "num_tokens": 23281656.0, "step": 28922 }, { "epoch": 7.659957627118644, "grad_norm": 2.452357769012451, "learning_rate": 6.170153601694916e-06, "loss": 1.0237, "mean_token_accuracy": 0.7446954250335693, "num_tokens": 23283303.0, "step": 28924 }, { "epoch": 7.660487288135593, "grad_norm": 2.3576719760894775, "learning_rate": 6.1698887711864405e-06, "loss": 0.7808, "mean_token_accuracy": 0.7849275395274162, "num_tokens": 23284970.0, "step": 28926 }, { "epoch": 7.661016949152542, "grad_norm": 2.8701257705688477, "learning_rate": 6.169623940677967e-06, "loss": 1.3382, "mean_token_accuracy": 0.7259666100144386, "num_tokens": 23286344.0, "step": 28928 }, { "epoch": 7.661546610169491, "grad_norm": 2.476954460144043, "learning_rate": 6.169359110169492e-06, "loss": 1.4617, "mean_token_accuracy": 0.6888571605086327, "num_tokens": 23287958.0, "step": 28930 }, { "epoch": 7.66207627118644, "grad_norm": 2.7259275913238525, "learning_rate": 6.169094279661018e-06, "loss": 1.1828, "mean_token_accuracy": 0.733015350997448, "num_tokens": 23289177.0, "step": 28932 }, { "epoch": 7.66260593220339, "grad_norm": 2.6539721488952637, "learning_rate": 6.168829449152543e-06, "loss": 1.2613, "mean_token_accuracy": 0.7079657539725304, "num_tokens": 23290661.0, "step": 28934 }, { "epoch": 7.663135593220339, "grad_norm": 2.85611891746521, "learning_rate": 6.1685646186440685e-06, "loss": 1.3652, "mean_token_accuracy": 0.6986339949071407, "num_tokens": 23292338.0, "step": 28936 }, { "epoch": 7.663665254237288, "grad_norm": 2.089280843734741, "learning_rate": 6.168299788135593e-06, "loss": 0.8889, "mean_token_accuracy": 0.7931355535984039, "num_tokens": 23293838.0, "step": 28938 }, { "epoch": 7.664194915254237, "grad_norm": 2.509232759475708, "learning_rate": 6.168034957627119e-06, "loss": 1.4047, "mean_token_accuracy": 0.6774246618151665, "num_tokens": 23295624.0, "step": 28940 }, { "epoch": 7.664724576271187, "grad_norm": 3.013608932495117, "learning_rate": 6.167770127118644e-06, "loss": 1.6411, "mean_token_accuracy": 0.6341440677642822, "num_tokens": 23297306.0, "step": 28942 }, { "epoch": 7.665254237288136, "grad_norm": 2.7955214977264404, "learning_rate": 6.16750529661017e-06, "loss": 1.4811, "mean_token_accuracy": 0.6939040869474411, "num_tokens": 23298964.0, "step": 28944 }, { "epoch": 7.665783898305085, "grad_norm": 2.5505645275115967, "learning_rate": 6.167240466101695e-06, "loss": 1.1663, "mean_token_accuracy": 0.7497569695115089, "num_tokens": 23300450.0, "step": 28946 }, { "epoch": 7.666313559322034, "grad_norm": 3.0818586349487305, "learning_rate": 6.166975635593221e-06, "loss": 1.2658, "mean_token_accuracy": 0.7249124422669411, "num_tokens": 23302140.0, "step": 28948 }, { "epoch": 7.666843220338983, "grad_norm": 3.386685609817505, "learning_rate": 6.1667108050847455e-06, "loss": 1.1871, "mean_token_accuracy": 0.7156982645392418, "num_tokens": 23303479.0, "step": 28950 }, { "epoch": 7.6673728813559325, "grad_norm": 2.1915838718414307, "learning_rate": 6.166445974576271e-06, "loss": 1.1368, "mean_token_accuracy": 0.7166100814938545, "num_tokens": 23305366.0, "step": 28952 }, { "epoch": 7.6679025423728815, "grad_norm": 2.825099229812622, "learning_rate": 6.166181144067798e-06, "loss": 1.3489, "mean_token_accuracy": 0.7094222791492939, "num_tokens": 23306953.0, "step": 28954 }, { "epoch": 7.6684322033898304, "grad_norm": 2.6779239177703857, "learning_rate": 6.165916313559323e-06, "loss": 1.2908, "mean_token_accuracy": 0.687568224966526, "num_tokens": 23308500.0, "step": 28956 }, { "epoch": 7.668961864406779, "grad_norm": 2.198552131652832, "learning_rate": 6.165651483050849e-06, "loss": 1.2671, "mean_token_accuracy": 0.7309446334838867, "num_tokens": 23309905.0, "step": 28958 }, { "epoch": 7.669491525423728, "grad_norm": 2.8277320861816406, "learning_rate": 6.1653866525423736e-06, "loss": 1.0814, "mean_token_accuracy": 0.7523138970136642, "num_tokens": 23311563.0, "step": 28960 }, { "epoch": 7.670021186440678, "grad_norm": 2.970715284347534, "learning_rate": 6.165121822033899e-06, "loss": 1.4115, "mean_token_accuracy": 0.6730294823646545, "num_tokens": 23313330.0, "step": 28962 }, { "epoch": 7.670550847457627, "grad_norm": 2.676422119140625, "learning_rate": 6.164856991525424e-06, "loss": 1.0357, "mean_token_accuracy": 0.7459382563829422, "num_tokens": 23315113.0, "step": 28964 }, { "epoch": 7.671080508474576, "grad_norm": 2.251688003540039, "learning_rate": 6.16459216101695e-06, "loss": 0.9632, "mean_token_accuracy": 0.7553350776433945, "num_tokens": 23317001.0, "step": 28966 }, { "epoch": 7.671610169491525, "grad_norm": 2.615062952041626, "learning_rate": 6.164327330508475e-06, "loss": 1.3521, "mean_token_accuracy": 0.7144247964024544, "num_tokens": 23318672.0, "step": 28968 }, { "epoch": 7.672139830508475, "grad_norm": 2.7464845180511475, "learning_rate": 6.164062500000001e-06, "loss": 1.0919, "mean_token_accuracy": 0.749298021197319, "num_tokens": 23320099.0, "step": 28970 }, { "epoch": 7.672669491525424, "grad_norm": 2.64976167678833, "learning_rate": 6.163797669491526e-06, "loss": 1.1757, "mean_token_accuracy": 0.7177671566605568, "num_tokens": 23321614.0, "step": 28972 }, { "epoch": 7.673199152542373, "grad_norm": 2.511121988296509, "learning_rate": 6.1635328389830515e-06, "loss": 1.1682, "mean_token_accuracy": 0.7200159877538681, "num_tokens": 23323455.0, "step": 28974 }, { "epoch": 7.673728813559322, "grad_norm": 2.3641726970672607, "learning_rate": 6.163268008474576e-06, "loss": 1.0577, "mean_token_accuracy": 0.7522282265126705, "num_tokens": 23324940.0, "step": 28976 }, { "epoch": 7.674258474576272, "grad_norm": 2.9942774772644043, "learning_rate": 6.163003177966103e-06, "loss": 1.1509, "mean_token_accuracy": 0.7464409992098808, "num_tokens": 23326375.0, "step": 28978 }, { "epoch": 7.674788135593221, "grad_norm": 2.93774676322937, "learning_rate": 6.162738347457627e-06, "loss": 1.552, "mean_token_accuracy": 0.6715504825115204, "num_tokens": 23327995.0, "step": 28980 }, { "epoch": 7.6753177966101696, "grad_norm": 2.5686497688293457, "learning_rate": 6.162473516949154e-06, "loss": 0.9028, "mean_token_accuracy": 0.7724381238222122, "num_tokens": 23329304.0, "step": 28982 }, { "epoch": 7.6758474576271185, "grad_norm": 2.9155325889587402, "learning_rate": 6.162208686440679e-06, "loss": 1.1406, "mean_token_accuracy": 0.7446270361542702, "num_tokens": 23330694.0, "step": 28984 }, { "epoch": 7.6763771186440675, "grad_norm": 2.8496146202087402, "learning_rate": 6.161943855932204e-06, "loss": 1.2063, "mean_token_accuracy": 0.7102869004011154, "num_tokens": 23332185.0, "step": 28986 }, { "epoch": 7.676906779661017, "grad_norm": 3.026737689971924, "learning_rate": 6.161679025423729e-06, "loss": 1.3147, "mean_token_accuracy": 0.6782831773161888, "num_tokens": 23333759.0, "step": 28988 }, { "epoch": 7.677436440677966, "grad_norm": 2.4008617401123047, "learning_rate": 6.161414194915255e-06, "loss": 0.7684, "mean_token_accuracy": 0.7938967123627663, "num_tokens": 23335360.0, "step": 28990 }, { "epoch": 7.677966101694915, "grad_norm": 2.3415329456329346, "learning_rate": 6.16114936440678e-06, "loss": 1.4634, "mean_token_accuracy": 0.6660221815109253, "num_tokens": 23336870.0, "step": 28992 }, { "epoch": 7.678495762711864, "grad_norm": 2.0882623195648193, "learning_rate": 6.160884533898306e-06, "loss": 0.7536, "mean_token_accuracy": 0.8019564747810364, "num_tokens": 23338542.0, "step": 28994 }, { "epoch": 7.679025423728813, "grad_norm": 3.3045473098754883, "learning_rate": 6.160619703389831e-06, "loss": 0.9325, "mean_token_accuracy": 0.7632784694433212, "num_tokens": 23340324.0, "step": 28996 }, { "epoch": 7.679555084745763, "grad_norm": 2.576768398284912, "learning_rate": 6.1603548728813565e-06, "loss": 1.1852, "mean_token_accuracy": 0.7291794866323471, "num_tokens": 23341850.0, "step": 28998 }, { "epoch": 7.680084745762712, "grad_norm": 2.8783984184265137, "learning_rate": 6.1600900423728815e-06, "loss": 1.0901, "step": 29000 }, { "epoch": 7.680084745762712, "eval_loss": 1.3363978862762451, "eval_mean_token_accuracy": 0.6987746587240851, "eval_num_tokens": 23343436.0, "eval_runtime": 48.0815, "eval_samples_per_second": 6.406, "eval_steps_per_second": 6.406, "step": 29000 } ], "logging_steps": 2, "max_steps": 75520, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0617185950286807e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }