{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.420545746388443, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04012841091492777, "grad_norm": 86.5052261352539, "learning_rate": 5.750000000000001e-07, "loss": 2.891, "step": 50 }, { "epoch": 0.08025682182985554, "grad_norm": 44.9648551940918, "learning_rate": 1.2000000000000002e-06, "loss": 2.0127, "step": 100 }, { "epoch": 0.12038523274478331, "grad_norm": 32.67415237426758, "learning_rate": 1.825e-06, "loss": 1.3251, "step": 150 }, { "epoch": 0.16051364365971107, "grad_norm": 33.664764404296875, "learning_rate": 2.4500000000000003e-06, "loss": 1.139, "step": 200 }, { "epoch": 0.20064205457463885, "grad_norm": 38.86428451538086, "learning_rate": 3.075e-06, "loss": 0.9913, "step": 250 }, { "epoch": 0.24077046548956663, "grad_norm": 38.11140441894531, "learning_rate": 3.7e-06, "loss": 0.9344, "step": 300 }, { "epoch": 0.2808988764044944, "grad_norm": 39.1014518737793, "learning_rate": 4.325e-06, "loss": 0.8068, "step": 350 }, { "epoch": 0.32102728731942215, "grad_norm": 19.286678314208984, "learning_rate": 4.95e-06, "loss": 0.7104, "step": 400 }, { "epoch": 0.3611556982343499, "grad_norm": 24.21994972229004, "learning_rate": 5.575000000000001e-06, "loss": 0.6199, "step": 450 }, { "epoch": 0.4012841091492777, "grad_norm": 23.403148651123047, "learning_rate": 6.200000000000001e-06, "loss": 0.6139, "step": 500 }, { "epoch": 0.44141252006420545, "grad_norm": 21.166723251342773, "learning_rate": 6.825000000000001e-06, "loss": 0.5843, "step": 550 }, { "epoch": 0.48154093097913325, "grad_norm": 18.9536075592041, "learning_rate": 7.450000000000001e-06, "loss": 0.578, "step": 600 }, { "epoch": 0.521669341894061, "grad_norm": 21.438701629638672, "learning_rate": 8.075000000000001e-06, "loss": 0.5505, "step": 650 }, { "epoch": 0.5617977528089888, "grad_norm": 25.26392364501953, "learning_rate": 8.700000000000001e-06, "loss": 0.5649, "step": 700 }, { "epoch": 0.6019261637239165, "grad_norm": 19.77715492248535, "learning_rate": 9.325000000000001e-06, "loss": 0.5839, "step": 750 }, { "epoch": 0.6420545746388443, "grad_norm": 20.142839431762695, "learning_rate": 9.950000000000001e-06, "loss": 0.5518, "step": 800 }, { "epoch": 0.6821829855537721, "grad_norm": 20.156387329101562, "learning_rate": 9.936111111111112e-06, "loss": 0.5514, "step": 850 }, { "epoch": 0.7223113964686998, "grad_norm": 21.246883392333984, "learning_rate": 9.866666666666668e-06, "loss": 0.5125, "step": 900 }, { "epoch": 0.7624398073836276, "grad_norm": 17.57798957824707, "learning_rate": 9.797222222222223e-06, "loss": 0.5129, "step": 950 }, { "epoch": 0.8025682182985554, "grad_norm": 18.979021072387695, "learning_rate": 9.727777777777777e-06, "loss": 0.493, "step": 1000 }, { "epoch": 0.8426966292134831, "grad_norm": 20.188549041748047, "learning_rate": 9.658333333333334e-06, "loss": 0.5158, "step": 1050 }, { "epoch": 0.8828250401284109, "grad_norm": 17.60784149169922, "learning_rate": 9.58888888888889e-06, "loss": 0.4901, "step": 1100 }, { "epoch": 0.9229534510433387, "grad_norm": 38.89186477661133, "learning_rate": 9.519444444444446e-06, "loss": 0.493, "step": 1150 }, { "epoch": 0.9630818619582665, "grad_norm": 20.224136352539062, "learning_rate": 9.450000000000001e-06, "loss": 0.4911, "step": 1200 }, { "epoch": 1.0032102728731942, "grad_norm": 13.886350631713867, "learning_rate": 9.380555555555556e-06, "loss": 0.4649, "step": 1250 }, { "epoch": 1.043338683788122, "grad_norm": 14.338887214660645, "learning_rate": 9.311111111111112e-06, "loss": 0.3867, "step": 1300 }, { "epoch": 1.0834670947030498, "grad_norm": 17.406949996948242, "learning_rate": 9.241666666666668e-06, "loss": 0.3609, "step": 1350 }, { "epoch": 1.1235955056179776, "grad_norm": 19.521095275878906, "learning_rate": 9.172222222222223e-06, "loss": 0.354, "step": 1400 }, { "epoch": 1.1637239165329052, "grad_norm": 14.558744430541992, "learning_rate": 9.102777777777777e-06, "loss": 0.357, "step": 1450 }, { "epoch": 1.203852327447833, "grad_norm": 13.476064682006836, "learning_rate": 9.033333333333334e-06, "loss": 0.3593, "step": 1500 }, { "epoch": 1.2439807383627608, "grad_norm": 15.735109329223633, "learning_rate": 8.96388888888889e-06, "loss": 0.3567, "step": 1550 }, { "epoch": 1.2841091492776886, "grad_norm": 17.4168758392334, "learning_rate": 8.894444444444445e-06, "loss": 0.3534, "step": 1600 }, { "epoch": 1.3242375601926164, "grad_norm": 17.488101959228516, "learning_rate": 8.825000000000001e-06, "loss": 0.3481, "step": 1650 }, { "epoch": 1.3643659711075442, "grad_norm": 14.388931274414062, "learning_rate": 8.755555555555556e-06, "loss": 0.3517, "step": 1700 }, { "epoch": 1.404494382022472, "grad_norm": 14.942325592041016, "learning_rate": 8.686111111111112e-06, "loss": 0.3361, "step": 1750 }, { "epoch": 1.4446227929373996, "grad_norm": 19.468101501464844, "learning_rate": 8.616666666666668e-06, "loss": 0.349, "step": 1800 }, { "epoch": 1.4847512038523274, "grad_norm": 17.254077911376953, "learning_rate": 8.547222222222223e-06, "loss": 0.3593, "step": 1850 }, { "epoch": 1.5248796147672552, "grad_norm": 14.908585548400879, "learning_rate": 8.477777777777778e-06, "loss": 0.3464, "step": 1900 }, { "epoch": 1.565008025682183, "grad_norm": 14.53330135345459, "learning_rate": 8.408333333333334e-06, "loss": 0.3431, "step": 1950 }, { "epoch": 1.6051364365971108, "grad_norm": 16.17108154296875, "learning_rate": 8.33888888888889e-06, "loss": 0.3272, "step": 2000 }, { "epoch": 1.6452648475120384, "grad_norm": 17.363994598388672, "learning_rate": 8.269444444444445e-06, "loss": 0.354, "step": 2050 }, { "epoch": 1.6853932584269664, "grad_norm": 15.006171226501465, "learning_rate": 8.2e-06, "loss": 0.3399, "step": 2100 }, { "epoch": 1.725521669341894, "grad_norm": 19.402164459228516, "learning_rate": 8.130555555555556e-06, "loss": 0.3507, "step": 2150 }, { "epoch": 1.7656500802568218, "grad_norm": 18.973295211791992, "learning_rate": 8.061111111111112e-06, "loss": 0.3384, "step": 2200 }, { "epoch": 1.8057784911717496, "grad_norm": 16.426692962646484, "learning_rate": 7.991666666666668e-06, "loss": 0.3475, "step": 2250 }, { "epoch": 1.8459069020866774, "grad_norm": 15.58637523651123, "learning_rate": 7.922222222222223e-06, "loss": 0.3392, "step": 2300 }, { "epoch": 1.8860353130016052, "grad_norm": 22.870681762695312, "learning_rate": 7.852777777777778e-06, "loss": 0.3379, "step": 2350 }, { "epoch": 1.9261637239165328, "grad_norm": 13.37627124786377, "learning_rate": 7.783333333333334e-06, "loss": 0.3397, "step": 2400 }, { "epoch": 1.9662921348314608, "grad_norm": 13.650712013244629, "learning_rate": 7.71388888888889e-06, "loss": 0.3325, "step": 2450 }, { "epoch": 2.0064205457463884, "grad_norm": 11.761895179748535, "learning_rate": 7.644444444444445e-06, "loss": 0.3198, "step": 2500 }, { "epoch": 2.0465489566613164, "grad_norm": 10.545045852661133, "learning_rate": 7.575e-06, "loss": 0.2247, "step": 2550 }, { "epoch": 2.086677367576244, "grad_norm": 14.525964736938477, "learning_rate": 7.505555555555556e-06, "loss": 0.2106, "step": 2600 }, { "epoch": 2.1268057784911716, "grad_norm": 12.788507461547852, "learning_rate": 7.436111111111112e-06, "loss": 0.2179, "step": 2650 }, { "epoch": 2.1669341894060996, "grad_norm": 15.106952667236328, "learning_rate": 7.3666666666666676e-06, "loss": 0.23, "step": 2700 }, { "epoch": 2.207062600321027, "grad_norm": 14.04389762878418, "learning_rate": 7.297222222222223e-06, "loss": 0.2156, "step": 2750 }, { "epoch": 2.247191011235955, "grad_norm": 17.352598190307617, "learning_rate": 7.227777777777778e-06, "loss": 0.2175, "step": 2800 }, { "epoch": 2.287319422150883, "grad_norm": 11.569601058959961, "learning_rate": 7.158333333333334e-06, "loss": 0.2079, "step": 2850 }, { "epoch": 2.3274478330658104, "grad_norm": 14.02609634399414, "learning_rate": 7.0888888888888894e-06, "loss": 0.2307, "step": 2900 }, { "epoch": 2.3675762439807384, "grad_norm": 15.397391319274902, "learning_rate": 7.019444444444446e-06, "loss": 0.211, "step": 2950 }, { "epoch": 2.407704654895666, "grad_norm": 13.850847244262695, "learning_rate": 6.95e-06, "loss": 0.2115, "step": 3000 }, { "epoch": 2.447833065810594, "grad_norm": 10.497112274169922, "learning_rate": 6.880555555555556e-06, "loss": 0.2071, "step": 3050 }, { "epoch": 2.4879614767255216, "grad_norm": 13.437689781188965, "learning_rate": 6.811111111111111e-06, "loss": 0.2085, "step": 3100 }, { "epoch": 2.5280898876404496, "grad_norm": 17.070728302001953, "learning_rate": 6.741666666666668e-06, "loss": 0.2022, "step": 3150 }, { "epoch": 2.568218298555377, "grad_norm": 14.102376937866211, "learning_rate": 6.672222222222223e-06, "loss": 0.2014, "step": 3200 }, { "epoch": 2.608346709470305, "grad_norm": 12.899041175842285, "learning_rate": 6.602777777777778e-06, "loss": 0.2172, "step": 3250 }, { "epoch": 2.648475120385233, "grad_norm": 15.379789352416992, "learning_rate": 6.533333333333334e-06, "loss": 0.2096, "step": 3300 }, { "epoch": 2.6886035313001604, "grad_norm": 12.808381080627441, "learning_rate": 6.4638888888888895e-06, "loss": 0.2154, "step": 3350 }, { "epoch": 2.7287319422150884, "grad_norm": 13.826408386230469, "learning_rate": 6.394444444444445e-06, "loss": 0.2176, "step": 3400 }, { "epoch": 2.768860353130016, "grad_norm": 14.112689971923828, "learning_rate": 6.3250000000000004e-06, "loss": 0.2079, "step": 3450 }, { "epoch": 2.808988764044944, "grad_norm": 11.431640625, "learning_rate": 6.255555555555556e-06, "loss": 0.2152, "step": 3500 }, { "epoch": 2.8491171749598716, "grad_norm": 17.434322357177734, "learning_rate": 6.186111111111111e-06, "loss": 0.189, "step": 3550 }, { "epoch": 2.889245585874799, "grad_norm": 15.394646644592285, "learning_rate": 6.116666666666668e-06, "loss": 0.2078, "step": 3600 }, { "epoch": 2.929373996789727, "grad_norm": 19.227445602416992, "learning_rate": 6.047222222222223e-06, "loss": 0.2104, "step": 3650 }, { "epoch": 2.969502407704655, "grad_norm": 14.099166870117188, "learning_rate": 5.977777777777778e-06, "loss": 0.2252, "step": 3700 }, { "epoch": 3.009630818619583, "grad_norm": 11.261530876159668, "learning_rate": 5.908333333333334e-06, "loss": 0.1895, "step": 3750 }, { "epoch": 3.0497592295345104, "grad_norm": 8.676029205322266, "learning_rate": 5.8388888888888895e-06, "loss": 0.133, "step": 3800 }, { "epoch": 3.0898876404494384, "grad_norm": 8.65832233428955, "learning_rate": 5.769444444444445e-06, "loss": 0.1152, "step": 3850 }, { "epoch": 3.130016051364366, "grad_norm": 9.719037055969238, "learning_rate": 5.7e-06, "loss": 0.1261, "step": 3900 }, { "epoch": 3.1701444622792936, "grad_norm": 12.145017623901367, "learning_rate": 5.630555555555556e-06, "loss": 0.1216, "step": 3950 }, { "epoch": 3.2102728731942216, "grad_norm": 10.67035961151123, "learning_rate": 5.561111111111111e-06, "loss": 0.1341, "step": 4000 }, { "epoch": 3.250401284109149, "grad_norm": 13.00109577178955, "learning_rate": 5.491666666666668e-06, "loss": 0.1294, "step": 4050 }, { "epoch": 3.290529695024077, "grad_norm": 34.79175567626953, "learning_rate": 5.422222222222223e-06, "loss": 0.1287, "step": 4100 }, { "epoch": 3.330658105939005, "grad_norm": 8.047740936279297, "learning_rate": 5.352777777777778e-06, "loss": 0.1282, "step": 4150 }, { "epoch": 3.370786516853933, "grad_norm": 10.46933364868164, "learning_rate": 5.283333333333333e-06, "loss": 0.126, "step": 4200 }, { "epoch": 3.4109149277688604, "grad_norm": 9.083063125610352, "learning_rate": 5.21388888888889e-06, "loss": 0.1322, "step": 4250 }, { "epoch": 3.451043338683788, "grad_norm": 10.835976600646973, "learning_rate": 5.144444444444445e-06, "loss": 0.1393, "step": 4300 }, { "epoch": 3.491171749598716, "grad_norm": 9.417113304138184, "learning_rate": 5.075e-06, "loss": 0.1322, "step": 4350 }, { "epoch": 3.5313001605136436, "grad_norm": 11.396635055541992, "learning_rate": 5.005555555555556e-06, "loss": 0.1369, "step": 4400 }, { "epoch": 3.571428571428571, "grad_norm": 16.883840560913086, "learning_rate": 4.9361111111111115e-06, "loss": 0.1301, "step": 4450 }, { "epoch": 3.611556982343499, "grad_norm": 16.863872528076172, "learning_rate": 4.866666666666667e-06, "loss": 0.1383, "step": 4500 }, { "epoch": 3.6516853932584272, "grad_norm": 11.84510612487793, "learning_rate": 4.797222222222222e-06, "loss": 0.1288, "step": 4550 }, { "epoch": 3.691813804173355, "grad_norm": 10.211877822875977, "learning_rate": 4.727777777777779e-06, "loss": 0.1327, "step": 4600 }, { "epoch": 3.7319422150882824, "grad_norm": 11.919416427612305, "learning_rate": 4.658333333333333e-06, "loss": 0.1306, "step": 4650 }, { "epoch": 3.7720706260032104, "grad_norm": 10.668038368225098, "learning_rate": 4.58888888888889e-06, "loss": 0.1432, "step": 4700 }, { "epoch": 3.812199036918138, "grad_norm": 9.114903450012207, "learning_rate": 4.519444444444444e-06, "loss": 0.1313, "step": 4750 }, { "epoch": 3.8523274478330656, "grad_norm": 9.845243453979492, "learning_rate": 4.450000000000001e-06, "loss": 0.1351, "step": 4800 }, { "epoch": 3.8924558587479936, "grad_norm": 10.04245376586914, "learning_rate": 4.380555555555556e-06, "loss": 0.1242, "step": 4850 }, { "epoch": 3.932584269662921, "grad_norm": 11.454913139343262, "learning_rate": 4.3111111111111115e-06, "loss": 0.1327, "step": 4900 }, { "epoch": 3.972712680577849, "grad_norm": 9.704380989074707, "learning_rate": 4.241666666666667e-06, "loss": 0.136, "step": 4950 }, { "epoch": 4.012841091492777, "grad_norm": 8.607508659362793, "learning_rate": 4.1722222222222225e-06, "loss": 0.1057, "step": 5000 }, { "epoch": 4.052969502407705, "grad_norm": 9.908164024353027, "learning_rate": 4.102777777777778e-06, "loss": 0.0793, "step": 5050 }, { "epoch": 4.093097913322633, "grad_norm": 9.310582160949707, "learning_rate": 4.033333333333333e-06, "loss": 0.0819, "step": 5100 }, { "epoch": 4.13322632423756, "grad_norm": 8.358500480651855, "learning_rate": 3.96388888888889e-06, "loss": 0.0713, "step": 5150 }, { "epoch": 4.173354735152488, "grad_norm": 10.421764373779297, "learning_rate": 3.894444444444444e-06, "loss": 0.0882, "step": 5200 }, { "epoch": 4.213483146067416, "grad_norm": 9.101556777954102, "learning_rate": 3.825000000000001e-06, "loss": 0.0795, "step": 5250 }, { "epoch": 4.253611556982343, "grad_norm": 8.120292663574219, "learning_rate": 3.7555555555555557e-06, "loss": 0.0837, "step": 5300 }, { "epoch": 4.293739967897271, "grad_norm": 7.72833251953125, "learning_rate": 3.6861111111111116e-06, "loss": 0.0834, "step": 5350 }, { "epoch": 4.333868378812199, "grad_norm": 8.289836883544922, "learning_rate": 3.616666666666667e-06, "loss": 0.0787, "step": 5400 }, { "epoch": 4.373996789727126, "grad_norm": 11.09408950805664, "learning_rate": 3.5472222222222225e-06, "loss": 0.0838, "step": 5450 }, { "epoch": 4.414125200642054, "grad_norm": 5.381303310394287, "learning_rate": 3.4777777777777784e-06, "loss": 0.08, "step": 5500 }, { "epoch": 4.454253611556982, "grad_norm": 7.058831214904785, "learning_rate": 3.4083333333333335e-06, "loss": 0.0811, "step": 5550 }, { "epoch": 4.49438202247191, "grad_norm": 7.9168620109558105, "learning_rate": 3.3388888888888893e-06, "loss": 0.0796, "step": 5600 }, { "epoch": 4.534510433386838, "grad_norm": 8.293107032775879, "learning_rate": 3.2694444444444444e-06, "loss": 0.0765, "step": 5650 }, { "epoch": 4.574638844301766, "grad_norm": 14.970193862915039, "learning_rate": 3.2000000000000003e-06, "loss": 0.0786, "step": 5700 }, { "epoch": 4.614767255216694, "grad_norm": 7.619186878204346, "learning_rate": 3.1305555555555557e-06, "loss": 0.079, "step": 5750 }, { "epoch": 4.654895666131621, "grad_norm": 7.955104827880859, "learning_rate": 3.0611111111111112e-06, "loss": 0.0814, "step": 5800 }, { "epoch": 4.695024077046549, "grad_norm": 6.463953971862793, "learning_rate": 2.991666666666667e-06, "loss": 0.0846, "step": 5850 }, { "epoch": 4.735152487961477, "grad_norm": 7.264988422393799, "learning_rate": 2.9222222222222226e-06, "loss": 0.0846, "step": 5900 }, { "epoch": 4.775280898876405, "grad_norm": 9.477662086486816, "learning_rate": 2.852777777777778e-06, "loss": 0.0737, "step": 5950 }, { "epoch": 4.815409309791332, "grad_norm": 7.392651557922363, "learning_rate": 2.7833333333333335e-06, "loss": 0.0819, "step": 6000 }, { "epoch": 4.85553772070626, "grad_norm": 7.89952278137207, "learning_rate": 2.7138888888888894e-06, "loss": 0.0733, "step": 6050 }, { "epoch": 4.895666131621188, "grad_norm": 9.701936721801758, "learning_rate": 2.6444444444444444e-06, "loss": 0.0766, "step": 6100 }, { "epoch": 4.935794542536115, "grad_norm": 9.0658540725708, "learning_rate": 2.5750000000000003e-06, "loss": 0.0719, "step": 6150 }, { "epoch": 4.975922953451043, "grad_norm": 8.084450721740723, "learning_rate": 2.5055555555555554e-06, "loss": 0.0797, "step": 6200 }, { "epoch": 5.016051364365971, "grad_norm": 9.634577751159668, "learning_rate": 2.4361111111111113e-06, "loss": 0.0728, "step": 6250 }, { "epoch": 5.056179775280899, "grad_norm": 6.3712921142578125, "learning_rate": 2.3666666666666667e-06, "loss": 0.0492, "step": 6300 }, { "epoch": 5.096308186195826, "grad_norm": 5.370436191558838, "learning_rate": 2.297222222222222e-06, "loss": 0.0462, "step": 6350 }, { "epoch": 5.136436597110754, "grad_norm": 5.955212116241455, "learning_rate": 2.2277777777777777e-06, "loss": 0.0458, "step": 6400 }, { "epoch": 5.176565008025682, "grad_norm": 5.898159503936768, "learning_rate": 2.1583333333333336e-06, "loss": 0.0521, "step": 6450 }, { "epoch": 5.21669341894061, "grad_norm": 4.376798152923584, "learning_rate": 2.088888888888889e-06, "loss": 0.0539, "step": 6500 }, { "epoch": 5.256821829855538, "grad_norm": 8.122651100158691, "learning_rate": 2.0194444444444445e-06, "loss": 0.0532, "step": 6550 }, { "epoch": 5.296950240770466, "grad_norm": 6.136049270629883, "learning_rate": 1.9500000000000004e-06, "loss": 0.049, "step": 6600 }, { "epoch": 5.337078651685394, "grad_norm": 4.084792613983154, "learning_rate": 1.8805555555555556e-06, "loss": 0.0606, "step": 6650 }, { "epoch": 5.377207062600321, "grad_norm": 10.023465156555176, "learning_rate": 1.8111111111111113e-06, "loss": 0.0528, "step": 6700 }, { "epoch": 5.417335473515249, "grad_norm": 7.965837478637695, "learning_rate": 1.7416666666666668e-06, "loss": 0.0495, "step": 6750 }, { "epoch": 5.457463884430177, "grad_norm": 4.469130516052246, "learning_rate": 1.6722222222222223e-06, "loss": 0.0498, "step": 6800 }, { "epoch": 5.497592295345104, "grad_norm": 7.078831672668457, "learning_rate": 1.6027777777777777e-06, "loss": 0.0553, "step": 6850 }, { "epoch": 5.537720706260032, "grad_norm": 6.6740336418151855, "learning_rate": 1.5333333333333334e-06, "loss": 0.0476, "step": 6900 }, { "epoch": 5.57784911717496, "grad_norm": 4.432163715362549, "learning_rate": 1.463888888888889e-06, "loss": 0.0482, "step": 6950 }, { "epoch": 5.617977528089888, "grad_norm": 7.402093410491943, "learning_rate": 1.3944444444444446e-06, "loss": 0.043, "step": 7000 }, { "epoch": 5.658105939004815, "grad_norm": 5.417418003082275, "learning_rate": 1.3250000000000002e-06, "loss": 0.0438, "step": 7050 }, { "epoch": 5.698234349919743, "grad_norm": 5.910097599029541, "learning_rate": 1.2555555555555557e-06, "loss": 0.0456, "step": 7100 }, { "epoch": 5.738362760834671, "grad_norm": 7.781131744384766, "learning_rate": 1.1861111111111112e-06, "loss": 0.0504, "step": 7150 }, { "epoch": 5.778491171749598, "grad_norm": 7.682769298553467, "learning_rate": 1.1166666666666666e-06, "loss": 0.0543, "step": 7200 }, { "epoch": 5.818619582664526, "grad_norm": 5.041004657745361, "learning_rate": 1.0472222222222223e-06, "loss": 0.0477, "step": 7250 }, { "epoch": 5.858747993579454, "grad_norm": 5.9667158126831055, "learning_rate": 9.77777777777778e-07, "loss": 0.0472, "step": 7300 }, { "epoch": 5.898876404494382, "grad_norm": 7.793878555297852, "learning_rate": 9.083333333333335e-07, "loss": 0.0507, "step": 7350 }, { "epoch": 5.93900481540931, "grad_norm": 7.589353561401367, "learning_rate": 8.388888888888889e-07, "loss": 0.0588, "step": 7400 }, { "epoch": 5.979133226324238, "grad_norm": 6.20251989364624, "learning_rate": 7.694444444444445e-07, "loss": 0.0499, "step": 7450 }, { "epoch": 6.019261637239166, "grad_norm": 6.208334445953369, "learning_rate": 7.000000000000001e-07, "loss": 0.0441, "step": 7500 }, { "epoch": 6.059390048154093, "grad_norm": 4.3427324295043945, "learning_rate": 6.305555555555556e-07, "loss": 0.0404, "step": 7550 }, { "epoch": 6.099518459069021, "grad_norm": 5.3076581954956055, "learning_rate": 5.611111111111111e-07, "loss": 0.0363, "step": 7600 }, { "epoch": 6.139646869983949, "grad_norm": 3.9682233333587646, "learning_rate": 4.916666666666667e-07, "loss": 0.0382, "step": 7650 }, { "epoch": 6.179775280898877, "grad_norm": 5.612052917480469, "learning_rate": 4.2222222222222226e-07, "loss": 0.0411, "step": 7700 }, { "epoch": 6.219903691813804, "grad_norm": 4.144915580749512, "learning_rate": 3.527777777777778e-07, "loss": 0.0358, "step": 7750 }, { "epoch": 6.260032102728732, "grad_norm": 7.28611946105957, "learning_rate": 2.8333333333333336e-07, "loss": 0.035, "step": 7800 }, { "epoch": 6.30016051364366, "grad_norm": 4.628722667694092, "learning_rate": 2.138888888888889e-07, "loss": 0.0371, "step": 7850 }, { "epoch": 6.340288924558587, "grad_norm": 5.527387619018555, "learning_rate": 1.4444444444444445e-07, "loss": 0.0377, "step": 7900 }, { "epoch": 6.380417335473515, "grad_norm": 4.704113006591797, "learning_rate": 7.500000000000001e-08, "loss": 0.0352, "step": 7950 }, { "epoch": 6.420545746388443, "grad_norm": 4.203430652618408, "learning_rate": 5.555555555555556e-09, "loss": 0.0341, "step": 8000 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.38692049199104e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }