| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.420545746388443, | |
| "eval_steps": 500, | |
| "global_step": 8000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04012841091492777, | |
| "grad_norm": 86.5052261352539, | |
| "learning_rate": 5.750000000000001e-07, | |
| "loss": 2.891, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08025682182985554, | |
| "grad_norm": 44.9648551940918, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 2.0127, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12038523274478331, | |
| "grad_norm": 32.67415237426758, | |
| "learning_rate": 1.825e-06, | |
| "loss": 1.3251, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.16051364365971107, | |
| "grad_norm": 33.664764404296875, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "loss": 1.139, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.20064205457463885, | |
| "grad_norm": 38.86428451538086, | |
| "learning_rate": 3.075e-06, | |
| "loss": 0.9913, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.24077046548956663, | |
| "grad_norm": 38.11140441894531, | |
| "learning_rate": 3.7e-06, | |
| "loss": 0.9344, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2808988764044944, | |
| "grad_norm": 39.1014518737793, | |
| "learning_rate": 4.325e-06, | |
| "loss": 0.8068, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.32102728731942215, | |
| "grad_norm": 19.286678314208984, | |
| "learning_rate": 4.95e-06, | |
| "loss": 0.7104, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3611556982343499, | |
| "grad_norm": 24.21994972229004, | |
| "learning_rate": 5.575000000000001e-06, | |
| "loss": 0.6199, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4012841091492777, | |
| "grad_norm": 23.403148651123047, | |
| "learning_rate": 6.200000000000001e-06, | |
| "loss": 0.6139, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.44141252006420545, | |
| "grad_norm": 21.166723251342773, | |
| "learning_rate": 6.825000000000001e-06, | |
| "loss": 0.5843, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.48154093097913325, | |
| "grad_norm": 18.9536075592041, | |
| "learning_rate": 7.450000000000001e-06, | |
| "loss": 0.578, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.521669341894061, | |
| "grad_norm": 21.438701629638672, | |
| "learning_rate": 8.075000000000001e-06, | |
| "loss": 0.5505, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5617977528089888, | |
| "grad_norm": 25.26392364501953, | |
| "learning_rate": 8.700000000000001e-06, | |
| "loss": 0.5649, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6019261637239165, | |
| "grad_norm": 19.77715492248535, | |
| "learning_rate": 9.325000000000001e-06, | |
| "loss": 0.5839, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6420545746388443, | |
| "grad_norm": 20.142839431762695, | |
| "learning_rate": 9.950000000000001e-06, | |
| "loss": 0.5518, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6821829855537721, | |
| "grad_norm": 20.156387329101562, | |
| "learning_rate": 9.936111111111112e-06, | |
| "loss": 0.5514, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7223113964686998, | |
| "grad_norm": 21.246883392333984, | |
| "learning_rate": 9.866666666666668e-06, | |
| "loss": 0.5125, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7624398073836276, | |
| "grad_norm": 17.57798957824707, | |
| "learning_rate": 9.797222222222223e-06, | |
| "loss": 0.5129, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8025682182985554, | |
| "grad_norm": 18.979021072387695, | |
| "learning_rate": 9.727777777777777e-06, | |
| "loss": 0.493, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8426966292134831, | |
| "grad_norm": 20.188549041748047, | |
| "learning_rate": 9.658333333333334e-06, | |
| "loss": 0.5158, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.8828250401284109, | |
| "grad_norm": 17.60784149169922, | |
| "learning_rate": 9.58888888888889e-06, | |
| "loss": 0.4901, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9229534510433387, | |
| "grad_norm": 38.89186477661133, | |
| "learning_rate": 9.519444444444446e-06, | |
| "loss": 0.493, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.9630818619582665, | |
| "grad_norm": 20.224136352539062, | |
| "learning_rate": 9.450000000000001e-06, | |
| "loss": 0.4911, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0032102728731942, | |
| "grad_norm": 13.886350631713867, | |
| "learning_rate": 9.380555555555556e-06, | |
| "loss": 0.4649, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.043338683788122, | |
| "grad_norm": 14.338887214660645, | |
| "learning_rate": 9.311111111111112e-06, | |
| "loss": 0.3867, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.0834670947030498, | |
| "grad_norm": 17.406949996948242, | |
| "learning_rate": 9.241666666666668e-06, | |
| "loss": 0.3609, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.1235955056179776, | |
| "grad_norm": 19.521095275878906, | |
| "learning_rate": 9.172222222222223e-06, | |
| "loss": 0.354, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1637239165329052, | |
| "grad_norm": 14.558744430541992, | |
| "learning_rate": 9.102777777777777e-06, | |
| "loss": 0.357, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.203852327447833, | |
| "grad_norm": 13.476064682006836, | |
| "learning_rate": 9.033333333333334e-06, | |
| "loss": 0.3593, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2439807383627608, | |
| "grad_norm": 15.735109329223633, | |
| "learning_rate": 8.96388888888889e-06, | |
| "loss": 0.3567, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.2841091492776886, | |
| "grad_norm": 17.4168758392334, | |
| "learning_rate": 8.894444444444445e-06, | |
| "loss": 0.3534, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3242375601926164, | |
| "grad_norm": 17.488101959228516, | |
| "learning_rate": 8.825000000000001e-06, | |
| "loss": 0.3481, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.3643659711075442, | |
| "grad_norm": 14.388931274414062, | |
| "learning_rate": 8.755555555555556e-06, | |
| "loss": 0.3517, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.404494382022472, | |
| "grad_norm": 14.942325592041016, | |
| "learning_rate": 8.686111111111112e-06, | |
| "loss": 0.3361, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.4446227929373996, | |
| "grad_norm": 19.468101501464844, | |
| "learning_rate": 8.616666666666668e-06, | |
| "loss": 0.349, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.4847512038523274, | |
| "grad_norm": 17.254077911376953, | |
| "learning_rate": 8.547222222222223e-06, | |
| "loss": 0.3593, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.5248796147672552, | |
| "grad_norm": 14.908585548400879, | |
| "learning_rate": 8.477777777777778e-06, | |
| "loss": 0.3464, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.565008025682183, | |
| "grad_norm": 14.53330135345459, | |
| "learning_rate": 8.408333333333334e-06, | |
| "loss": 0.3431, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.6051364365971108, | |
| "grad_norm": 16.17108154296875, | |
| "learning_rate": 8.33888888888889e-06, | |
| "loss": 0.3272, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6452648475120384, | |
| "grad_norm": 17.363994598388672, | |
| "learning_rate": 8.269444444444445e-06, | |
| "loss": 0.354, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.6853932584269664, | |
| "grad_norm": 15.006171226501465, | |
| "learning_rate": 8.2e-06, | |
| "loss": 0.3399, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.725521669341894, | |
| "grad_norm": 19.402164459228516, | |
| "learning_rate": 8.130555555555556e-06, | |
| "loss": 0.3507, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.7656500802568218, | |
| "grad_norm": 18.973295211791992, | |
| "learning_rate": 8.061111111111112e-06, | |
| "loss": 0.3384, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.8057784911717496, | |
| "grad_norm": 16.426692962646484, | |
| "learning_rate": 7.991666666666668e-06, | |
| "loss": 0.3475, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.8459069020866774, | |
| "grad_norm": 15.58637523651123, | |
| "learning_rate": 7.922222222222223e-06, | |
| "loss": 0.3392, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.8860353130016052, | |
| "grad_norm": 22.870681762695312, | |
| "learning_rate": 7.852777777777778e-06, | |
| "loss": 0.3379, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.9261637239165328, | |
| "grad_norm": 13.37627124786377, | |
| "learning_rate": 7.783333333333334e-06, | |
| "loss": 0.3397, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.9662921348314608, | |
| "grad_norm": 13.650712013244629, | |
| "learning_rate": 7.71388888888889e-06, | |
| "loss": 0.3325, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.0064205457463884, | |
| "grad_norm": 11.761895179748535, | |
| "learning_rate": 7.644444444444445e-06, | |
| "loss": 0.3198, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.0465489566613164, | |
| "grad_norm": 10.545045852661133, | |
| "learning_rate": 7.575e-06, | |
| "loss": 0.2247, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.086677367576244, | |
| "grad_norm": 14.525964736938477, | |
| "learning_rate": 7.505555555555556e-06, | |
| "loss": 0.2106, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.1268057784911716, | |
| "grad_norm": 12.788507461547852, | |
| "learning_rate": 7.436111111111112e-06, | |
| "loss": 0.2179, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.1669341894060996, | |
| "grad_norm": 15.106952667236328, | |
| "learning_rate": 7.3666666666666676e-06, | |
| "loss": 0.23, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.207062600321027, | |
| "grad_norm": 14.04389762878418, | |
| "learning_rate": 7.297222222222223e-06, | |
| "loss": 0.2156, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.247191011235955, | |
| "grad_norm": 17.352598190307617, | |
| "learning_rate": 7.227777777777778e-06, | |
| "loss": 0.2175, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.287319422150883, | |
| "grad_norm": 11.569601058959961, | |
| "learning_rate": 7.158333333333334e-06, | |
| "loss": 0.2079, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.3274478330658104, | |
| "grad_norm": 14.02609634399414, | |
| "learning_rate": 7.0888888888888894e-06, | |
| "loss": 0.2307, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.3675762439807384, | |
| "grad_norm": 15.397391319274902, | |
| "learning_rate": 7.019444444444446e-06, | |
| "loss": 0.211, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.407704654895666, | |
| "grad_norm": 13.850847244262695, | |
| "learning_rate": 6.95e-06, | |
| "loss": 0.2115, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.447833065810594, | |
| "grad_norm": 10.497112274169922, | |
| "learning_rate": 6.880555555555556e-06, | |
| "loss": 0.2071, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.4879614767255216, | |
| "grad_norm": 13.437689781188965, | |
| "learning_rate": 6.811111111111111e-06, | |
| "loss": 0.2085, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.5280898876404496, | |
| "grad_norm": 17.070728302001953, | |
| "learning_rate": 6.741666666666668e-06, | |
| "loss": 0.2022, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.568218298555377, | |
| "grad_norm": 14.102376937866211, | |
| "learning_rate": 6.672222222222223e-06, | |
| "loss": 0.2014, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.608346709470305, | |
| "grad_norm": 12.899041175842285, | |
| "learning_rate": 6.602777777777778e-06, | |
| "loss": 0.2172, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.648475120385233, | |
| "grad_norm": 15.379789352416992, | |
| "learning_rate": 6.533333333333334e-06, | |
| "loss": 0.2096, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.6886035313001604, | |
| "grad_norm": 12.808381080627441, | |
| "learning_rate": 6.4638888888888895e-06, | |
| "loss": 0.2154, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.7287319422150884, | |
| "grad_norm": 13.826408386230469, | |
| "learning_rate": 6.394444444444445e-06, | |
| "loss": 0.2176, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.768860353130016, | |
| "grad_norm": 14.112689971923828, | |
| "learning_rate": 6.3250000000000004e-06, | |
| "loss": 0.2079, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.808988764044944, | |
| "grad_norm": 11.431640625, | |
| "learning_rate": 6.255555555555556e-06, | |
| "loss": 0.2152, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.8491171749598716, | |
| "grad_norm": 17.434322357177734, | |
| "learning_rate": 6.186111111111111e-06, | |
| "loss": 0.189, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.889245585874799, | |
| "grad_norm": 15.394646644592285, | |
| "learning_rate": 6.116666666666668e-06, | |
| "loss": 0.2078, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.929373996789727, | |
| "grad_norm": 19.227445602416992, | |
| "learning_rate": 6.047222222222223e-06, | |
| "loss": 0.2104, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.969502407704655, | |
| "grad_norm": 14.099166870117188, | |
| "learning_rate": 5.977777777777778e-06, | |
| "loss": 0.2252, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.009630818619583, | |
| "grad_norm": 11.261530876159668, | |
| "learning_rate": 5.908333333333334e-06, | |
| "loss": 0.1895, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.0497592295345104, | |
| "grad_norm": 8.676029205322266, | |
| "learning_rate": 5.8388888888888895e-06, | |
| "loss": 0.133, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.0898876404494384, | |
| "grad_norm": 8.65832233428955, | |
| "learning_rate": 5.769444444444445e-06, | |
| "loss": 0.1152, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 3.130016051364366, | |
| "grad_norm": 9.719037055969238, | |
| "learning_rate": 5.7e-06, | |
| "loss": 0.1261, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.1701444622792936, | |
| "grad_norm": 12.145017623901367, | |
| "learning_rate": 5.630555555555556e-06, | |
| "loss": 0.1216, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 3.2102728731942216, | |
| "grad_norm": 10.67035961151123, | |
| "learning_rate": 5.561111111111111e-06, | |
| "loss": 0.1341, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.250401284109149, | |
| "grad_norm": 13.00109577178955, | |
| "learning_rate": 5.491666666666668e-06, | |
| "loss": 0.1294, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.290529695024077, | |
| "grad_norm": 34.79175567626953, | |
| "learning_rate": 5.422222222222223e-06, | |
| "loss": 0.1287, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.330658105939005, | |
| "grad_norm": 8.047740936279297, | |
| "learning_rate": 5.352777777777778e-06, | |
| "loss": 0.1282, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 3.370786516853933, | |
| "grad_norm": 10.46933364868164, | |
| "learning_rate": 5.283333333333333e-06, | |
| "loss": 0.126, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.4109149277688604, | |
| "grad_norm": 9.083063125610352, | |
| "learning_rate": 5.21388888888889e-06, | |
| "loss": 0.1322, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 3.451043338683788, | |
| "grad_norm": 10.835976600646973, | |
| "learning_rate": 5.144444444444445e-06, | |
| "loss": 0.1393, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.491171749598716, | |
| "grad_norm": 9.417113304138184, | |
| "learning_rate": 5.075e-06, | |
| "loss": 0.1322, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.5313001605136436, | |
| "grad_norm": 11.396635055541992, | |
| "learning_rate": 5.005555555555556e-06, | |
| "loss": 0.1369, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 16.883840560913086, | |
| "learning_rate": 4.9361111111111115e-06, | |
| "loss": 0.1301, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.611556982343499, | |
| "grad_norm": 16.863872528076172, | |
| "learning_rate": 4.866666666666667e-06, | |
| "loss": 0.1383, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.6516853932584272, | |
| "grad_norm": 11.84510612487793, | |
| "learning_rate": 4.797222222222222e-06, | |
| "loss": 0.1288, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.691813804173355, | |
| "grad_norm": 10.211877822875977, | |
| "learning_rate": 4.727777777777779e-06, | |
| "loss": 0.1327, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.7319422150882824, | |
| "grad_norm": 11.919416427612305, | |
| "learning_rate": 4.658333333333333e-06, | |
| "loss": 0.1306, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.7720706260032104, | |
| "grad_norm": 10.668038368225098, | |
| "learning_rate": 4.58888888888889e-06, | |
| "loss": 0.1432, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.812199036918138, | |
| "grad_norm": 9.114903450012207, | |
| "learning_rate": 4.519444444444444e-06, | |
| "loss": 0.1313, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.8523274478330656, | |
| "grad_norm": 9.845243453979492, | |
| "learning_rate": 4.450000000000001e-06, | |
| "loss": 0.1351, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.8924558587479936, | |
| "grad_norm": 10.04245376586914, | |
| "learning_rate": 4.380555555555556e-06, | |
| "loss": 0.1242, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.932584269662921, | |
| "grad_norm": 11.454913139343262, | |
| "learning_rate": 4.3111111111111115e-06, | |
| "loss": 0.1327, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.972712680577849, | |
| "grad_norm": 9.704380989074707, | |
| "learning_rate": 4.241666666666667e-06, | |
| "loss": 0.136, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 4.012841091492777, | |
| "grad_norm": 8.607508659362793, | |
| "learning_rate": 4.1722222222222225e-06, | |
| "loss": 0.1057, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.052969502407705, | |
| "grad_norm": 9.908164024353027, | |
| "learning_rate": 4.102777777777778e-06, | |
| "loss": 0.0793, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 4.093097913322633, | |
| "grad_norm": 9.310582160949707, | |
| "learning_rate": 4.033333333333333e-06, | |
| "loss": 0.0819, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 4.13322632423756, | |
| "grad_norm": 8.358500480651855, | |
| "learning_rate": 3.96388888888889e-06, | |
| "loss": 0.0713, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 4.173354735152488, | |
| "grad_norm": 10.421764373779297, | |
| "learning_rate": 3.894444444444444e-06, | |
| "loss": 0.0882, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 4.213483146067416, | |
| "grad_norm": 9.101556777954102, | |
| "learning_rate": 3.825000000000001e-06, | |
| "loss": 0.0795, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 4.253611556982343, | |
| "grad_norm": 8.120292663574219, | |
| "learning_rate": 3.7555555555555557e-06, | |
| "loss": 0.0837, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 4.293739967897271, | |
| "grad_norm": 7.72833251953125, | |
| "learning_rate": 3.6861111111111116e-06, | |
| "loss": 0.0834, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 4.333868378812199, | |
| "grad_norm": 8.289836883544922, | |
| "learning_rate": 3.616666666666667e-06, | |
| "loss": 0.0787, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.373996789727126, | |
| "grad_norm": 11.09408950805664, | |
| "learning_rate": 3.5472222222222225e-06, | |
| "loss": 0.0838, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 4.414125200642054, | |
| "grad_norm": 5.381303310394287, | |
| "learning_rate": 3.4777777777777784e-06, | |
| "loss": 0.08, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.454253611556982, | |
| "grad_norm": 7.058831214904785, | |
| "learning_rate": 3.4083333333333335e-06, | |
| "loss": 0.0811, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 4.49438202247191, | |
| "grad_norm": 7.9168620109558105, | |
| "learning_rate": 3.3388888888888893e-06, | |
| "loss": 0.0796, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.534510433386838, | |
| "grad_norm": 8.293107032775879, | |
| "learning_rate": 3.2694444444444444e-06, | |
| "loss": 0.0765, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 4.574638844301766, | |
| "grad_norm": 14.970193862915039, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "loss": 0.0786, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.614767255216694, | |
| "grad_norm": 7.619186878204346, | |
| "learning_rate": 3.1305555555555557e-06, | |
| "loss": 0.079, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 4.654895666131621, | |
| "grad_norm": 7.955104827880859, | |
| "learning_rate": 3.0611111111111112e-06, | |
| "loss": 0.0814, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 4.695024077046549, | |
| "grad_norm": 6.463953971862793, | |
| "learning_rate": 2.991666666666667e-06, | |
| "loss": 0.0846, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 4.735152487961477, | |
| "grad_norm": 7.264988422393799, | |
| "learning_rate": 2.9222222222222226e-06, | |
| "loss": 0.0846, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.775280898876405, | |
| "grad_norm": 9.477662086486816, | |
| "learning_rate": 2.852777777777778e-06, | |
| "loss": 0.0737, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 4.815409309791332, | |
| "grad_norm": 7.392651557922363, | |
| "learning_rate": 2.7833333333333335e-06, | |
| "loss": 0.0819, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.85553772070626, | |
| "grad_norm": 7.89952278137207, | |
| "learning_rate": 2.7138888888888894e-06, | |
| "loss": 0.0733, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 4.895666131621188, | |
| "grad_norm": 9.701936721801758, | |
| "learning_rate": 2.6444444444444444e-06, | |
| "loss": 0.0766, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 4.935794542536115, | |
| "grad_norm": 9.0658540725708, | |
| "learning_rate": 2.5750000000000003e-06, | |
| "loss": 0.0719, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 4.975922953451043, | |
| "grad_norm": 8.084450721740723, | |
| "learning_rate": 2.5055555555555554e-06, | |
| "loss": 0.0797, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 5.016051364365971, | |
| "grad_norm": 9.634577751159668, | |
| "learning_rate": 2.4361111111111113e-06, | |
| "loss": 0.0728, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 5.056179775280899, | |
| "grad_norm": 6.3712921142578125, | |
| "learning_rate": 2.3666666666666667e-06, | |
| "loss": 0.0492, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 5.096308186195826, | |
| "grad_norm": 5.370436191558838, | |
| "learning_rate": 2.297222222222222e-06, | |
| "loss": 0.0462, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 5.136436597110754, | |
| "grad_norm": 5.955212116241455, | |
| "learning_rate": 2.2277777777777777e-06, | |
| "loss": 0.0458, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 5.176565008025682, | |
| "grad_norm": 5.898159503936768, | |
| "learning_rate": 2.1583333333333336e-06, | |
| "loss": 0.0521, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 5.21669341894061, | |
| "grad_norm": 4.376798152923584, | |
| "learning_rate": 2.088888888888889e-06, | |
| "loss": 0.0539, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.256821829855538, | |
| "grad_norm": 8.122651100158691, | |
| "learning_rate": 2.0194444444444445e-06, | |
| "loss": 0.0532, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 5.296950240770466, | |
| "grad_norm": 6.136049270629883, | |
| "learning_rate": 1.9500000000000004e-06, | |
| "loss": 0.049, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 5.337078651685394, | |
| "grad_norm": 4.084792613983154, | |
| "learning_rate": 1.8805555555555556e-06, | |
| "loss": 0.0606, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 5.377207062600321, | |
| "grad_norm": 10.023465156555176, | |
| "learning_rate": 1.8111111111111113e-06, | |
| "loss": 0.0528, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 5.417335473515249, | |
| "grad_norm": 7.965837478637695, | |
| "learning_rate": 1.7416666666666668e-06, | |
| "loss": 0.0495, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 5.457463884430177, | |
| "grad_norm": 4.469130516052246, | |
| "learning_rate": 1.6722222222222223e-06, | |
| "loss": 0.0498, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 5.497592295345104, | |
| "grad_norm": 7.078831672668457, | |
| "learning_rate": 1.6027777777777777e-06, | |
| "loss": 0.0553, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 5.537720706260032, | |
| "grad_norm": 6.6740336418151855, | |
| "learning_rate": 1.5333333333333334e-06, | |
| "loss": 0.0476, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.57784911717496, | |
| "grad_norm": 4.432163715362549, | |
| "learning_rate": 1.463888888888889e-06, | |
| "loss": 0.0482, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 5.617977528089888, | |
| "grad_norm": 7.402093410491943, | |
| "learning_rate": 1.3944444444444446e-06, | |
| "loss": 0.043, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.658105939004815, | |
| "grad_norm": 5.417418003082275, | |
| "learning_rate": 1.3250000000000002e-06, | |
| "loss": 0.0438, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 5.698234349919743, | |
| "grad_norm": 5.910097599029541, | |
| "learning_rate": 1.2555555555555557e-06, | |
| "loss": 0.0456, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 5.738362760834671, | |
| "grad_norm": 7.781131744384766, | |
| "learning_rate": 1.1861111111111112e-06, | |
| "loss": 0.0504, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 5.778491171749598, | |
| "grad_norm": 7.682769298553467, | |
| "learning_rate": 1.1166666666666666e-06, | |
| "loss": 0.0543, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 5.818619582664526, | |
| "grad_norm": 5.041004657745361, | |
| "learning_rate": 1.0472222222222223e-06, | |
| "loss": 0.0477, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 5.858747993579454, | |
| "grad_norm": 5.9667158126831055, | |
| "learning_rate": 9.77777777777778e-07, | |
| "loss": 0.0472, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 5.898876404494382, | |
| "grad_norm": 7.793878555297852, | |
| "learning_rate": 9.083333333333335e-07, | |
| "loss": 0.0507, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 5.93900481540931, | |
| "grad_norm": 7.589353561401367, | |
| "learning_rate": 8.388888888888889e-07, | |
| "loss": 0.0588, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 5.979133226324238, | |
| "grad_norm": 6.20251989364624, | |
| "learning_rate": 7.694444444444445e-07, | |
| "loss": 0.0499, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 6.019261637239166, | |
| "grad_norm": 6.208334445953369, | |
| "learning_rate": 7.000000000000001e-07, | |
| "loss": 0.0441, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.059390048154093, | |
| "grad_norm": 4.3427324295043945, | |
| "learning_rate": 6.305555555555556e-07, | |
| "loss": 0.0404, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 6.099518459069021, | |
| "grad_norm": 5.3076581954956055, | |
| "learning_rate": 5.611111111111111e-07, | |
| "loss": 0.0363, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 6.139646869983949, | |
| "grad_norm": 3.9682233333587646, | |
| "learning_rate": 4.916666666666667e-07, | |
| "loss": 0.0382, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 6.179775280898877, | |
| "grad_norm": 5.612052917480469, | |
| "learning_rate": 4.2222222222222226e-07, | |
| "loss": 0.0411, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 6.219903691813804, | |
| "grad_norm": 4.144915580749512, | |
| "learning_rate": 3.527777777777778e-07, | |
| "loss": 0.0358, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 6.260032102728732, | |
| "grad_norm": 7.28611946105957, | |
| "learning_rate": 2.8333333333333336e-07, | |
| "loss": 0.035, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 6.30016051364366, | |
| "grad_norm": 4.628722667694092, | |
| "learning_rate": 2.138888888888889e-07, | |
| "loss": 0.0371, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 6.340288924558587, | |
| "grad_norm": 5.527387619018555, | |
| "learning_rate": 1.4444444444444445e-07, | |
| "loss": 0.0377, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 6.380417335473515, | |
| "grad_norm": 4.704113006591797, | |
| "learning_rate": 7.500000000000001e-08, | |
| "loss": 0.0352, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 6.420545746388443, | |
| "grad_norm": 4.203430652618408, | |
| "learning_rate": 5.555555555555556e-09, | |
| "loss": 0.0341, | |
| "step": 8000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 8000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.38692049199104e+19, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |