{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 24564, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012212994626282364, "grad_norm": 2.7971887588500977, "learning_rate": 4.9796450089561964e-05, "loss": 0.3257, "step": 100 }, { "epoch": 0.024425989252564728, "grad_norm": 2.295069456100464, "learning_rate": 4.9592900179123925e-05, "loss": 0.1159, "step": 200 }, { "epoch": 0.03663898387884709, "grad_norm": 1.1914141178131104, "learning_rate": 4.9389350268685886e-05, "loss": 0.1073, "step": 300 }, { "epoch": 0.048851978505129456, "grad_norm": 0.8838738203048706, "learning_rate": 4.918580035824785e-05, "loss": 0.1076, "step": 400 }, { "epoch": 0.06106497313141182, "grad_norm": 1.8016488552093506, "learning_rate": 4.898225044780981e-05, "loss": 0.0851, "step": 500 }, { "epoch": 0.07327796775769418, "grad_norm": 2.5403218269348145, "learning_rate": 4.8778700537371764e-05, "loss": 0.1119, "step": 600 }, { "epoch": 0.08549096238397655, "grad_norm": 1.6208165884017944, "learning_rate": 4.8575150626933725e-05, "loss": 0.0968, "step": 700 }, { "epoch": 0.09770395701025891, "grad_norm": 1.7685565948486328, "learning_rate": 4.8371600716495686e-05, "loss": 0.0848, "step": 800 }, { "epoch": 0.10991695163654128, "grad_norm": 5.9024882316589355, "learning_rate": 4.816805080605765e-05, "loss": 0.0816, "step": 900 }, { "epoch": 0.12212994626282364, "grad_norm": 1.4809324741363525, "learning_rate": 4.796450089561961e-05, "loss": 0.0815, "step": 1000 }, { "epoch": 0.134342940889106, "grad_norm": 1.8953092098236084, "learning_rate": 4.776095098518157e-05, "loss": 0.0835, "step": 1100 }, { "epoch": 0.14655593551538837, "grad_norm": 0.2310028374195099, "learning_rate": 4.755740107474353e-05, "loss": 0.0768, "step": 1200 }, { "epoch": 0.15876893014167073, "grad_norm": 4.047482013702393, "learning_rate": 4.735385116430549e-05, "loss": 0.0785, "step": 1300 }, { "epoch": 0.1709819247679531, "grad_norm": 5.049030303955078, "learning_rate": 4.7150301253867454e-05, "loss": 0.0609, "step": 1400 }, { "epoch": 0.18319491939423546, "grad_norm": 4.447434902191162, "learning_rate": 4.6946751343429415e-05, "loss": 0.0625, "step": 1500 }, { "epoch": 0.19540791402051783, "grad_norm": 0.3569432199001312, "learning_rate": 4.674320143299138e-05, "loss": 0.0612, "step": 1600 }, { "epoch": 0.2076209086468002, "grad_norm": 0.6527674794197083, "learning_rate": 4.653965152255334e-05, "loss": 0.07, "step": 1700 }, { "epoch": 0.21983390327308255, "grad_norm": 5.448887825012207, "learning_rate": 4.633610161211529e-05, "loss": 0.0669, "step": 1800 }, { "epoch": 0.23204689789936492, "grad_norm": 1.804388165473938, "learning_rate": 4.6132551701677254e-05, "loss": 0.0636, "step": 1900 }, { "epoch": 0.24425989252564728, "grad_norm": 1.574012279510498, "learning_rate": 4.5929001791239215e-05, "loss": 0.0798, "step": 2000 }, { "epoch": 0.2564728871519297, "grad_norm": 0.8235198259353638, "learning_rate": 4.5725451880801177e-05, "loss": 0.054, "step": 2100 }, { "epoch": 0.268685881778212, "grad_norm": 0.6802400350570679, "learning_rate": 4.552190197036313e-05, "loss": 0.0613, "step": 2200 }, { "epoch": 0.2808988764044944, "grad_norm": 0.5732834935188293, "learning_rate": 4.531835205992509e-05, "loss": 0.0544, "step": 2300 }, { "epoch": 0.29311187103077674, "grad_norm": 0.9544196128845215, "learning_rate": 4.5114802149487054e-05, "loss": 0.0735, "step": 2400 }, { "epoch": 0.30532486565705913, "grad_norm": 0.8141427040100098, "learning_rate": 4.4911252239049015e-05, "loss": 0.0586, "step": 2500 }, { "epoch": 0.31753786028334147, "grad_norm": 3.740103006362915, "learning_rate": 4.4707702328610976e-05, "loss": 0.0498, "step": 2600 }, { "epoch": 0.32975085490962386, "grad_norm": 0.24550916254520416, "learning_rate": 4.450415241817294e-05, "loss": 0.056, "step": 2700 }, { "epoch": 0.3419638495359062, "grad_norm": 0.7254294157028198, "learning_rate": 4.43006025077349e-05, "loss": 0.053, "step": 2800 }, { "epoch": 0.3541768441621886, "grad_norm": 0.5358878970146179, "learning_rate": 4.409705259729686e-05, "loss": 0.0561, "step": 2900 }, { "epoch": 0.3663898387884709, "grad_norm": 0.3604901432991028, "learning_rate": 4.3893502686858815e-05, "loss": 0.059, "step": 3000 }, { "epoch": 0.3786028334147533, "grad_norm": 0.19227269291877747, "learning_rate": 4.3689952776420776e-05, "loss": 0.0548, "step": 3100 }, { "epoch": 0.39081582804103565, "grad_norm": 0.3620028495788574, "learning_rate": 4.348640286598274e-05, "loss": 0.0535, "step": 3200 }, { "epoch": 0.40302882266731804, "grad_norm": 4.794760704040527, "learning_rate": 4.32828529555447e-05, "loss": 0.0549, "step": 3300 }, { "epoch": 0.4152418172936004, "grad_norm": 0.5320255160331726, "learning_rate": 4.307930304510666e-05, "loss": 0.051, "step": 3400 }, { "epoch": 0.42745481191988277, "grad_norm": 2.048164129257202, "learning_rate": 4.287575313466862e-05, "loss": 0.067, "step": 3500 }, { "epoch": 0.4396678065461651, "grad_norm": 3.6915972232818604, "learning_rate": 4.267220322423058e-05, "loss": 0.0494, "step": 3600 }, { "epoch": 0.4518808011724475, "grad_norm": 0.7861614227294922, "learning_rate": 4.2468653313792544e-05, "loss": 0.0652, "step": 3700 }, { "epoch": 0.46409379579872984, "grad_norm": 1.9440407752990723, "learning_rate": 4.2265103403354505e-05, "loss": 0.0699, "step": 3800 }, { "epoch": 0.4763067904250122, "grad_norm": 0.15847598016262054, "learning_rate": 4.206155349291647e-05, "loss": 0.0575, "step": 3900 }, { "epoch": 0.48851978505129456, "grad_norm": 0.2988128960132599, "learning_rate": 4.185800358247843e-05, "loss": 0.0468, "step": 4000 }, { "epoch": 0.5007327796775769, "grad_norm": 1.6092756986618042, "learning_rate": 4.165445367204039e-05, "loss": 0.0555, "step": 4100 }, { "epoch": 0.5129457743038593, "grad_norm": 13.008635520935059, "learning_rate": 4.1450903761602344e-05, "loss": 0.0604, "step": 4200 }, { "epoch": 0.5251587689301417, "grad_norm": 2.7357659339904785, "learning_rate": 4.1247353851164305e-05, "loss": 0.0478, "step": 4300 }, { "epoch": 0.537371763556424, "grad_norm": 0.4085894823074341, "learning_rate": 4.104380394072627e-05, "loss": 0.0478, "step": 4400 }, { "epoch": 0.5495847581827064, "grad_norm": 1.5472468137741089, "learning_rate": 4.084025403028823e-05, "loss": 0.0559, "step": 4500 }, { "epoch": 0.5617977528089888, "grad_norm": 2.616894006729126, "learning_rate": 4.063670411985019e-05, "loss": 0.0444, "step": 4600 }, { "epoch": 0.5740107474352711, "grad_norm": 0.7861430644989014, "learning_rate": 4.043315420941215e-05, "loss": 0.0431, "step": 4700 }, { "epoch": 0.5862237420615535, "grad_norm": 1.3745653629302979, "learning_rate": 4.022960429897411e-05, "loss": 0.0422, "step": 4800 }, { "epoch": 0.5984367366878358, "grad_norm": 1.471048355102539, "learning_rate": 4.002605438853607e-05, "loss": 0.052, "step": 4900 }, { "epoch": 0.6106497313141183, "grad_norm": 0.3034818470478058, "learning_rate": 3.9822504478098035e-05, "loss": 0.0481, "step": 5000 }, { "epoch": 0.6228627259404006, "grad_norm": 0.5265262722969055, "learning_rate": 3.9618954567659996e-05, "loss": 0.0592, "step": 5100 }, { "epoch": 0.6350757205666829, "grad_norm": 1.226517915725708, "learning_rate": 3.941540465722196e-05, "loss": 0.0554, "step": 5200 }, { "epoch": 0.6472887151929653, "grad_norm": 0.365315705537796, "learning_rate": 3.921185474678392e-05, "loss": 0.0466, "step": 5300 }, { "epoch": 0.6595017098192477, "grad_norm": 0.4508240818977356, "learning_rate": 3.900830483634587e-05, "loss": 0.0468, "step": 5400 }, { "epoch": 0.67171470444553, "grad_norm": 0.51467365026474, "learning_rate": 3.8804754925907834e-05, "loss": 0.054, "step": 5500 }, { "epoch": 0.6839276990718124, "grad_norm": 0.6186398267745972, "learning_rate": 3.8601205015469796e-05, "loss": 0.0554, "step": 5600 }, { "epoch": 0.6961406936980947, "grad_norm": 0.6723864674568176, "learning_rate": 3.839765510503176e-05, "loss": 0.0556, "step": 5700 }, { "epoch": 0.7083536883243772, "grad_norm": 3.6368353366851807, "learning_rate": 3.819410519459372e-05, "loss": 0.0512, "step": 5800 }, { "epoch": 0.7205666829506595, "grad_norm": 3.5719096660614014, "learning_rate": 3.799055528415568e-05, "loss": 0.0545, "step": 5900 }, { "epoch": 0.7327796775769418, "grad_norm": 1.1756514310836792, "learning_rate": 3.778700537371764e-05, "loss": 0.0465, "step": 6000 }, { "epoch": 0.7449926722032242, "grad_norm": 1.2159337997436523, "learning_rate": 3.7583455463279596e-05, "loss": 0.0463, "step": 6100 }, { "epoch": 0.7572056668295066, "grad_norm": 1.0632232427597046, "learning_rate": 3.737990555284156e-05, "loss": 0.0444, "step": 6200 }, { "epoch": 0.769418661455789, "grad_norm": 0.669765293598175, "learning_rate": 3.717635564240352e-05, "loss": 0.0433, "step": 6300 }, { "epoch": 0.7816316560820713, "grad_norm": 0.13478492200374603, "learning_rate": 3.697280573196548e-05, "loss": 0.0469, "step": 6400 }, { "epoch": 0.7938446507083536, "grad_norm": 0.5963812470436096, "learning_rate": 3.676925582152744e-05, "loss": 0.0504, "step": 6500 }, { "epoch": 0.8060576453346361, "grad_norm": 0.5829123258590698, "learning_rate": 3.6565705911089395e-05, "loss": 0.0483, "step": 6600 }, { "epoch": 0.8182706399609184, "grad_norm": 2.3114776611328125, "learning_rate": 3.636215600065136e-05, "loss": 0.0601, "step": 6700 }, { "epoch": 0.8304836345872008, "grad_norm": 0.27553310990333557, "learning_rate": 3.615860609021332e-05, "loss": 0.0527, "step": 6800 }, { "epoch": 0.8426966292134831, "grad_norm": 0.3668135106563568, "learning_rate": 3.595505617977528e-05, "loss": 0.0541, "step": 6900 }, { "epoch": 0.8549096238397655, "grad_norm": 3.1305336952209473, "learning_rate": 3.575150626933724e-05, "loss": 0.0534, "step": 7000 }, { "epoch": 0.8671226184660479, "grad_norm": 0.08432205021381378, "learning_rate": 3.55479563588992e-05, "loss": 0.0523, "step": 7100 }, { "epoch": 0.8793356130923302, "grad_norm": 1.3692104816436768, "learning_rate": 3.534440644846116e-05, "loss": 0.0428, "step": 7200 }, { "epoch": 0.8915486077186126, "grad_norm": 1.1145917177200317, "learning_rate": 3.5140856538023125e-05, "loss": 0.0528, "step": 7300 }, { "epoch": 0.903761602344895, "grad_norm": 0.07234195619821548, "learning_rate": 3.4937306627585086e-05, "loss": 0.0449, "step": 7400 }, { "epoch": 0.9159745969711773, "grad_norm": 0.12755821645259857, "learning_rate": 3.473375671714705e-05, "loss": 0.0381, "step": 7500 }, { "epoch": 0.9281875915974597, "grad_norm": 1.066666603088379, "learning_rate": 3.453020680670901e-05, "loss": 0.0411, "step": 7600 }, { "epoch": 0.940400586223742, "grad_norm": 2.3837034702301025, "learning_rate": 3.432665689627097e-05, "loss": 0.0471, "step": 7700 }, { "epoch": 0.9526135808500245, "grad_norm": 0.2601478397846222, "learning_rate": 3.4123106985832924e-05, "loss": 0.0408, "step": 7800 }, { "epoch": 0.9648265754763068, "grad_norm": 1.0532914400100708, "learning_rate": 3.3919557075394886e-05, "loss": 0.0422, "step": 7900 }, { "epoch": 0.9770395701025891, "grad_norm": 0.16507047414779663, "learning_rate": 3.371600716495685e-05, "loss": 0.0464, "step": 8000 }, { "epoch": 0.9892525647288715, "grad_norm": 0.310465544462204, "learning_rate": 3.351245725451881e-05, "loss": 0.056, "step": 8100 }, { "epoch": 1.0, "eval_f1": 0.8987804878048782, "eval_loss": 0.04277478903532028, "eval_precision": 0.8910749615300066, "eval_recall": 0.9066204428539477, "eval_runtime": 133.6334, "eval_samples_per_second": 61.272, "eval_steps_per_second": 7.663, "step": 8188 }, { "epoch": 1.0014655593551538, "grad_norm": 0.8648662567138672, "learning_rate": 3.330890734408077e-05, "loss": 0.0441, "step": 8200 }, { "epoch": 1.0136785539814364, "grad_norm": 0.16131815314292908, "learning_rate": 3.310535743364273e-05, "loss": 0.0306, "step": 8300 }, { "epoch": 1.0258915486077187, "grad_norm": 4.484282970428467, "learning_rate": 3.290180752320469e-05, "loss": 0.0383, "step": 8400 }, { "epoch": 1.038104543234001, "grad_norm": 0.5343158841133118, "learning_rate": 3.2698257612766654e-05, "loss": 0.0331, "step": 8500 }, { "epoch": 1.0503175378602834, "grad_norm": 0.028084266930818558, "learning_rate": 3.2494707702328615e-05, "loss": 0.0381, "step": 8600 }, { "epoch": 1.0625305324865657, "grad_norm": 0.483477920293808, "learning_rate": 3.2291157791890576e-05, "loss": 0.0462, "step": 8700 }, { "epoch": 1.074743527112848, "grad_norm": 0.8194773197174072, "learning_rate": 3.208760788145254e-05, "loss": 0.0346, "step": 8800 }, { "epoch": 1.0869565217391304, "grad_norm": 0.11062140762805939, "learning_rate": 3.188405797101449e-05, "loss": 0.0369, "step": 8900 }, { "epoch": 1.0991695163654127, "grad_norm": 0.300889790058136, "learning_rate": 3.1680508060576454e-05, "loss": 0.0297, "step": 9000 }, { "epoch": 1.111382510991695, "grad_norm": 0.6278924345970154, "learning_rate": 3.1476958150138415e-05, "loss": 0.0349, "step": 9100 }, { "epoch": 1.1235955056179776, "grad_norm": 0.542029082775116, "learning_rate": 3.1273408239700376e-05, "loss": 0.0473, "step": 9200 }, { "epoch": 1.13580850024426, "grad_norm": 0.6147358417510986, "learning_rate": 3.106985832926234e-05, "loss": 0.0356, "step": 9300 }, { "epoch": 1.1480214948705423, "grad_norm": 1.301965355873108, "learning_rate": 3.08663084188243e-05, "loss": 0.0371, "step": 9400 }, { "epoch": 1.1602344894968246, "grad_norm": 0.026711974292993546, "learning_rate": 3.066275850838626e-05, "loss": 0.0346, "step": 9500 }, { "epoch": 1.172447484123107, "grad_norm": 1.258608102798462, "learning_rate": 3.0459208597948218e-05, "loss": 0.0334, "step": 9600 }, { "epoch": 1.1846604787493893, "grad_norm": 0.36877045035362244, "learning_rate": 3.025565868751018e-05, "loss": 0.032, "step": 9700 }, { "epoch": 1.1968734733756716, "grad_norm": 0.2519334852695465, "learning_rate": 3.005210877707214e-05, "loss": 0.0416, "step": 9800 }, { "epoch": 1.2090864680019542, "grad_norm": 0.5204672813415527, "learning_rate": 2.9848558866634102e-05, "loss": 0.0366, "step": 9900 }, { "epoch": 1.2212994626282365, "grad_norm": 0.43101000785827637, "learning_rate": 2.9645008956196063e-05, "loss": 0.0379, "step": 10000 }, { "epoch": 1.2335124572545189, "grad_norm": 0.681117057800293, "learning_rate": 2.9441459045758018e-05, "loss": 0.0353, "step": 10100 }, { "epoch": 1.2457254518808012, "grad_norm": 0.49889543652534485, "learning_rate": 2.923790913531998e-05, "loss": 0.0394, "step": 10200 }, { "epoch": 1.2579384465070835, "grad_norm": 0.8064567446708679, "learning_rate": 2.903435922488194e-05, "loss": 0.034, "step": 10300 }, { "epoch": 1.2701514411333659, "grad_norm": 0.21315552294254303, "learning_rate": 2.8830809314443902e-05, "loss": 0.0292, "step": 10400 }, { "epoch": 1.2823644357596482, "grad_norm": 1.6466035842895508, "learning_rate": 2.8627259404005863e-05, "loss": 0.0333, "step": 10500 }, { "epoch": 1.2945774303859308, "grad_norm": 1.1944749355316162, "learning_rate": 2.8423709493567825e-05, "loss": 0.0357, "step": 10600 }, { "epoch": 1.3067904250122129, "grad_norm": 0.6488074064254761, "learning_rate": 2.8220159583129786e-05, "loss": 0.0315, "step": 10700 }, { "epoch": 1.3190034196384954, "grad_norm": 0.030384689569473267, "learning_rate": 2.8016609672691747e-05, "loss": 0.0422, "step": 10800 }, { "epoch": 1.3312164142647778, "grad_norm": 0.2363937795162201, "learning_rate": 2.781305976225371e-05, "loss": 0.0335, "step": 10900 }, { "epoch": 1.34342940889106, "grad_norm": 0.04548358544707298, "learning_rate": 2.7609509851815666e-05, "loss": 0.0371, "step": 11000 }, { "epoch": 1.3556424035173424, "grad_norm": 0.6259112358093262, "learning_rate": 2.7405959941377628e-05, "loss": 0.0412, "step": 11100 }, { "epoch": 1.3678553981436248, "grad_norm": 0.42487379908561707, "learning_rate": 2.720241003093959e-05, "loss": 0.0252, "step": 11200 }, { "epoch": 1.3800683927699071, "grad_norm": 0.9125863313674927, "learning_rate": 2.6998860120501547e-05, "loss": 0.0367, "step": 11300 }, { "epoch": 1.3922813873961895, "grad_norm": 0.7670263051986694, "learning_rate": 2.679531021006351e-05, "loss": 0.0288, "step": 11400 }, { "epoch": 1.404494382022472, "grad_norm": 0.1614452451467514, "learning_rate": 2.6591760299625466e-05, "loss": 0.0399, "step": 11500 }, { "epoch": 1.4167073766487543, "grad_norm": 3.3551249504089355, "learning_rate": 2.6388210389187428e-05, "loss": 0.0391, "step": 11600 }, { "epoch": 1.4289203712750367, "grad_norm": 0.7188284397125244, "learning_rate": 2.618466047874939e-05, "loss": 0.0315, "step": 11700 }, { "epoch": 1.441133365901319, "grad_norm": 0.48031413555145264, "learning_rate": 2.598111056831135e-05, "loss": 0.0244, "step": 11800 }, { "epoch": 1.4533463605276014, "grad_norm": 0.7492583394050598, "learning_rate": 2.577756065787331e-05, "loss": 0.0359, "step": 11900 }, { "epoch": 1.4655593551538837, "grad_norm": 0.6593573689460754, "learning_rate": 2.5574010747435273e-05, "loss": 0.0327, "step": 12000 }, { "epoch": 1.477772349780166, "grad_norm": 0.2940855026245117, "learning_rate": 2.5370460836997234e-05, "loss": 0.0336, "step": 12100 }, { "epoch": 1.4899853444064486, "grad_norm": 0.45900267362594604, "learning_rate": 2.5166910926559195e-05, "loss": 0.0242, "step": 12200 }, { "epoch": 1.5021983390327307, "grad_norm": 2.2023909091949463, "learning_rate": 2.4963361016121153e-05, "loss": 0.0348, "step": 12300 }, { "epoch": 1.5144113336590133, "grad_norm": 0.12489739805459976, "learning_rate": 2.4759811105683115e-05, "loss": 0.0335, "step": 12400 }, { "epoch": 1.5266243282852956, "grad_norm": 1.0575867891311646, "learning_rate": 2.4556261195245076e-05, "loss": 0.0199, "step": 12500 }, { "epoch": 1.538837322911578, "grad_norm": 1.7309564352035522, "learning_rate": 2.4352711284807037e-05, "loss": 0.0316, "step": 12600 }, { "epoch": 1.5510503175378603, "grad_norm": 0.925658643245697, "learning_rate": 2.4149161374369e-05, "loss": 0.0445, "step": 12700 }, { "epoch": 1.5632633121641426, "grad_norm": 0.48667579889297485, "learning_rate": 2.3945611463930957e-05, "loss": 0.0437, "step": 12800 }, { "epoch": 1.5754763067904252, "grad_norm": 0.11213243752717972, "learning_rate": 2.3742061553492918e-05, "loss": 0.0387, "step": 12900 }, { "epoch": 1.5876893014167073, "grad_norm": 0.14116732776165009, "learning_rate": 2.353851164305488e-05, "loss": 0.033, "step": 13000 }, { "epoch": 1.5999022960429898, "grad_norm": 0.686268150806427, "learning_rate": 2.333496173261684e-05, "loss": 0.0281, "step": 13100 }, { "epoch": 1.612115290669272, "grad_norm": 0.4795430898666382, "learning_rate": 2.31314118221788e-05, "loss": 0.0436, "step": 13200 }, { "epoch": 1.6243282852955545, "grad_norm": 0.026416413486003876, "learning_rate": 2.292786191174076e-05, "loss": 0.0343, "step": 13300 }, { "epoch": 1.6365412799218368, "grad_norm": 0.582073986530304, "learning_rate": 2.2724312001302718e-05, "loss": 0.0312, "step": 13400 }, { "epoch": 1.6487542745481192, "grad_norm": 1.669487476348877, "learning_rate": 2.252076209086468e-05, "loss": 0.0384, "step": 13500 }, { "epoch": 1.6609672691744015, "grad_norm": 0.19379857182502747, "learning_rate": 2.231721218042664e-05, "loss": 0.0322, "step": 13600 }, { "epoch": 1.6731802638006839, "grad_norm": 4.540911674499512, "learning_rate": 2.2113662269988602e-05, "loss": 0.0363, "step": 13700 }, { "epoch": 1.6853932584269664, "grad_norm": 0.24804505705833435, "learning_rate": 2.1910112359550563e-05, "loss": 0.0326, "step": 13800 }, { "epoch": 1.6976062530532485, "grad_norm": 1.8535521030426025, "learning_rate": 2.1706562449112524e-05, "loss": 0.0316, "step": 13900 }, { "epoch": 1.709819247679531, "grad_norm": 0.04862889647483826, "learning_rate": 2.1503012538674482e-05, "loss": 0.0248, "step": 14000 }, { "epoch": 1.7220322423058134, "grad_norm": 0.3953320384025574, "learning_rate": 2.1299462628236444e-05, "loss": 0.0393, "step": 14100 }, { "epoch": 1.7342452369320958, "grad_norm": 0.5966042876243591, "learning_rate": 2.1095912717798405e-05, "loss": 0.0358, "step": 14200 }, { "epoch": 1.746458231558378, "grad_norm": 0.1555975377559662, "learning_rate": 2.0892362807360366e-05, "loss": 0.0425, "step": 14300 }, { "epoch": 1.7586712261846604, "grad_norm": 0.8556230068206787, "learning_rate": 2.0688812896922328e-05, "loss": 0.0267, "step": 14400 }, { "epoch": 1.770884220810943, "grad_norm": 0.03833279386162758, "learning_rate": 2.048526298648429e-05, "loss": 0.034, "step": 14500 }, { "epoch": 1.783097215437225, "grad_norm": 0.043861281126737595, "learning_rate": 2.0281713076046247e-05, "loss": 0.0288, "step": 14600 }, { "epoch": 1.7953102100635077, "grad_norm": 0.28712257742881775, "learning_rate": 2.0078163165608208e-05, "loss": 0.0285, "step": 14700 }, { "epoch": 1.8075232046897898, "grad_norm": 1.3535864353179932, "learning_rate": 1.987461325517017e-05, "loss": 0.0377, "step": 14800 }, { "epoch": 1.8197361993160723, "grad_norm": 3.164818048477173, "learning_rate": 1.967106334473213e-05, "loss": 0.0334, "step": 14900 }, { "epoch": 1.8319491939423547, "grad_norm": 0.08736918866634369, "learning_rate": 1.9467513434294092e-05, "loss": 0.0294, "step": 15000 }, { "epoch": 1.844162188568637, "grad_norm": 1.25545072555542, "learning_rate": 1.926396352385605e-05, "loss": 0.0285, "step": 15100 }, { "epoch": 1.8563751831949193, "grad_norm": 0.030480826273560524, "learning_rate": 1.906041361341801e-05, "loss": 0.0328, "step": 15200 }, { "epoch": 1.8685881778212017, "grad_norm": 1.6334197521209717, "learning_rate": 1.8856863702979973e-05, "loss": 0.037, "step": 15300 }, { "epoch": 1.8808011724474842, "grad_norm": 1.2553733587265015, "learning_rate": 1.865331379254193e-05, "loss": 0.0256, "step": 15400 }, { "epoch": 1.8930141670737664, "grad_norm": 0.061297524720430374, "learning_rate": 1.8449763882103892e-05, "loss": 0.0276, "step": 15500 }, { "epoch": 1.905227161700049, "grad_norm": 1.0915943384170532, "learning_rate": 1.8246213971665853e-05, "loss": 0.0362, "step": 15600 }, { "epoch": 1.9174401563263312, "grad_norm": 0.020990842953324318, "learning_rate": 1.8042664061227815e-05, "loss": 0.025, "step": 15700 }, { "epoch": 1.9296531509526136, "grad_norm": 0.09211856126785278, "learning_rate": 1.7839114150789773e-05, "loss": 0.0265, "step": 15800 }, { "epoch": 1.941866145578896, "grad_norm": 1.5800979137420654, "learning_rate": 1.7635564240351734e-05, "loss": 0.0256, "step": 15900 }, { "epoch": 1.9540791402051783, "grad_norm": 0.39250850677490234, "learning_rate": 1.7432014329913695e-05, "loss": 0.0249, "step": 16000 }, { "epoch": 1.9662921348314608, "grad_norm": 0.8597753047943115, "learning_rate": 1.7228464419475657e-05, "loss": 0.0355, "step": 16100 }, { "epoch": 1.978505129457743, "grad_norm": 0.16734100878238678, "learning_rate": 1.7024914509037618e-05, "loss": 0.032, "step": 16200 }, { "epoch": 1.9907181240840255, "grad_norm": 0.11750225722789764, "learning_rate": 1.682136459859958e-05, "loss": 0.0227, "step": 16300 }, { "epoch": 2.0, "eval_f1": 0.9197428223035141, "eval_loss": 0.036899276077747345, "eval_precision": 0.9117582417582417, "eval_recall": 0.9278684857973608, "eval_runtime": 75.3931, "eval_samples_per_second": 108.604, "eval_steps_per_second": 13.582, "step": 16376 }, { "epoch": 2.0029311187103076, "grad_norm": 0.6276179552078247, "learning_rate": 1.6617814688161537e-05, "loss": 0.0326, "step": 16400 }, { "epoch": 2.01514411333659, "grad_norm": 0.27882876992225647, "learning_rate": 1.64142647777235e-05, "loss": 0.0206, "step": 16500 }, { "epoch": 2.0273571079628727, "grad_norm": 0.9930168986320496, "learning_rate": 1.621071486728546e-05, "loss": 0.0135, "step": 16600 }, { "epoch": 2.039570102589155, "grad_norm": 0.21392406523227692, "learning_rate": 1.600716495684742e-05, "loss": 0.028, "step": 16700 }, { "epoch": 2.0517830972154374, "grad_norm": 2.1995363235473633, "learning_rate": 1.5803615046409382e-05, "loss": 0.0273, "step": 16800 }, { "epoch": 2.0639960918417195, "grad_norm": 1.91357421875, "learning_rate": 1.560006513597134e-05, "loss": 0.0152, "step": 16900 }, { "epoch": 2.076209086468002, "grad_norm": 0.057265687733888626, "learning_rate": 1.53965152255333e-05, "loss": 0.0206, "step": 17000 }, { "epoch": 2.088422081094284, "grad_norm": 0.05291162431240082, "learning_rate": 1.5192965315095261e-05, "loss": 0.022, "step": 17100 }, { "epoch": 2.1006350757205667, "grad_norm": 2.424394369125366, "learning_rate": 1.4989415404657223e-05, "loss": 0.0178, "step": 17200 }, { "epoch": 2.112848070346849, "grad_norm": 8.053882598876953, "learning_rate": 1.4785865494219184e-05, "loss": 0.0256, "step": 17300 }, { "epoch": 2.1250610649731314, "grad_norm": 1.606079339981079, "learning_rate": 1.4582315583781145e-05, "loss": 0.017, "step": 17400 }, { "epoch": 2.137274059599414, "grad_norm": 0.26984503865242004, "learning_rate": 1.4378765673343103e-05, "loss": 0.0202, "step": 17500 }, { "epoch": 2.149487054225696, "grad_norm": 0.044966306537389755, "learning_rate": 1.4175215762905064e-05, "loss": 0.0234, "step": 17600 }, { "epoch": 2.1617000488519786, "grad_norm": 0.05067300796508789, "learning_rate": 1.3971665852467026e-05, "loss": 0.0263, "step": 17700 }, { "epoch": 2.1739130434782608, "grad_norm": 0.5125128030776978, "learning_rate": 1.3768115942028985e-05, "loss": 0.0216, "step": 17800 }, { "epoch": 2.1861260381045433, "grad_norm": 0.04719540849328041, "learning_rate": 1.3564566031590947e-05, "loss": 0.0256, "step": 17900 }, { "epoch": 2.1983390327308254, "grad_norm": 0.11627175658941269, "learning_rate": 1.3361016121152908e-05, "loss": 0.0185, "step": 18000 }, { "epoch": 2.210552027357108, "grad_norm": 0.2016720473766327, "learning_rate": 1.3157466210714866e-05, "loss": 0.0111, "step": 18100 }, { "epoch": 2.22276502198339, "grad_norm": 1.6914150714874268, "learning_rate": 1.2953916300276827e-05, "loss": 0.0237, "step": 18200 }, { "epoch": 2.2349780166096727, "grad_norm": 0.3582985997200012, "learning_rate": 1.2750366389838789e-05, "loss": 0.0188, "step": 18300 }, { "epoch": 2.247191011235955, "grad_norm": 0.9769508838653564, "learning_rate": 1.254681647940075e-05, "loss": 0.024, "step": 18400 }, { "epoch": 2.2594040058622373, "grad_norm": 0.03454025089740753, "learning_rate": 1.2343266568962711e-05, "loss": 0.0307, "step": 18500 }, { "epoch": 2.27161700048852, "grad_norm": 0.0919230654835701, "learning_rate": 1.2139716658524671e-05, "loss": 0.0183, "step": 18600 }, { "epoch": 2.283829995114802, "grad_norm": 0.05342525988817215, "learning_rate": 1.1936166748086632e-05, "loss": 0.0295, "step": 18700 }, { "epoch": 2.2960429897410846, "grad_norm": 0.11520762741565704, "learning_rate": 1.1732616837648592e-05, "loss": 0.0187, "step": 18800 }, { "epoch": 2.308255984367367, "grad_norm": 1.8612200021743774, "learning_rate": 1.1529066927210551e-05, "loss": 0.0228, "step": 18900 }, { "epoch": 2.3204689789936492, "grad_norm": 0.9779945611953735, "learning_rate": 1.1325517016772513e-05, "loss": 0.0182, "step": 19000 }, { "epoch": 2.332681973619932, "grad_norm": 1.9669654369354248, "learning_rate": 1.1121967106334474e-05, "loss": 0.0247, "step": 19100 }, { "epoch": 2.344894968246214, "grad_norm": 0.1722841113805771, "learning_rate": 1.0918417195896434e-05, "loss": 0.0206, "step": 19200 }, { "epoch": 2.3571079628724965, "grad_norm": 0.1652793437242508, "learning_rate": 1.0714867285458395e-05, "loss": 0.0146, "step": 19300 }, { "epoch": 2.3693209574987786, "grad_norm": 0.07285087555646896, "learning_rate": 1.0511317375020356e-05, "loss": 0.0151, "step": 19400 }, { "epoch": 2.381533952125061, "grad_norm": 2.59061861038208, "learning_rate": 1.0307767464582316e-05, "loss": 0.0192, "step": 19500 }, { "epoch": 2.3937469467513433, "grad_norm": 0.02776254341006279, "learning_rate": 1.0104217554144277e-05, "loss": 0.0245, "step": 19600 }, { "epoch": 2.405959941377626, "grad_norm": 0.48207101225852966, "learning_rate": 9.900667643706239e-06, "loss": 0.0147, "step": 19700 }, { "epoch": 2.4181729360039084, "grad_norm": 0.7725105285644531, "learning_rate": 9.697117733268198e-06, "loss": 0.0206, "step": 19800 }, { "epoch": 2.4303859306301905, "grad_norm": 1.8201816082000732, "learning_rate": 9.493567822830158e-06, "loss": 0.0205, "step": 19900 }, { "epoch": 2.442598925256473, "grad_norm": 0.2930428385734558, "learning_rate": 9.29001791239212e-06, "loss": 0.0163, "step": 20000 }, { "epoch": 2.454811919882755, "grad_norm": 0.7441920638084412, "learning_rate": 9.086468001954079e-06, "loss": 0.0181, "step": 20100 }, { "epoch": 2.4670249145090377, "grad_norm": 0.5970872640609741, "learning_rate": 8.88291809151604e-06, "loss": 0.0172, "step": 20200 }, { "epoch": 2.47923790913532, "grad_norm": 0.17312058806419373, "learning_rate": 8.679368181078002e-06, "loss": 0.0163, "step": 20300 }, { "epoch": 2.4914509037616024, "grad_norm": 0.26520836353302, "learning_rate": 8.475818270639961e-06, "loss": 0.016, "step": 20400 }, { "epoch": 2.5036638983878845, "grad_norm": 0.08623456209897995, "learning_rate": 8.272268360201922e-06, "loss": 0.018, "step": 20500 }, { "epoch": 2.515876893014167, "grad_norm": 0.16404370963573456, "learning_rate": 8.068718449763882e-06, "loss": 0.0164, "step": 20600 }, { "epoch": 2.5280898876404496, "grad_norm": 0.051970474421978, "learning_rate": 7.865168539325843e-06, "loss": 0.0203, "step": 20700 }, { "epoch": 2.5403028822667317, "grad_norm": 0.08457406610250473, "learning_rate": 7.661618628887805e-06, "loss": 0.0211, "step": 20800 }, { "epoch": 2.5525158768930143, "grad_norm": 0.35134220123291016, "learning_rate": 7.4580687184497635e-06, "loss": 0.018, "step": 20900 }, { "epoch": 2.5647288715192964, "grad_norm": 0.487570196390152, "learning_rate": 7.254518808011725e-06, "loss": 0.0281, "step": 21000 }, { "epoch": 2.576941866145579, "grad_norm": 2.1460368633270264, "learning_rate": 7.050968897573685e-06, "loss": 0.0196, "step": 21100 }, { "epoch": 2.5891548607718615, "grad_norm": 0.3036395311355591, "learning_rate": 6.847418987135645e-06, "loss": 0.0191, "step": 21200 }, { "epoch": 2.6013678553981436, "grad_norm": 0.3689348101615906, "learning_rate": 6.643869076697606e-06, "loss": 0.0173, "step": 21300 }, { "epoch": 2.6135808500244258, "grad_norm": 1.0098440647125244, "learning_rate": 6.440319166259568e-06, "loss": 0.0162, "step": 21400 }, { "epoch": 2.6257938446507083, "grad_norm": 0.30733248591423035, "learning_rate": 6.236769255821528e-06, "loss": 0.0194, "step": 21500 }, { "epoch": 2.638006839276991, "grad_norm": 0.4835430085659027, "learning_rate": 6.0332193453834885e-06, "loss": 0.0295, "step": 21600 }, { "epoch": 2.650219833903273, "grad_norm": 0.041551847010850906, "learning_rate": 5.829669434945449e-06, "loss": 0.0209, "step": 21700 }, { "epoch": 2.6624328285295555, "grad_norm": 1.990522027015686, "learning_rate": 5.6261195245074095e-06, "loss": 0.0269, "step": 21800 }, { "epoch": 2.6746458231558377, "grad_norm": 0.04139232635498047, "learning_rate": 5.42256961406937e-06, "loss": 0.0226, "step": 21900 }, { "epoch": 2.68685881778212, "grad_norm": 0.9341286420822144, "learning_rate": 5.219019703631331e-06, "loss": 0.0201, "step": 22000 }, { "epoch": 2.6990718124084028, "grad_norm": 0.11153418570756912, "learning_rate": 5.015469793193292e-06, "loss": 0.0222, "step": 22100 }, { "epoch": 2.711284807034685, "grad_norm": 1.0574121475219727, "learning_rate": 4.811919882755251e-06, "loss": 0.0212, "step": 22200 }, { "epoch": 2.723497801660967, "grad_norm": 0.9357222318649292, "learning_rate": 4.608369972317213e-06, "loss": 0.0219, "step": 22300 }, { "epoch": 2.7357107962872496, "grad_norm": 0.18769215047359467, "learning_rate": 4.404820061879173e-06, "loss": 0.0192, "step": 22400 }, { "epoch": 2.747923790913532, "grad_norm": 1.0952208042144775, "learning_rate": 4.201270151441134e-06, "loss": 0.0165, "step": 22500 }, { "epoch": 2.7601367855398142, "grad_norm": 0.046009525656700134, "learning_rate": 3.997720241003094e-06, "loss": 0.0161, "step": 22600 }, { "epoch": 2.772349780166097, "grad_norm": 0.3359615206718445, "learning_rate": 3.794170330565055e-06, "loss": 0.0198, "step": 22700 }, { "epoch": 2.784562774792379, "grad_norm": 0.03583957999944687, "learning_rate": 3.590620420127015e-06, "loss": 0.0157, "step": 22800 }, { "epoch": 2.7967757694186615, "grad_norm": 2.4570398330688477, "learning_rate": 3.3870705096889755e-06, "loss": 0.0183, "step": 22900 }, { "epoch": 2.808988764044944, "grad_norm": 0.0799492597579956, "learning_rate": 3.1835205992509364e-06, "loss": 0.0154, "step": 23000 }, { "epoch": 2.821201758671226, "grad_norm": 0.17097431421279907, "learning_rate": 2.979970688812897e-06, "loss": 0.0208, "step": 23100 }, { "epoch": 2.8334147532975087, "grad_norm": 0.042323142290115356, "learning_rate": 2.776420778374858e-06, "loss": 0.0107, "step": 23200 }, { "epoch": 2.845627747923791, "grad_norm": 0.6305797100067139, "learning_rate": 2.5728708679368183e-06, "loss": 0.0225, "step": 23300 }, { "epoch": 2.8578407425500734, "grad_norm": 0.05080363526940346, "learning_rate": 2.3693209574987788e-06, "loss": 0.0238, "step": 23400 }, { "epoch": 2.8700537371763555, "grad_norm": 0.04388800263404846, "learning_rate": 2.1657710470607397e-06, "loss": 0.0184, "step": 23500 }, { "epoch": 2.882266731802638, "grad_norm": 2.4991371631622314, "learning_rate": 1.9622211366226997e-06, "loss": 0.0196, "step": 23600 }, { "epoch": 2.89447972642892, "grad_norm": 0.059519946575164795, "learning_rate": 1.7586712261846606e-06, "loss": 0.0156, "step": 23700 }, { "epoch": 2.9066927210552027, "grad_norm": 0.044085703790187836, "learning_rate": 1.5551213157466213e-06, "loss": 0.0161, "step": 23800 }, { "epoch": 2.9189057156814853, "grad_norm": 0.024006502702832222, "learning_rate": 1.3515714053085818e-06, "loss": 0.0172, "step": 23900 }, { "epoch": 2.9311187103077674, "grad_norm": 0.7654680609703064, "learning_rate": 1.1480214948705422e-06, "loss": 0.0165, "step": 24000 }, { "epoch": 2.94333170493405, "grad_norm": 0.8878483772277832, "learning_rate": 9.444715844325028e-07, "loss": 0.0174, "step": 24100 }, { "epoch": 2.955544699560332, "grad_norm": 3.2117550373077393, "learning_rate": 7.409216739944635e-07, "loss": 0.0171, "step": 24200 }, { "epoch": 2.9677576941866146, "grad_norm": 0.6114596128463745, "learning_rate": 5.373717635564241e-07, "loss": 0.0136, "step": 24300 }, { "epoch": 2.979970688812897, "grad_norm": 0.0984087809920311, "learning_rate": 3.3382185311838467e-07, "loss": 0.0166, "step": 24400 }, { "epoch": 2.9921836834391793, "grad_norm": 0.01637178845703602, "learning_rate": 1.3027194268034525e-07, "loss": 0.0133, "step": 24500 }, { "epoch": 3.0, "eval_f1": 0.9273861231763003, "eval_loss": 0.03446226194500923, "eval_precision": 0.9235803016858918, "eval_recall": 0.9312234399463207, "eval_runtime": 62.6474, "eval_samples_per_second": 130.7, "eval_steps_per_second": 16.345, "step": 24564 } ], "logging_steps": 100, "max_steps": 24564, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.283942398980096e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }