{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996011396011396, "eval_steps": 500, "global_step": 1314, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022792022792022793, "grad_norm": 7.939623134358423, "learning_rate": 7.575757575757576e-07, "loss": 0.8288, "step": 10 }, { "epoch": 0.045584045584045586, "grad_norm": 2.0350179182199772, "learning_rate": 1.5151515151515152e-06, "loss": 0.7833, "step": 20 }, { "epoch": 0.06837606837606838, "grad_norm": 1.0632794368243688, "learning_rate": 2.2727272727272728e-06, "loss": 0.7257, "step": 30 }, { "epoch": 0.09116809116809117, "grad_norm": 0.971986180487933, "learning_rate": 3.0303030303030305e-06, "loss": 0.7099, "step": 40 }, { "epoch": 0.11396011396011396, "grad_norm": 0.8814920783889958, "learning_rate": 3.7878787878787882e-06, "loss": 0.7004, "step": 50 }, { "epoch": 0.13675213675213677, "grad_norm": 0.8443308007282861, "learning_rate": 4.5454545454545455e-06, "loss": 0.6762, "step": 60 }, { "epoch": 0.15954415954415954, "grad_norm": 0.709097742036024, "learning_rate": 4.999875799605111e-06, "loss": 0.6695, "step": 70 }, { "epoch": 0.18233618233618235, "grad_norm": 0.7799598698600848, "learning_rate": 4.998478689774871e-06, "loss": 0.6681, "step": 80 }, { "epoch": 0.20512820512820512, "grad_norm": 0.5397139825176268, "learning_rate": 4.9955301078471286e-06, "loss": 0.6484, "step": 90 }, { "epoch": 0.22792022792022792, "grad_norm": 0.3704778229353924, "learning_rate": 4.991031922183084e-06, "loss": 0.6466, "step": 100 }, { "epoch": 0.25071225071225073, "grad_norm": 0.3442083289060042, "learning_rate": 4.984986983046283e-06, "loss": 0.6365, "step": 110 }, { "epoch": 0.27350427350427353, "grad_norm": 0.31979944611269634, "learning_rate": 4.977399120796549e-06, "loss": 0.6441, "step": 120 }, { "epoch": 0.2962962962962963, "grad_norm": 0.29438713846578946, "learning_rate": 4.968273143462887e-06, "loss": 0.6447, "step": 130 }, { "epoch": 0.3190883190883191, "grad_norm": 0.2982721034150172, "learning_rate": 4.957614833696889e-06, "loss": 0.6466, "step": 140 }, { "epoch": 0.3418803418803419, "grad_norm": 0.32028462240794187, "learning_rate": 4.945430945108575e-06, "loss": 0.6389, "step": 150 }, { "epoch": 0.3646723646723647, "grad_norm": 0.3083537166612179, "learning_rate": 4.93172919798698e-06, "loss": 0.6439, "step": 160 }, { "epoch": 0.38746438746438744, "grad_norm": 0.3558047153696352, "learning_rate": 4.916518274408218e-06, "loss": 0.6355, "step": 170 }, { "epoch": 0.41025641025641024, "grad_norm": 0.301027902860248, "learning_rate": 4.899807812734103e-06, "loss": 0.641, "step": 180 }, { "epoch": 0.43304843304843305, "grad_norm": 0.3011301029589234, "learning_rate": 4.881608401504832e-06, "loss": 0.6334, "step": 190 }, { "epoch": 0.45584045584045585, "grad_norm": 0.31176406471924095, "learning_rate": 4.861931572729577e-06, "loss": 0.6414, "step": 200 }, { "epoch": 0.47863247863247865, "grad_norm": 0.2941314332078632, "learning_rate": 4.840789794579267e-06, "loss": 0.6376, "step": 210 }, { "epoch": 0.5014245014245015, "grad_norm": 0.3032662943816638, "learning_rate": 4.818196463486153e-06, "loss": 0.6413, "step": 220 }, { "epoch": 0.5242165242165242, "grad_norm": 0.3253021562144048, "learning_rate": 4.794165895655202e-06, "loss": 0.6368, "step": 230 }, { "epoch": 0.5470085470085471, "grad_norm": 0.3229937623065352, "learning_rate": 4.768713317992671e-06, "loss": 0.6325, "step": 240 }, { "epoch": 0.5698005698005698, "grad_norm": 0.3204508336530559, "learning_rate": 4.741854858457611e-06, "loss": 0.6375, "step": 250 }, { "epoch": 0.5925925925925926, "grad_norm": 0.30853844481701553, "learning_rate": 4.7136075358424395e-06, "loss": 0.6356, "step": 260 }, { "epoch": 0.6153846153846154, "grad_norm": 0.3074064078008898, "learning_rate": 4.683989248989013e-06, "loss": 0.6327, "step": 270 }, { "epoch": 0.6381766381766382, "grad_norm": 0.31500570229793684, "learning_rate": 4.653018765447087e-06, "loss": 0.6435, "step": 280 }, { "epoch": 0.6609686609686609, "grad_norm": 0.30835495767035387, "learning_rate": 4.620715709582298e-06, "loss": 0.6321, "step": 290 }, { "epoch": 0.6837606837606838, "grad_norm": 0.2881600384981173, "learning_rate": 4.587100550141236e-06, "loss": 0.6301, "step": 300 }, { "epoch": 0.7065527065527065, "grad_norm": 0.29905772670860925, "learning_rate": 4.5521945872814845e-06, "loss": 0.6299, "step": 310 }, { "epoch": 0.7293447293447294, "grad_norm": 0.3068335419493767, "learning_rate": 4.5160199390748236e-06, "loss": 0.6305, "step": 320 }, { "epoch": 0.7521367521367521, "grad_norm": 0.2891580482632436, "learning_rate": 4.478599527492173e-06, "loss": 0.6271, "step": 330 }, { "epoch": 0.7749287749287749, "grad_norm": 0.29454699017937813, "learning_rate": 4.439957063879152e-06, "loss": 0.6329, "step": 340 }, { "epoch": 0.7977207977207977, "grad_norm": 0.31337493137815403, "learning_rate": 4.400117033931438e-06, "loss": 0.64, "step": 350 }, { "epoch": 0.8205128205128205, "grad_norm": 0.3167590518512679, "learning_rate": 4.3591046821794754e-06, "loss": 0.6317, "step": 360 }, { "epoch": 0.8433048433048433, "grad_norm": 0.313772837534301, "learning_rate": 4.316945995992346e-06, "loss": 0.6254, "step": 370 }, { "epoch": 0.8660968660968661, "grad_norm": 0.3184367407791849, "learning_rate": 4.273667689110936e-06, "loss": 0.6377, "step": 380 }, { "epoch": 0.8888888888888888, "grad_norm": 0.29474903561274024, "learning_rate": 4.229297184720847e-06, "loss": 0.6363, "step": 390 }, { "epoch": 0.9116809116809117, "grad_norm": 0.30322284745054784, "learning_rate": 4.18386259807577e-06, "loss": 0.6395, "step": 400 }, { "epoch": 0.9344729344729344, "grad_norm": 0.2736767008916395, "learning_rate": 4.13739271868232e-06, "loss": 0.6357, "step": 410 }, { "epoch": 0.9572649572649573, "grad_norm": 0.295007606721812, "learning_rate": 4.089916992057649e-06, "loss": 0.6226, "step": 420 }, { "epoch": 0.98005698005698, "grad_norm": 0.32204463497685043, "learning_rate": 4.041465501071366e-06, "loss": 0.6356, "step": 430 }, { "epoch": 0.9982905982905983, "eval_loss": 0.6262302994728088, "eval_runtime": 441.0164, "eval_samples_per_second": 26.809, "eval_steps_per_second": 0.419, "step": 438 }, { "epoch": 1.0034188034188034, "grad_norm": 0.3033117452535514, "learning_rate": 3.992068946883608e-06, "loss": 0.6488, "step": 440 }, { "epoch": 1.0262108262108263, "grad_norm": 0.3336102520884439, "learning_rate": 3.9417586294913315e-06, "loss": 0.6059, "step": 450 }, { "epoch": 1.049002849002849, "grad_norm": 0.3492495215760836, "learning_rate": 3.8905664278951494e-06, "loss": 0.5995, "step": 460 }, { "epoch": 1.0717948717948718, "grad_norm": 0.28190114431686303, "learning_rate": 3.838524779899283e-06, "loss": 0.6004, "step": 470 }, { "epoch": 1.0945868945868946, "grad_norm": 0.2899953882392186, "learning_rate": 3.7856666615574422e-06, "loss": 0.6079, "step": 480 }, { "epoch": 1.1173789173789175, "grad_norm": 0.30732761620175586, "learning_rate": 3.7320255662776246e-06, "loss": 0.6128, "step": 490 }, { "epoch": 1.1401709401709401, "grad_norm": 0.275405736183233, "learning_rate": 3.6776354835991117e-06, "loss": 0.6027, "step": 500 }, { "epoch": 1.162962962962963, "grad_norm": 0.30542058057406496, "learning_rate": 3.6225308776550844e-06, "loss": 0.6004, "step": 510 }, { "epoch": 1.1857549857549858, "grad_norm": 0.2972872770936463, "learning_rate": 3.566746665334519e-06, "loss": 0.6062, "step": 520 }, { "epoch": 1.2085470085470085, "grad_norm": 0.28044746494299844, "learning_rate": 3.5103181941571846e-06, "loss": 0.5991, "step": 530 }, { "epoch": 1.2313390313390313, "grad_norm": 0.30457238046373325, "learning_rate": 3.4532812198757874e-06, "loss": 0.6093, "step": 540 }, { "epoch": 1.2541310541310542, "grad_norm": 0.2895337299494955, "learning_rate": 3.395671883819429e-06, "loss": 0.6058, "step": 550 }, { "epoch": 1.2769230769230768, "grad_norm": 0.295953760365921, "learning_rate": 3.3375266899927463e-06, "loss": 0.6073, "step": 560 }, { "epoch": 1.2997150997150997, "grad_norm": 0.29208828454656655, "learning_rate": 3.2788824819452476e-06, "loss": 0.6012, "step": 570 }, { "epoch": 1.3225071225071225, "grad_norm": 0.2678624328258118, "learning_rate": 3.2197764194254932e-06, "loss": 0.5996, "step": 580 }, { "epoch": 1.3452991452991454, "grad_norm": 0.2716891251090373, "learning_rate": 3.1602459548349164e-06, "loss": 0.5996, "step": 590 }, { "epoch": 1.368091168091168, "grad_norm": 0.3004576495092077, "learning_rate": 3.10032880949621e-06, "loss": 0.605, "step": 600 }, { "epoch": 1.390883190883191, "grad_norm": 0.28456743972947146, "learning_rate": 3.0400629497513094e-06, "loss": 0.605, "step": 610 }, { "epoch": 1.4136752136752135, "grad_norm": 0.2873858164658089, "learning_rate": 2.979486562904117e-06, "loss": 0.6077, "step": 620 }, { "epoch": 1.4364672364672364, "grad_norm": 0.27995682227117297, "learning_rate": 2.918638033023219e-06, "loss": 0.6055, "step": 630 }, { "epoch": 1.4592592592592593, "grad_norm": 0.289004332372419, "learning_rate": 2.857555916619922e-06, "loss": 0.6003, "step": 640 }, { "epoch": 1.4820512820512821, "grad_norm": 0.2925129355682463, "learning_rate": 2.796278918217017e-06, "loss": 0.5981, "step": 650 }, { "epoch": 1.504843304843305, "grad_norm": 0.2713728224771852, "learning_rate": 2.734845865823767e-06, "loss": 0.598, "step": 660 }, { "epoch": 1.5276353276353276, "grad_norm": 0.29369960587494426, "learning_rate": 2.6732956863326325e-06, "loss": 0.6054, "step": 670 }, { "epoch": 1.5504273504273505, "grad_norm": 0.2655509653545259, "learning_rate": 2.611667380853355e-06, "loss": 0.594, "step": 680 }, { "epoch": 1.573219373219373, "grad_norm": 0.3145351916199109, "learning_rate": 2.55e-06, "loss": 0.598, "step": 690 }, { "epoch": 1.596011396011396, "grad_norm": 0.27573518377728895, "learning_rate": 2.4883326191466466e-06, "loss": 0.6136, "step": 700 }, { "epoch": 1.6188034188034188, "grad_norm": 0.2828300340817601, "learning_rate": 2.426704313667368e-06, "loss": 0.6006, "step": 710 }, { "epoch": 1.6415954415954417, "grad_norm": 0.29475363456554354, "learning_rate": 2.3651541341762333e-06, "loss": 0.6051, "step": 720 }, { "epoch": 1.6643874643874645, "grad_norm": 0.2890470497181963, "learning_rate": 2.3037210817829835e-06, "loss": 0.5981, "step": 730 }, { "epoch": 1.6871794871794872, "grad_norm": 0.28845012018426935, "learning_rate": 2.2424440833800796e-06, "loss": 0.5991, "step": 740 }, { "epoch": 1.7099715099715098, "grad_norm": 0.2835419877186715, "learning_rate": 2.1813619669767817e-06, "loss": 0.5964, "step": 750 }, { "epoch": 1.7327635327635327, "grad_norm": 0.29055250125081744, "learning_rate": 2.120513437095884e-06, "loss": 0.6005, "step": 760 }, { "epoch": 1.7555555555555555, "grad_norm": 0.288897734344261, "learning_rate": 2.0599370502486917e-06, "loss": 0.597, "step": 770 }, { "epoch": 1.7783475783475784, "grad_norm": 0.28162979046487097, "learning_rate": 1.9996711905037915e-06, "loss": 0.6041, "step": 780 }, { "epoch": 1.8011396011396013, "grad_norm": 0.2772022044429406, "learning_rate": 1.9397540451650843e-06, "loss": 0.6016, "step": 790 }, { "epoch": 1.823931623931624, "grad_norm": 0.2841659521355681, "learning_rate": 1.8802235805745077e-06, "loss": 0.5884, "step": 800 }, { "epoch": 1.8467236467236468, "grad_norm": 0.29090863852456733, "learning_rate": 1.8211175180547533e-06, "loss": 0.5963, "step": 810 }, { "epoch": 1.8695156695156694, "grad_norm": 0.27841775727742474, "learning_rate": 1.762473310007255e-06, "loss": 0.594, "step": 820 }, { "epoch": 1.8923076923076922, "grad_norm": 0.27287314371643295, "learning_rate": 1.7043281161805714e-06, "loss": 0.5978, "step": 830 }, { "epoch": 1.915099715099715, "grad_norm": 0.28477444190437035, "learning_rate": 1.6467187801242131e-06, "loss": 0.6025, "step": 840 }, { "epoch": 1.937891737891738, "grad_norm": 0.28987744586835007, "learning_rate": 1.5896818058428166e-06, "loss": 0.6004, "step": 850 }, { "epoch": 1.9606837606837608, "grad_norm": 0.2763531701152957, "learning_rate": 1.5332533346654826e-06, "loss": 0.6016, "step": 860 }, { "epoch": 1.9834757834757835, "grad_norm": 0.29341222330429434, "learning_rate": 1.4774691223449159e-06, "loss": 0.5992, "step": 870 }, { "epoch": 1.9994301994301993, "eval_loss": 0.620370626449585, "eval_runtime": 440.9211, "eval_samples_per_second": 26.814, "eval_steps_per_second": 0.42, "step": 877 }, { "epoch": 2.006837606837607, "grad_norm": 0.26702658685178077, "learning_rate": 1.42236451640089e-06, "loss": 0.6223, "step": 880 }, { "epoch": 2.0296296296296297, "grad_norm": 0.2819037147993013, "learning_rate": 1.3679744337223768e-06, "loss": 0.5694, "step": 890 }, { "epoch": 2.0524216524216525, "grad_norm": 0.2794385525179804, "learning_rate": 1.3143333384425585e-06, "loss": 0.5851, "step": 900 }, { "epoch": 2.0752136752136754, "grad_norm": 0.2827246071333301, "learning_rate": 1.2614752201007169e-06, "loss": 0.583, "step": 910 }, { "epoch": 2.098005698005698, "grad_norm": 0.26946250848001335, "learning_rate": 1.2094335721048521e-06, "loss": 0.586, "step": 920 }, { "epoch": 2.1207977207977207, "grad_norm": 0.2698210904894004, "learning_rate": 1.1582413705086686e-06, "loss": 0.5836, "step": 930 }, { "epoch": 2.1435897435897435, "grad_norm": 0.2776399603049312, "learning_rate": 1.107931053116392e-06, "loss": 0.5783, "step": 940 }, { "epoch": 2.1663817663817664, "grad_norm": 0.2686298030477811, "learning_rate": 1.0585344989286345e-06, "loss": 0.5792, "step": 950 }, { "epoch": 2.1891737891737892, "grad_norm": 0.27121001172495685, "learning_rate": 1.0100830079423518e-06, "loss": 0.584, "step": 960 }, { "epoch": 2.211965811965812, "grad_norm": 0.28156262878987165, "learning_rate": 9.626072813176803e-07, "loss": 0.5829, "step": 970 }, { "epoch": 2.234757834757835, "grad_norm": 0.2793558026264157, "learning_rate": 9.161374019242304e-07, "loss": 0.5726, "step": 980 }, { "epoch": 2.2575498575498574, "grad_norm": 0.27490393600639224, "learning_rate": 8.707028152791524e-07, "loss": 0.5816, "step": 990 }, { "epoch": 2.2803418803418802, "grad_norm": 0.26292747820053386, "learning_rate": 8.26332310889065e-07, "loss": 0.5816, "step": 1000 }, { "epoch": 2.303133903133903, "grad_norm": 0.2803446183567512, "learning_rate": 7.830540040076546e-07, "loss": 0.5792, "step": 1010 }, { "epoch": 2.325925925925926, "grad_norm": 0.28158948177424703, "learning_rate": 7.408953178205249e-07, "loss": 0.5821, "step": 1020 }, { "epoch": 2.348717948717949, "grad_norm": 0.2678755980850509, "learning_rate": 6.998829660685626e-07, "loss": 0.5836, "step": 1030 }, { "epoch": 2.3715099715099717, "grad_norm": 0.26037999933130573, "learning_rate": 6.600429361208491e-07, "loss": 0.5811, "step": 1040 }, { "epoch": 2.394301994301994, "grad_norm": 0.2867823279885847, "learning_rate": 6.214004725078274e-07, "loss": 0.5806, "step": 1050 }, { "epoch": 2.417094017094017, "grad_norm": 0.26863402452402996, "learning_rate": 5.839800609251777e-07, "loss": 0.5887, "step": 1060 }, { "epoch": 2.43988603988604, "grad_norm": 0.265061150385046, "learning_rate": 5.478054127185157e-07, "loss": 0.5918, "step": 1070 }, { "epoch": 2.4626780626780627, "grad_norm": 0.2685780972904722, "learning_rate": 5.128994498587639e-07, "loss": 0.5871, "step": 1080 }, { "epoch": 2.4854700854700855, "grad_norm": 0.26335109003234897, "learning_rate": 4.792842904177032e-07, "loss": 0.5844, "step": 1090 }, { "epoch": 2.5082621082621084, "grad_norm": 0.2823774919948753, "learning_rate": 4.46981234552913e-07, "loss": 0.591, "step": 1100 }, { "epoch": 2.5310541310541312, "grad_norm": 0.261979212542654, "learning_rate": 4.1601075101098676e-07, "loss": 0.5839, "step": 1110 }, { "epoch": 2.5538461538461537, "grad_norm": 0.260899837972818, "learning_rate": 3.863924641575613e-07, "loss": 0.5823, "step": 1120 }, { "epoch": 2.5766381766381765, "grad_norm": 0.26776773055192804, "learning_rate": 3.581451415423892e-07, "loss": 0.5816, "step": 1130 }, { "epoch": 2.5994301994301994, "grad_norm": 0.2613472498758897, "learning_rate": 3.312866820073303e-07, "loss": 0.5812, "step": 1140 }, { "epoch": 2.6222222222222222, "grad_norm": 0.2712900892168655, "learning_rate": 3.0583410434479846e-07, "loss": 0.5813, "step": 1150 }, { "epoch": 2.645014245014245, "grad_norm": 0.27281353275393094, "learning_rate": 2.8180353651384805e-07, "loss": 0.5822, "step": 1160 }, { "epoch": 2.667806267806268, "grad_norm": 0.27217716252121404, "learning_rate": 2.592102054207339e-07, "loss": 0.5854, "step": 1170 }, { "epoch": 2.690598290598291, "grad_norm": 0.2612477447152697, "learning_rate": 2.3806842727042292e-07, "loss": 0.5804, "step": 1180 }, { "epoch": 2.7133903133903132, "grad_norm": 0.25720618860334776, "learning_rate": 2.1839159849516864e-07, "loss": 0.5813, "step": 1190 }, { "epoch": 2.736182336182336, "grad_norm": 0.26319244714607254, "learning_rate": 2.0019218726589726e-07, "loss": 0.5846, "step": 1200 }, { "epoch": 2.758974358974359, "grad_norm": 0.2571851886815192, "learning_rate": 1.8348172559178291e-07, "loss": 0.5878, "step": 1210 }, { "epoch": 2.781766381766382, "grad_norm": 0.2600480075434856, "learning_rate": 1.682708020130203e-07, "loss": 0.5775, "step": 1220 }, { "epoch": 2.8045584045584047, "grad_norm": 0.25280104808605824, "learning_rate": 1.545690548914252e-07, "loss": 0.576, "step": 1230 }, { "epoch": 2.827350427350427, "grad_norm": 0.2602309905026317, "learning_rate": 1.4238516630311088e-07, "loss": 0.5909, "step": 1240 }, { "epoch": 2.8501424501424504, "grad_norm": 0.2606814585720784, "learning_rate": 1.3172685653711352e-07, "loss": 0.5879, "step": 1250 }, { "epoch": 2.872934472934473, "grad_norm": 0.26378742051033477, "learning_rate": 1.2260087920345132e-07, "loss": 0.585, "step": 1260 }, { "epoch": 2.8957264957264957, "grad_norm": 0.265747998445119, "learning_rate": 1.1501301695371678e-07, "loss": 0.5858, "step": 1270 }, { "epoch": 2.9185185185185185, "grad_norm": 0.26130400943369125, "learning_rate": 1.0896807781691582e-07, "loss": 0.576, "step": 1280 }, { "epoch": 2.9413105413105414, "grad_norm": 0.26374210440166906, "learning_rate": 1.0446989215287212e-07, "loss": 0.5759, "step": 1290 }, { "epoch": 2.9641025641025642, "grad_norm": 0.2712373924034364, "learning_rate": 1.0152131022512929e-07, "loss": 0.5772, "step": 1300 }, { "epoch": 2.9868945868945866, "grad_norm": 0.2696124467190187, "learning_rate": 1.0012420039488912e-07, "loss": 0.5874, "step": 1310 }, { "epoch": 2.996011396011396, "eval_loss": 0.6207034587860107, "eval_runtime": 445.0791, "eval_samples_per_second": 26.564, "eval_steps_per_second": 0.416, "step": 1314 }, { "epoch": 2.996011396011396, "step": 1314, "total_flos": 2755219238682624.0, "train_loss": 0.6124119740461469, "train_runtime": 70824.8381, "train_samples_per_second": 9.514, "train_steps_per_second": 0.019 } ], "logging_steps": 10, "max_steps": 1314, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2755219238682624.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }