{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996011396011396,
  "eval_steps": 500,
  "global_step": 1314,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022792022792022793,
      "grad_norm": 7.939623134358423,
      "learning_rate": 7.575757575757576e-07,
      "loss": 0.8288,
      "step": 10
    },
    {
      "epoch": 0.045584045584045586,
      "grad_norm": 2.0350179182199772,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 0.7833,
      "step": 20
    },
    {
      "epoch": 0.06837606837606838,
      "grad_norm": 1.0632794368243688,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 0.7257,
      "step": 30
    },
    {
      "epoch": 0.09116809116809117,
      "grad_norm": 0.971986180487933,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 0.7099,
      "step": 40
    },
    {
      "epoch": 0.11396011396011396,
      "grad_norm": 0.8814920783889958,
      "learning_rate": 3.7878787878787882e-06,
      "loss": 0.7004,
      "step": 50
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 0.8443308007282861,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.6762,
      "step": 60
    },
    {
      "epoch": 0.15954415954415954,
      "grad_norm": 0.709097742036024,
      "learning_rate": 4.999875799605111e-06,
      "loss": 0.6695,
      "step": 70
    },
    {
      "epoch": 0.18233618233618235,
      "grad_norm": 0.7799598698600848,
      "learning_rate": 4.998478689774871e-06,
      "loss": 0.6681,
      "step": 80
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.5397139825176268,
      "learning_rate": 4.9955301078471286e-06,
      "loss": 0.6484,
      "step": 90
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 0.3704778229353924,
      "learning_rate": 4.991031922183084e-06,
      "loss": 0.6466,
      "step": 100
    },
    {
      "epoch": 0.25071225071225073,
      "grad_norm": 0.3442083289060042,
      "learning_rate": 4.984986983046283e-06,
      "loss": 0.6365,
      "step": 110
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 0.31979944611269634,
      "learning_rate": 4.977399120796549e-06,
      "loss": 0.6441,
      "step": 120
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.29438713846578946,
      "learning_rate": 4.968273143462887e-06,
      "loss": 0.6447,
      "step": 130
    },
    {
      "epoch": 0.3190883190883191,
      "grad_norm": 0.2982721034150172,
      "learning_rate": 4.957614833696889e-06,
      "loss": 0.6466,
      "step": 140
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.32028462240794187,
      "learning_rate": 4.945430945108575e-06,
      "loss": 0.6389,
      "step": 150
    },
    {
      "epoch": 0.3646723646723647,
      "grad_norm": 0.3083537166612179,
      "learning_rate": 4.93172919798698e-06,
      "loss": 0.6439,
      "step": 160
    },
    {
      "epoch": 0.38746438746438744,
      "grad_norm": 0.3558047153696352,
      "learning_rate": 4.916518274408218e-06,
      "loss": 0.6355,
      "step": 170
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.301027902860248,
      "learning_rate": 4.899807812734103e-06,
      "loss": 0.641,
      "step": 180
    },
    {
      "epoch": 0.43304843304843305,
      "grad_norm": 0.3011301029589234,
      "learning_rate": 4.881608401504832e-06,
      "loss": 0.6334,
      "step": 190
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.31176406471924095,
      "learning_rate": 4.861931572729577e-06,
      "loss": 0.6414,
      "step": 200
    },
    {
      "epoch": 0.47863247863247865,
      "grad_norm": 0.2941314332078632,
      "learning_rate": 4.840789794579267e-06,
      "loss": 0.6376,
      "step": 210
    },
    {
      "epoch": 0.5014245014245015,
      "grad_norm": 0.3032662943816638,
      "learning_rate": 4.818196463486153e-06,
      "loss": 0.6413,
      "step": 220
    },
    {
      "epoch": 0.5242165242165242,
      "grad_norm": 0.3253021562144048,
      "learning_rate": 4.794165895655202e-06,
      "loss": 0.6368,
      "step": 230
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 0.3229937623065352,
      "learning_rate": 4.768713317992671e-06,
      "loss": 0.6325,
      "step": 240
    },
    {
      "epoch": 0.5698005698005698,
      "grad_norm": 0.3204508336530559,
      "learning_rate": 4.741854858457611e-06,
      "loss": 0.6375,
      "step": 250
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.30853844481701553,
      "learning_rate": 4.7136075358424395e-06,
      "loss": 0.6356,
      "step": 260
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.3074064078008898,
      "learning_rate": 4.683989248989013e-06,
      "loss": 0.6327,
      "step": 270
    },
    {
      "epoch": 0.6381766381766382,
      "grad_norm": 0.31500570229793684,
      "learning_rate": 4.653018765447087e-06,
      "loss": 0.6435,
      "step": 280
    },
    {
      "epoch": 0.6609686609686609,
      "grad_norm": 0.30835495767035387,
      "learning_rate": 4.620715709582298e-06,
      "loss": 0.6321,
      "step": 290
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.2881600384981173,
      "learning_rate": 4.587100550141236e-06,
      "loss": 0.6301,
      "step": 300
    },
    {
      "epoch": 0.7065527065527065,
      "grad_norm": 0.29905772670860925,
      "learning_rate": 4.5521945872814845e-06,
      "loss": 0.6299,
      "step": 310
    },
    {
      "epoch": 0.7293447293447294,
      "grad_norm": 0.3068335419493767,
      "learning_rate": 4.5160199390748236e-06,
      "loss": 0.6305,
      "step": 320
    },
    {
      "epoch": 0.7521367521367521,
      "grad_norm": 0.2891580482632436,
      "learning_rate": 4.478599527492173e-06,
      "loss": 0.6271,
      "step": 330
    },
    {
      "epoch": 0.7749287749287749,
      "grad_norm": 0.29454699017937813,
      "learning_rate": 4.439957063879152e-06,
      "loss": 0.6329,
      "step": 340
    },
    {
      "epoch": 0.7977207977207977,
      "grad_norm": 0.31337493137815403,
      "learning_rate": 4.400117033931438e-06,
      "loss": 0.64,
      "step": 350
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.3167590518512679,
      "learning_rate": 4.3591046821794754e-06,
      "loss": 0.6317,
      "step": 360
    },
    {
      "epoch": 0.8433048433048433,
      "grad_norm": 0.313772837534301,
      "learning_rate": 4.316945995992346e-06,
      "loss": 0.6254,
      "step": 370
    },
    {
      "epoch": 0.8660968660968661,
      "grad_norm": 0.3184367407791849,
      "learning_rate": 4.273667689110936e-06,
      "loss": 0.6377,
      "step": 380
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.29474903561274024,
      "learning_rate": 4.229297184720847e-06,
      "loss": 0.6363,
      "step": 390
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 0.30322284745054784,
      "learning_rate": 4.18386259807577e-06,
      "loss": 0.6395,
      "step": 400
    },
    {
      "epoch": 0.9344729344729344,
      "grad_norm": 0.2736767008916395,
      "learning_rate": 4.13739271868232e-06,
      "loss": 0.6357,
      "step": 410
    },
    {
      "epoch": 0.9572649572649573,
      "grad_norm": 0.295007606721812,
      "learning_rate": 4.089916992057649e-06,
      "loss": 0.6226,
      "step": 420
    },
    {
      "epoch": 0.98005698005698,
      "grad_norm": 0.32204463497685043,
      "learning_rate": 4.041465501071366e-06,
      "loss": 0.6356,
      "step": 430
    },
    {
      "epoch": 0.9982905982905983,
      "eval_loss": 0.6262302994728088,
      "eval_runtime": 441.0164,
      "eval_samples_per_second": 26.809,
      "eval_steps_per_second": 0.419,
      "step": 438
    },
    {
      "epoch": 1.0034188034188034,
      "grad_norm": 0.3033117452535514,
      "learning_rate": 3.992068946883608e-06,
      "loss": 0.6488,
      "step": 440
    },
    {
      "epoch": 1.0262108262108263,
      "grad_norm": 0.3336102520884439,
      "learning_rate": 3.9417586294913315e-06,
      "loss": 0.6059,
      "step": 450
    },
    {
      "epoch": 1.049002849002849,
      "grad_norm": 0.3492495215760836,
      "learning_rate": 3.8905664278951494e-06,
      "loss": 0.5995,
      "step": 460
    },
    {
      "epoch": 1.0717948717948718,
      "grad_norm": 0.28190114431686303,
      "learning_rate": 3.838524779899283e-06,
      "loss": 0.6004,
      "step": 470
    },
    {
      "epoch": 1.0945868945868946,
      "grad_norm": 0.2899953882392186,
      "learning_rate": 3.7856666615574422e-06,
      "loss": 0.6079,
      "step": 480
    },
    {
      "epoch": 1.1173789173789175,
      "grad_norm": 0.30732761620175586,
      "learning_rate": 3.7320255662776246e-06,
      "loss": 0.6128,
      "step": 490
    },
    {
      "epoch": 1.1401709401709401,
      "grad_norm": 0.275405736183233,
      "learning_rate": 3.6776354835991117e-06,
      "loss": 0.6027,
      "step": 500
    },
    {
      "epoch": 1.162962962962963,
      "grad_norm": 0.30542058057406496,
      "learning_rate": 3.6225308776550844e-06,
      "loss": 0.6004,
      "step": 510
    },
    {
      "epoch": 1.1857549857549858,
      "grad_norm": 0.2972872770936463,
      "learning_rate": 3.566746665334519e-06,
      "loss": 0.6062,
      "step": 520
    },
    {
      "epoch": 1.2085470085470085,
      "grad_norm": 0.28044746494299844,
      "learning_rate": 3.5103181941571846e-06,
      "loss": 0.5991,
      "step": 530
    },
    {
      "epoch": 1.2313390313390313,
      "grad_norm": 0.30457238046373325,
      "learning_rate": 3.4532812198757874e-06,
      "loss": 0.6093,
      "step": 540
    },
    {
      "epoch": 1.2541310541310542,
      "grad_norm": 0.2895337299494955,
      "learning_rate": 3.395671883819429e-06,
      "loss": 0.6058,
      "step": 550
    },
    {
      "epoch": 1.2769230769230768,
      "grad_norm": 0.295953760365921,
      "learning_rate": 3.3375266899927463e-06,
      "loss": 0.6073,
      "step": 560
    },
    {
      "epoch": 1.2997150997150997,
      "grad_norm": 0.29208828454656655,
      "learning_rate": 3.2788824819452476e-06,
      "loss": 0.6012,
      "step": 570
    },
    {
      "epoch": 1.3225071225071225,
      "grad_norm": 0.2678624328258118,
      "learning_rate": 3.2197764194254932e-06,
      "loss": 0.5996,
      "step": 580
    },
    {
      "epoch": 1.3452991452991454,
      "grad_norm": 0.2716891251090373,
      "learning_rate": 3.1602459548349164e-06,
      "loss": 0.5996,
      "step": 590
    },
    {
      "epoch": 1.368091168091168,
      "grad_norm": 0.3004576495092077,
      "learning_rate": 3.10032880949621e-06,
      "loss": 0.605,
      "step": 600
    },
    {
      "epoch": 1.390883190883191,
      "grad_norm": 0.28456743972947146,
      "learning_rate": 3.0400629497513094e-06,
      "loss": 0.605,
      "step": 610
    },
    {
      "epoch": 1.4136752136752135,
      "grad_norm": 0.2873858164658089,
      "learning_rate": 2.979486562904117e-06,
      "loss": 0.6077,
      "step": 620
    },
    {
      "epoch": 1.4364672364672364,
      "grad_norm": 0.27995682227117297,
      "learning_rate": 2.918638033023219e-06,
      "loss": 0.6055,
      "step": 630
    },
    {
      "epoch": 1.4592592592592593,
      "grad_norm": 0.289004332372419,
      "learning_rate": 2.857555916619922e-06,
      "loss": 0.6003,
      "step": 640
    },
    {
      "epoch": 1.4820512820512821,
      "grad_norm": 0.2925129355682463,
      "learning_rate": 2.796278918217017e-06,
      "loss": 0.5981,
      "step": 650
    },
    {
      "epoch": 1.504843304843305,
      "grad_norm": 0.2713728224771852,
      "learning_rate": 2.734845865823767e-06,
      "loss": 0.598,
      "step": 660
    },
    {
      "epoch": 1.5276353276353276,
      "grad_norm": 0.29369960587494426,
      "learning_rate": 2.6732956863326325e-06,
      "loss": 0.6054,
      "step": 670
    },
    {
      "epoch": 1.5504273504273505,
      "grad_norm": 0.2655509653545259,
      "learning_rate": 2.611667380853355e-06,
      "loss": 0.594,
      "step": 680
    },
    {
      "epoch": 1.573219373219373,
      "grad_norm": 0.3145351916199109,
      "learning_rate": 2.55e-06,
      "loss": 0.598,
      "step": 690
    },
    {
      "epoch": 1.596011396011396,
      "grad_norm": 0.27573518377728895,
      "learning_rate": 2.4883326191466466e-06,
      "loss": 0.6136,
      "step": 700
    },
    {
      "epoch": 1.6188034188034188,
      "grad_norm": 0.2828300340817601,
      "learning_rate": 2.426704313667368e-06,
      "loss": 0.6006,
      "step": 710
    },
    {
      "epoch": 1.6415954415954417,
      "grad_norm": 0.29475363456554354,
      "learning_rate": 2.3651541341762333e-06,
      "loss": 0.6051,
      "step": 720
    },
    {
      "epoch": 1.6643874643874645,
      "grad_norm": 0.2890470497181963,
      "learning_rate": 2.3037210817829835e-06,
      "loss": 0.5981,
      "step": 730
    },
    {
      "epoch": 1.6871794871794872,
      "grad_norm": 0.28845012018426935,
      "learning_rate": 2.2424440833800796e-06,
      "loss": 0.5991,
      "step": 740
    },
    {
      "epoch": 1.7099715099715098,
      "grad_norm": 0.2835419877186715,
      "learning_rate": 2.1813619669767817e-06,
      "loss": 0.5964,
      "step": 750
    },
    {
      "epoch": 1.7327635327635327,
      "grad_norm": 0.29055250125081744,
      "learning_rate": 2.120513437095884e-06,
      "loss": 0.6005,
      "step": 760
    },
    {
      "epoch": 1.7555555555555555,
      "grad_norm": 0.288897734344261,
      "learning_rate": 2.0599370502486917e-06,
      "loss": 0.597,
      "step": 770
    },
    {
      "epoch": 1.7783475783475784,
      "grad_norm": 0.28162979046487097,
      "learning_rate": 1.9996711905037915e-06,
      "loss": 0.6041,
      "step": 780
    },
    {
      "epoch": 1.8011396011396013,
      "grad_norm": 0.2772022044429406,
      "learning_rate": 1.9397540451650843e-06,
      "loss": 0.6016,
      "step": 790
    },
    {
      "epoch": 1.823931623931624,
      "grad_norm": 0.2841659521355681,
      "learning_rate": 1.8802235805745077e-06,
      "loss": 0.5884,
      "step": 800
    },
    {
      "epoch": 1.8467236467236468,
      "grad_norm": 0.29090863852456733,
      "learning_rate": 1.8211175180547533e-06,
      "loss": 0.5963,
      "step": 810
    },
    {
      "epoch": 1.8695156695156694,
      "grad_norm": 0.27841775727742474,
      "learning_rate": 1.762473310007255e-06,
      "loss": 0.594,
      "step": 820
    },
    {
      "epoch": 1.8923076923076922,
      "grad_norm": 0.27287314371643295,
      "learning_rate": 1.7043281161805714e-06,
      "loss": 0.5978,
      "step": 830
    },
    {
      "epoch": 1.915099715099715,
      "grad_norm": 0.28477444190437035,
      "learning_rate": 1.6467187801242131e-06,
      "loss": 0.6025,
      "step": 840
    },
    {
      "epoch": 1.937891737891738,
      "grad_norm": 0.28987744586835007,
      "learning_rate": 1.5896818058428166e-06,
      "loss": 0.6004,
      "step": 850
    },
    {
      "epoch": 1.9606837606837608,
      "grad_norm": 0.2763531701152957,
      "learning_rate": 1.5332533346654826e-06,
      "loss": 0.6016,
      "step": 860
    },
    {
      "epoch": 1.9834757834757835,
      "grad_norm": 0.29341222330429434,
      "learning_rate": 1.4774691223449159e-06,
      "loss": 0.5992,
      "step": 870
    },
    {
      "epoch": 1.9994301994301993,
      "eval_loss": 0.620370626449585,
      "eval_runtime": 440.9211,
      "eval_samples_per_second": 26.814,
      "eval_steps_per_second": 0.42,
      "step": 877
    },
    {
      "epoch": 2.006837606837607,
      "grad_norm": 0.26702658685178077,
      "learning_rate": 1.42236451640089e-06,
      "loss": 0.6223,
      "step": 880
    },
    {
      "epoch": 2.0296296296296297,
      "grad_norm": 0.2819037147993013,
      "learning_rate": 1.3679744337223768e-06,
      "loss": 0.5694,
      "step": 890
    },
    {
      "epoch": 2.0524216524216525,
      "grad_norm": 0.2794385525179804,
      "learning_rate": 1.3143333384425585e-06,
      "loss": 0.5851,
      "step": 900
    },
    {
      "epoch": 2.0752136752136754,
      "grad_norm": 0.2827246071333301,
      "learning_rate": 1.2614752201007169e-06,
      "loss": 0.583,
      "step": 910
    },
    {
      "epoch": 2.098005698005698,
      "grad_norm": 0.26946250848001335,
      "learning_rate": 1.2094335721048521e-06,
      "loss": 0.586,
      "step": 920
    },
    {
      "epoch": 2.1207977207977207,
      "grad_norm": 0.2698210904894004,
      "learning_rate": 1.1582413705086686e-06,
      "loss": 0.5836,
      "step": 930
    },
    {
      "epoch": 2.1435897435897435,
      "grad_norm": 0.2776399603049312,
      "learning_rate": 1.107931053116392e-06,
      "loss": 0.5783,
      "step": 940
    },
    {
      "epoch": 2.1663817663817664,
      "grad_norm": 0.2686298030477811,
      "learning_rate": 1.0585344989286345e-06,
      "loss": 0.5792,
      "step": 950
    },
    {
      "epoch": 2.1891737891737892,
      "grad_norm": 0.27121001172495685,
      "learning_rate": 1.0100830079423518e-06,
      "loss": 0.584,
      "step": 960
    },
    {
      "epoch": 2.211965811965812,
      "grad_norm": 0.28156262878987165,
      "learning_rate": 9.626072813176803e-07,
      "loss": 0.5829,
      "step": 970
    },
    {
      "epoch": 2.234757834757835,
      "grad_norm": 0.2793558026264157,
      "learning_rate": 9.161374019242304e-07,
      "loss": 0.5726,
      "step": 980
    },
    {
      "epoch": 2.2575498575498574,
      "grad_norm": 0.27490393600639224,
      "learning_rate": 8.707028152791524e-07,
      "loss": 0.5816,
      "step": 990
    },
    {
      "epoch": 2.2803418803418802,
      "grad_norm": 0.26292747820053386,
      "learning_rate": 8.26332310889065e-07,
      "loss": 0.5816,
      "step": 1000
    },
    {
      "epoch": 2.303133903133903,
      "grad_norm": 0.2803446183567512,
      "learning_rate": 7.830540040076546e-07,
      "loss": 0.5792,
      "step": 1010
    },
    {
      "epoch": 2.325925925925926,
      "grad_norm": 0.28158948177424703,
      "learning_rate": 7.408953178205249e-07,
      "loss": 0.5821,
      "step": 1020
    },
    {
      "epoch": 2.348717948717949,
      "grad_norm": 0.2678755980850509,
      "learning_rate": 6.998829660685626e-07,
      "loss": 0.5836,
      "step": 1030
    },
    {
      "epoch": 2.3715099715099717,
      "grad_norm": 0.26037999933130573,
      "learning_rate": 6.600429361208491e-07,
      "loss": 0.5811,
      "step": 1040
    },
    {
      "epoch": 2.394301994301994,
      "grad_norm": 0.2867823279885847,
      "learning_rate": 6.214004725078274e-07,
      "loss": 0.5806,
      "step": 1050
    },
    {
      "epoch": 2.417094017094017,
      "grad_norm": 0.26863402452402996,
      "learning_rate": 5.839800609251777e-07,
      "loss": 0.5887,
      "step": 1060
    },
    {
      "epoch": 2.43988603988604,
      "grad_norm": 0.265061150385046,
      "learning_rate": 5.478054127185157e-07,
      "loss": 0.5918,
      "step": 1070
    },
    {
      "epoch": 2.4626780626780627,
      "grad_norm": 0.2685780972904722,
      "learning_rate": 5.128994498587639e-07,
      "loss": 0.5871,
      "step": 1080
    },
    {
      "epoch": 2.4854700854700855,
      "grad_norm": 0.26335109003234897,
      "learning_rate": 4.792842904177032e-07,
      "loss": 0.5844,
      "step": 1090
    },
    {
      "epoch": 2.5082621082621084,
      "grad_norm": 0.2823774919948753,
      "learning_rate": 4.46981234552913e-07,
      "loss": 0.591,
      "step": 1100
    },
    {
      "epoch": 2.5310541310541312,
      "grad_norm": 0.261979212542654,
      "learning_rate": 4.1601075101098676e-07,
      "loss": 0.5839,
      "step": 1110
    },
    {
      "epoch": 2.5538461538461537,
      "grad_norm": 0.260899837972818,
      "learning_rate": 3.863924641575613e-07,
      "loss": 0.5823,
      "step": 1120
    },
    {
      "epoch": 2.5766381766381765,
      "grad_norm": 0.26776773055192804,
      "learning_rate": 3.581451415423892e-07,
      "loss": 0.5816,
      "step": 1130
    },
    {
      "epoch": 2.5994301994301994,
      "grad_norm": 0.2613472498758897,
      "learning_rate": 3.312866820073303e-07,
      "loss": 0.5812,
      "step": 1140
    },
    {
      "epoch": 2.6222222222222222,
      "grad_norm": 0.2712900892168655,
      "learning_rate": 3.0583410434479846e-07,
      "loss": 0.5813,
      "step": 1150
    },
    {
      "epoch": 2.645014245014245,
      "grad_norm": 0.27281353275393094,
      "learning_rate": 2.8180353651384805e-07,
      "loss": 0.5822,
      "step": 1160
    },
    {
      "epoch": 2.667806267806268,
      "grad_norm": 0.27217716252121404,
      "learning_rate": 2.592102054207339e-07,
      "loss": 0.5854,
      "step": 1170
    },
    {
      "epoch": 2.690598290598291,
      "grad_norm": 0.2612477447152697,
      "learning_rate": 2.3806842727042292e-07,
      "loss": 0.5804,
      "step": 1180
    },
    {
      "epoch": 2.7133903133903132,
      "grad_norm": 0.25720618860334776,
      "learning_rate": 2.1839159849516864e-07,
      "loss": 0.5813,
      "step": 1190
    },
    {
      "epoch": 2.736182336182336,
      "grad_norm": 0.26319244714607254,
      "learning_rate": 2.0019218726589726e-07,
      "loss": 0.5846,
      "step": 1200
    },
    {
      "epoch": 2.758974358974359,
      "grad_norm": 0.2571851886815192,
      "learning_rate": 1.8348172559178291e-07,
      "loss": 0.5878,
      "step": 1210
    },
    {
      "epoch": 2.781766381766382,
      "grad_norm": 0.2600480075434856,
      "learning_rate": 1.682708020130203e-07,
      "loss": 0.5775,
      "step": 1220
    },
    {
      "epoch": 2.8045584045584047,
      "grad_norm": 0.25280104808605824,
      "learning_rate": 1.545690548914252e-07,
      "loss": 0.576,
      "step": 1230
    },
    {
      "epoch": 2.827350427350427,
      "grad_norm": 0.2602309905026317,
      "learning_rate": 1.4238516630311088e-07,
      "loss": 0.5909,
      "step": 1240
    },
    {
      "epoch": 2.8501424501424504,
      "grad_norm": 0.2606814585720784,
      "learning_rate": 1.3172685653711352e-07,
      "loss": 0.5879,
      "step": 1250
    },
    {
      "epoch": 2.872934472934473,
      "grad_norm": 0.26378742051033477,
      "learning_rate": 1.2260087920345132e-07,
      "loss": 0.585,
      "step": 1260
    },
    {
      "epoch": 2.8957264957264957,
      "grad_norm": 0.265747998445119,
      "learning_rate": 1.1501301695371678e-07,
      "loss": 0.5858,
      "step": 1270
    },
    {
      "epoch": 2.9185185185185185,
      "grad_norm": 0.26130400943369125,
      "learning_rate": 1.0896807781691582e-07,
      "loss": 0.576,
      "step": 1280
    },
    {
      "epoch": 2.9413105413105414,
      "grad_norm": 0.26374210440166906,
      "learning_rate": 1.0446989215287212e-07,
      "loss": 0.5759,
      "step": 1290
    },
    {
      "epoch": 2.9641025641025642,
      "grad_norm": 0.2712373924034364,
      "learning_rate": 1.0152131022512929e-07,
      "loss": 0.5772,
      "step": 1300
    },
    {
      "epoch": 2.9868945868945866,
      "grad_norm": 0.2696124467190187,
      "learning_rate": 1.0012420039488912e-07,
      "loss": 0.5874,
      "step": 1310
    },
    {
      "epoch": 2.996011396011396,
      "eval_loss": 0.6207034587860107,
      "eval_runtime": 445.0791,
      "eval_samples_per_second": 26.564,
      "eval_steps_per_second": 0.416,
      "step": 1314
    },
    {
      "epoch": 2.996011396011396,
      "step": 1314,
      "total_flos": 2755219238682624.0,
      "train_loss": 0.6124119740461469,
      "train_runtime": 70824.8381,
      "train_samples_per_second": 9.514,
      "train_steps_per_second": 0.019
    }
  ],
  "logging_steps": 10,
  "max_steps": 1314,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2755219238682624.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}