{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011111111111111112, "grad_norm": 30.737470626831055, "learning_rate": 5.111111111111112e-07, "logits/chosen": 2.288069486618042, "logits/rejected": 2.3415634632110596, "logps/chosen": -148.15972900390625, "logps/rejected": -154.33299255371094, "loss": 1.1754, "nll_loss": 0.8885363936424255, "rewards/accuracies": 0.6050000190734863, "rewards/chosen": -14.815974235534668, "rewards/margins": 0.6173248291015625, "rewards/rejected": -15.433298110961914, "step": 50 }, { "epoch": 0.022222222222222223, "grad_norm": 27.839527130126953, "learning_rate": 1.0555555555555557e-06, "logits/chosen": 2.2870049476623535, "logits/rejected": 2.375169277191162, "logps/chosen": -139.32150268554688, "logps/rejected": -148.69989013671875, "loss": 0.9729, "nll_loss": 0.858982264995575, "rewards/accuracies": 0.6449999809265137, "rewards/chosen": -13.932150840759277, "rewards/margins": 0.937837541103363, "rewards/rejected": -14.869989395141602, "step": 100 }, { "epoch": 0.03333333333333333, "grad_norm": 84.59671783447266, "learning_rate": 1.6111111111111113e-06, "logits/chosen": 2.2526779174804688, "logits/rejected": 2.322312831878662, "logps/chosen": -135.03587341308594, "logps/rejected": -143.58334350585938, "loss": 0.9463, "nll_loss": 0.8337222933769226, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": -13.50358772277832, "rewards/margins": 0.8547466993331909, "rewards/rejected": -14.3583345413208, "step": 150 }, { "epoch": 0.044444444444444446, "grad_norm": 10.991082191467285, "learning_rate": 2.1555555555555558e-06, "logits/chosen": 2.231903314590454, "logits/rejected": 2.2660622596740723, "logps/chosen": -133.3765411376953, "logps/rejected": -142.4051513671875, "loss": 0.872, "nll_loss": 0.8225094676017761, "rewards/accuracies": 0.5950000286102295, "rewards/chosen": -13.337655067443848, "rewards/margins": 0.9028608202934265, "rewards/rejected": -14.240516662597656, "step": 200 }, { "epoch": 0.05555555555555555, "grad_norm": 45.91594314575195, "learning_rate": 2.7111111111111116e-06, "logits/chosen": 2.267564296722412, "logits/rejected": 2.3324620723724365, "logps/chosen": -140.41372680664062, "logps/rejected": -147.0218505859375, "loss": 0.9918, "nll_loss": 0.8563990592956543, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": -14.041373252868652, "rewards/margins": 0.6608125567436218, "rewards/rejected": -14.702186584472656, "step": 250 }, { "epoch": 0.06666666666666667, "grad_norm": 38.895626068115234, "learning_rate": 3.266666666666667e-06, "logits/chosen": 2.3046438694000244, "logits/rejected": 2.3641300201416016, "logps/chosen": -133.28237915039062, "logps/rejected": -141.36343383789062, "loss": 0.9501, "nll_loss": 0.8569705486297607, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": -13.328237533569336, "rewards/margins": 0.8081063628196716, "rewards/rejected": -14.136343955993652, "step": 300 }, { "epoch": 0.07777777777777778, "grad_norm": 66.35061645507812, "learning_rate": 3.8222222222222224e-06, "logits/chosen": 2.316657304763794, "logits/rejected": 2.3791918754577637, "logps/chosen": -139.16127014160156, "logps/rejected": -146.6514129638672, "loss": 1.0616, "nll_loss": 0.8685147166252136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.916126251220703, "rewards/margins": 0.7490136623382568, "rewards/rejected": -14.665142059326172, "step": 350 }, { "epoch": 0.08888888888888889, "grad_norm": 39.1428337097168, "learning_rate": 4.377777777777778e-06, "logits/chosen": 2.3417108058929443, "logits/rejected": 2.3877785205841064, "logps/chosen": -140.82168579101562, "logps/rejected": -149.19383239746094, "loss": 0.9138, "nll_loss": 0.842674195766449, "rewards/accuracies": 0.5849999785423279, "rewards/chosen": -14.082167625427246, "rewards/margins": 0.8372161388397217, "rewards/rejected": -14.91938591003418, "step": 400 }, { "epoch": 0.1, "grad_norm": 16.129215240478516, "learning_rate": 4.933333333333334e-06, "logits/chosen": 2.3277602195739746, "logits/rejected": 2.3971059322357178, "logps/chosen": -135.13775634765625, "logps/rejected": -147.26150512695312, "loss": 0.8023, "nll_loss": 0.8287579417228699, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": -13.513776779174805, "rewards/margins": 1.2123744487762451, "rewards/rejected": -14.726149559020996, "step": 450 }, { "epoch": 0.1111111111111111, "grad_norm": 63.364898681640625, "learning_rate": 5.4888888888888895e-06, "logits/chosen": 2.3398239612579346, "logits/rejected": 2.415525436401367, "logps/chosen": -137.08380126953125, "logps/rejected": -145.44190979003906, "loss": 1.0585, "nll_loss": 0.8642376661300659, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": -13.708379745483398, "rewards/margins": 0.8358100652694702, "rewards/rejected": -14.544190406799316, "step": 500 }, { "epoch": 0.12222222222222222, "grad_norm": 15.574223518371582, "learning_rate": 6.044444444444445e-06, "logits/chosen": 2.2965493202209473, "logits/rejected": 2.361276626586914, "logps/chosen": -134.34083557128906, "logps/rejected": -145.83859252929688, "loss": 0.799, "nll_loss": 0.8278245329856873, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -13.434082984924316, "rewards/margins": 1.1497764587402344, "rewards/rejected": -14.58385944366455, "step": 550 }, { "epoch": 0.13333333333333333, "grad_norm": 46.650978088378906, "learning_rate": 6.600000000000001e-06, "logits/chosen": 2.4101524353027344, "logits/rejected": 2.448826551437378, "logps/chosen": -140.3436737060547, "logps/rejected": -148.26324462890625, "loss": 1.034, "nll_loss": 0.8485268354415894, "rewards/accuracies": 0.6150000095367432, "rewards/chosen": -14.034366607666016, "rewards/margins": 0.7919567227363586, "rewards/rejected": -14.826324462890625, "step": 600 }, { "epoch": 0.14444444444444443, "grad_norm": 44.84783172607422, "learning_rate": 7.155555555555556e-06, "logits/chosen": 2.493147850036621, "logits/rejected": 2.565725803375244, "logps/chosen": -140.9664306640625, "logps/rejected": -151.43325805664062, "loss": 1.0343, "nll_loss": 0.8390085101127625, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": -14.096641540527344, "rewards/margins": 1.046683669090271, "rewards/rejected": -15.143324851989746, "step": 650 }, { "epoch": 0.15555555555555556, "grad_norm": 61.45585250854492, "learning_rate": 7.711111111111112e-06, "logits/chosen": 2.5712664127349854, "logits/rejected": 2.624903678894043, "logps/chosen": -141.75469970703125, "logps/rejected": -151.08155822753906, "loss": 0.8933, "nll_loss": 0.859018087387085, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -14.175471305847168, "rewards/margins": 0.9326856732368469, "rewards/rejected": -15.10815715789795, "step": 700 }, { "epoch": 0.16666666666666666, "grad_norm": 28.346845626831055, "learning_rate": 8.266666666666667e-06, "logits/chosen": 2.6571621894836426, "logits/rejected": 2.7124974727630615, "logps/chosen": -134.9234619140625, "logps/rejected": -164.70962524414062, "loss": 0.6756, "nll_loss": 0.8667213916778564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.492345809936523, "rewards/margins": 2.978616237640381, "rewards/rejected": -16.470964431762695, "step": 750 }, { "epoch": 0.17777777777777778, "grad_norm": 61.787044525146484, "learning_rate": 8.822222222222223e-06, "logits/chosen": 2.610713243484497, "logits/rejected": 2.6504781246185303, "logps/chosen": -138.24453735351562, "logps/rejected": -148.29908752441406, "loss": 0.9459, "nll_loss": 0.8650538921356201, "rewards/accuracies": 0.5950000286102295, "rewards/chosen": -13.824453353881836, "rewards/margins": 1.0054553747177124, "rewards/rejected": -14.82990837097168, "step": 800 }, { "epoch": 0.18888888888888888, "grad_norm": 9.88873291015625, "learning_rate": 9.377777777777779e-06, "logits/chosen": 2.614926338195801, "logits/rejected": 2.6556060314178467, "logps/chosen": -131.00535583496094, "logps/rejected": -151.04949951171875, "loss": 0.6585, "nll_loss": 0.8831941485404968, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -13.100536346435547, "rewards/margins": 2.0044116973876953, "rewards/rejected": -15.104948997497559, "step": 850 }, { "epoch": 0.2, "grad_norm": 82.63723754882812, "learning_rate": 9.933333333333334e-06, "logits/chosen": 2.5326764583587646, "logits/rejected": 2.5826892852783203, "logps/chosen": -138.1752471923828, "logps/rejected": -158.4066925048828, "loss": 0.7018, "nll_loss": 0.832364559173584, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -13.817523956298828, "rewards/margins": 2.023144245147705, "rewards/rejected": -15.840670585632324, "step": 900 }, { "epoch": 0.2111111111111111, "grad_norm": 3.078538179397583, "learning_rate": 9.999271944429139e-06, "logits/chosen": 2.560307264328003, "logits/rejected": 2.60050106048584, "logps/chosen": -134.33383178710938, "logps/rejected": -148.48223876953125, "loss": 0.8178, "nll_loss": 0.8627762794494629, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": -13.433382987976074, "rewards/margins": 1.414842128753662, "rewards/rejected": -14.848222732543945, "step": 950 }, { "epoch": 0.2222222222222222, "grad_norm": 68.0674057006836, "learning_rate": 9.996677405681096e-06, "logits/chosen": 2.545196533203125, "logits/rejected": 2.5989139080047607, "logps/chosen": -137.47396850585938, "logps/rejected": -163.21507263183594, "loss": 0.7496, "nll_loss": 0.8726359605789185, "rewards/accuracies": 0.6949999928474426, "rewards/chosen": -13.747394561767578, "rewards/margins": 2.5741100311279297, "rewards/rejected": -16.32150650024414, "step": 1000 }, { "epoch": 0.23333333333333334, "grad_norm": 57.82680892944336, "learning_rate": 9.992203820909906e-06, "logits/chosen": 2.542595624923706, "logits/rejected": 2.592430353164673, "logps/chosen": -142.53028869628906, "logps/rejected": -156.15875244140625, "loss": 0.9915, "nll_loss": 0.8822048306465149, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -14.253028869628906, "rewards/margins": 1.3628443479537964, "rewards/rejected": -15.615875244140625, "step": 1050 }, { "epoch": 0.24444444444444444, "grad_norm": 18.480113983154297, "learning_rate": 9.985852872447845e-06, "logits/chosen": 2.538330554962158, "logits/rejected": 2.6010327339172363, "logps/chosen": -138.7212371826172, "logps/rejected": -159.77830505371094, "loss": 0.8432, "nll_loss": 0.8557933568954468, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -13.872123718261719, "rewards/margins": 2.1057071685791016, "rewards/rejected": -15.977831840515137, "step": 1100 }, { "epoch": 0.25555555555555554, "grad_norm": 73.4820556640625, "learning_rate": 9.977626948626897e-06, "logits/chosen": 2.6203174591064453, "logits/rejected": 2.6719906330108643, "logps/chosen": -140.52574157714844, "logps/rejected": -156.38177490234375, "loss": 0.9128, "nll_loss": 0.8684940934181213, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.052573204040527, "rewards/margins": 1.5856045484542847, "rewards/rejected": -15.638178825378418, "step": 1150 }, { "epoch": 0.26666666666666666, "grad_norm": 23.718711853027344, "learning_rate": 9.967529142880592e-06, "logits/chosen": 2.6670925617218018, "logits/rejected": 2.732938528060913, "logps/chosen": -134.9943389892578, "logps/rejected": -154.99261474609375, "loss": 0.6533, "nll_loss": 0.8487300872802734, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": -13.499436378479004, "rewards/margins": 1.9998255968093872, "rewards/rejected": -15.499259948730469, "step": 1200 }, { "epoch": 0.2777777777777778, "grad_norm": 42.438804626464844, "learning_rate": 9.955563252580704e-06, "logits/chosen": 2.676227331161499, "logits/rejected": 2.715693473815918, "logps/chosen": -138.00128173828125, "logps/rejected": -156.00650024414062, "loss": 0.8255, "nll_loss": 0.8802525401115417, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -13.800128936767578, "rewards/margins": 1.8005197048187256, "rewards/rejected": -15.6006498336792, "step": 1250 }, { "epoch": 0.28888888888888886, "grad_norm": 10.877970695495605, "learning_rate": 9.941733777609204e-06, "logits/chosen": 2.751253366470337, "logits/rejected": 2.795524835586548, "logps/chosen": -139.14181518554688, "logps/rejected": -160.90316772460938, "loss": 0.7424, "nll_loss": 0.8603755235671997, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -13.91418170928955, "rewards/margins": 2.1761343479156494, "rewards/rejected": -16.090314865112305, "step": 1300 }, { "epoch": 0.3, "grad_norm": 47.730648040771484, "learning_rate": 9.926045918666045e-06, "logits/chosen": 2.7930471897125244, "logits/rejected": 2.8543944358825684, "logps/chosen": -139.95281982421875, "logps/rejected": -157.9696044921875, "loss": 0.7688, "nll_loss": 0.8698009252548218, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.995281219482422, "rewards/margins": 1.8016793727874756, "rewards/rejected": -15.796960830688477, "step": 1350 }, { "epoch": 0.3111111111111111, "grad_norm": 27.38691520690918, "learning_rate": 9.908505575313389e-06, "logits/chosen": 2.7655038833618164, "logits/rejected": 2.798898220062256, "logps/chosen": -144.7771759033203, "logps/rejected": -161.54562377929688, "loss": 0.7954, "nll_loss": 0.8934239149093628, "rewards/accuracies": 0.75, "rewards/chosen": -14.4777193069458, "rewards/margins": 1.676843523979187, "rewards/rejected": -16.154563903808594, "step": 1400 }, { "epoch": 0.32222222222222224, "grad_norm": 52.532676696777344, "learning_rate": 9.889119343757026e-06, "logits/chosen": 2.7126517295837402, "logits/rejected": 2.7499828338623047, "logps/chosen": -132.46609497070312, "logps/rejected": -155.1712646484375, "loss": 0.6659, "nll_loss": 0.8384760022163391, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -13.246609687805176, "rewards/margins": 2.270516872406006, "rewards/rejected": -15.517126083374023, "step": 1450 }, { "epoch": 0.3333333333333333, "grad_norm": 7.1437506675720215, "learning_rate": 9.867894514365802e-06, "logits/chosen": 2.688624858856201, "logits/rejected": 2.736680030822754, "logps/chosen": -134.9152374267578, "logps/rejected": -148.8809051513672, "loss": 0.8186, "nll_loss": 0.8558450937271118, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.491524696350098, "rewards/margins": 1.3965673446655273, "rewards/rejected": -14.888092041015625, "step": 1500 }, { "epoch": 0.34444444444444444, "grad_norm": 38.98169708251953, "learning_rate": 9.844839068930021e-06, "logits/chosen": 2.7444489002227783, "logits/rejected": 2.7875852584838867, "logps/chosen": -135.85693359375, "logps/rejected": -152.01834106445312, "loss": 0.8083, "nll_loss": 0.8617506623268127, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -13.585694313049316, "rewards/margins": 1.6161404848098755, "rewards/rejected": -15.201834678649902, "step": 1550 }, { "epoch": 0.35555555555555557, "grad_norm": 28.79446792602539, "learning_rate": 9.819961677659813e-06, "logits/chosen": 2.8199007511138916, "logits/rejected": 2.8766839504241943, "logps/chosen": -136.37872314453125, "logps/rejected": -157.2845458984375, "loss": 0.8224, "nll_loss": 0.8459619283676147, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -13.637870788574219, "rewards/margins": 2.0905823707580566, "rewards/rejected": -15.728453636169434, "step": 1600 }, { "epoch": 0.36666666666666664, "grad_norm": 13.561357498168945, "learning_rate": 9.793271695924621e-06, "logits/chosen": 2.8344268798828125, "logits/rejected": 2.87033748626709, "logps/chosen": -134.71429443359375, "logps/rejected": -148.44094848632812, "loss": 0.8539, "nll_loss": 0.842303991317749, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": -13.471430778503418, "rewards/margins": 1.3726661205291748, "rewards/rejected": -14.844097137451172, "step": 1650 }, { "epoch": 0.37777777777777777, "grad_norm": 59.226470947265625, "learning_rate": 9.76477916073504e-06, "logits/chosen": 2.7606427669525146, "logits/rejected": 2.8242993354797363, "logps/chosen": -145.2235107421875, "logps/rejected": -162.43003845214844, "loss": 0.7646, "nll_loss": 0.8559630513191223, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -14.522350311279297, "rewards/margins": 1.7206525802612305, "rewards/rejected": -16.243003845214844, "step": 1700 }, { "epoch": 0.3888888888888889, "grad_norm": 9.57642936706543, "learning_rate": 9.734494786968293e-06, "logits/chosen": 2.686143398284912, "logits/rejected": 2.721389055252075, "logps/chosen": -139.97434997558594, "logps/rejected": -162.78562927246094, "loss": 0.7372, "nll_loss": 0.8470394611358643, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -13.99743366241455, "rewards/margins": 2.281128168106079, "rewards/rejected": -16.278562545776367, "step": 1750 }, { "epoch": 0.4, "grad_norm": 96.98550415039062, "learning_rate": 9.702429963338812e-06, "logits/chosen": 2.745166540145874, "logits/rejected": 2.824406147003174, "logps/chosen": -132.31056213378906, "logps/rejected": -151.85853576660156, "loss": 0.6959, "nll_loss": 0.8725927472114563, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -13.231057167053223, "rewards/margins": 1.9547969102859497, "rewards/rejected": -15.1858549118042, "step": 1800 }, { "epoch": 0.4111111111111111, "grad_norm": 17.682653427124023, "learning_rate": 9.668596748115413e-06, "logits/chosen": 2.7395882606506348, "logits/rejected": 2.807523488998413, "logps/chosen": -140.53750610351562, "logps/rejected": -167.0639190673828, "loss": 0.6729, "nll_loss": 0.8412314057350159, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -14.053749084472656, "rewards/margins": 2.6526434421539307, "rewards/rejected": -16.70639419555664, "step": 1850 }, { "epoch": 0.4222222222222222, "grad_norm": 26.376707077026367, "learning_rate": 9.633007864586661e-06, "logits/chosen": 2.757847309112549, "logits/rejected": 2.8308401107788086, "logps/chosen": -135.72564697265625, "logps/rejected": -156.1707000732422, "loss": 0.701, "nll_loss": 0.8870025873184204, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -13.572566032409668, "rewards/margins": 2.0445029735565186, "rewards/rejected": -15.61706829071045, "step": 1900 }, { "epoch": 0.43333333333333335, "grad_norm": 5.650549411773682, "learning_rate": 9.595676696276173e-06, "logits/chosen": 2.8521478176116943, "logits/rejected": 2.886814594268799, "logps/chosen": -141.73439025878906, "logps/rejected": -159.01644897460938, "loss": 0.8613, "nll_loss": 0.8619416952133179, "rewards/accuracies": 0.6650000214576721, "rewards/chosen": -14.173439979553223, "rewards/margins": 1.728204369544983, "rewards/rejected": -15.901644706726074, "step": 1950 }, { "epoch": 0.4444444444444444, "grad_norm": 52.1768913269043, "learning_rate": 9.55661728190962e-06, "logits/chosen": 2.927558660507202, "logits/rejected": 2.9791042804718018, "logps/chosen": -137.81671142578125, "logps/rejected": -154.84564208984375, "loss": 0.7334, "nll_loss": 0.8633580207824707, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -13.781669616699219, "rewards/margins": 1.7028939723968506, "rewards/rejected": -15.484563827514648, "step": 2000 }, { "epoch": 0.45555555555555555, "grad_norm": 47.696746826171875, "learning_rate": 9.515844310135328e-06, "logits/chosen": 2.985607862472534, "logits/rejected": 3.0231430530548096, "logps/chosen": -141.2799835205078, "logps/rejected": -158.4896240234375, "loss": 0.8235, "nll_loss": 0.8734262585639954, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -14.127999305725098, "rewards/margins": 1.7209625244140625, "rewards/rejected": -15.848962783813477, "step": 2050 }, { "epoch": 0.4666666666666667, "grad_norm": 6.931493759155273, "learning_rate": 9.473373114000493e-06, "logits/chosen": 2.980579137802124, "logits/rejected": 3.0231592655181885, "logps/chosen": -140.94241333007812, "logps/rejected": -155.71998596191406, "loss": 0.835, "nll_loss": 0.8785473704338074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.094242095947266, "rewards/margins": 1.4777570962905884, "rewards/rejected": -15.571999549865723, "step": 2100 }, { "epoch": 0.4777777777777778, "grad_norm": 62.173519134521484, "learning_rate": 9.429219665185034e-06, "logits/chosen": 2.9382734298706055, "logits/rejected": 2.9916536808013916, "logps/chosen": -136.1079559326172, "logps/rejected": -159.69107055664062, "loss": 0.8086, "nll_loss": 0.8379454016685486, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -13.610795974731445, "rewards/margins": 2.3583104610443115, "rewards/rejected": -15.969106674194336, "step": 2150 }, { "epoch": 0.4888888888888889, "grad_norm": 41.61237335205078, "learning_rate": 9.38340056799531e-06, "logits/chosen": 2.808896541595459, "logits/rejected": 2.8540892601013184, "logps/chosen": -136.34718322753906, "logps/rejected": -153.9817352294922, "loss": 0.6651, "nll_loss": 0.8380148410797119, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.634718894958496, "rewards/margins": 1.763453722000122, "rewards/rejected": -15.398174285888672, "step": 2200 }, { "epoch": 0.5, "grad_norm": 38.237552642822266, "learning_rate": 9.335933053119906e-06, "logits/chosen": 2.745997667312622, "logits/rejected": 2.8198602199554443, "logps/chosen": -139.24932861328125, "logps/rejected": -162.5125732421875, "loss": 0.6868, "nll_loss": 0.8611648678779602, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -13.924934387207031, "rewards/margins": 2.3263232707977295, "rewards/rejected": -16.251256942749023, "step": 2250 }, { "epoch": 0.5111111111111111, "grad_norm": 30.992565155029297, "learning_rate": 9.286834971149891e-06, "logits/chosen": 2.708925485610962, "logits/rejected": 2.763185501098633, "logps/chosen": -140.6510009765625, "logps/rejected": -161.15457153320312, "loss": 0.7342, "nll_loss": 0.901926577091217, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -14.065099716186523, "rewards/margins": 2.0503571033477783, "rewards/rejected": -16.115459442138672, "step": 2300 }, { "epoch": 0.5222222222222223, "grad_norm": 29.499160766601562, "learning_rate": 9.23612478586593e-06, "logits/chosen": 2.7198758125305176, "logits/rejected": 2.781439781188965, "logps/chosen": -147.30389404296875, "logps/rejected": -168.24838256835938, "loss": 0.7896, "nll_loss": 0.8842223882675171, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": -14.730390548706055, "rewards/margins": 2.094449520111084, "rewards/rejected": -16.824838638305664, "step": 2350 }, { "epoch": 0.5333333333333333, "grad_norm": 19.278635025024414, "learning_rate": 9.184883117153579e-06, "logits/chosen": 2.8237617015838623, "logits/rejected": 2.852189064025879, "logps/chosen": -139.60552978515625, "logps/rejected": -160.8499755859375, "loss": 0.7499, "nll_loss": 0.8797626495361328, "rewards/accuracies": 0.7149999737739563, "rewards/chosen": -13.960552215576172, "rewards/margins": 2.1244430541992188, "rewards/rejected": -16.084997177124023, "step": 2400 }, { "epoch": 0.5444444444444444, "grad_norm": 16.882116317749023, "learning_rate": 9.131037805083889e-06, "logits/chosen": 2.7883031368255615, "logits/rejected": 2.829986810684204, "logps/chosen": -140.319580078125, "logps/rejected": -163.9326629638672, "loss": 0.7635, "nll_loss": 0.889370858669281, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -14.031957626342773, "rewards/margins": 2.36130690574646, "rewards/rejected": -16.393264770507812, "step": 2450 }, { "epoch": 0.5555555555555556, "grad_norm": 9.147072792053223, "learning_rate": 9.07563897864277e-06, "logits/chosen": 2.7562732696533203, "logits/rejected": 2.8055427074432373, "logps/chosen": -135.0802459716797, "logps/rejected": -154.70709228515625, "loss": 0.6194, "nll_loss": 0.8583150506019592, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -13.508025169372559, "rewards/margins": 1.9626835584640503, "rewards/rejected": -15.470707893371582, "step": 2500 }, { "epoch": 0.5666666666666667, "grad_norm": 46.41688537597656, "learning_rate": 9.018707471063206e-06, "logits/chosen": 2.752124547958374, "logits/rejected": 2.786952257156372, "logps/chosen": -143.7683563232422, "logps/rejected": -161.48851013183594, "loss": 0.8724, "nll_loss": 0.8703722357749939, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -14.376836776733398, "rewards/margins": 1.7720153331756592, "rewards/rejected": -16.14885139465332, "step": 2550 }, { "epoch": 0.5777777777777777, "grad_norm": 19.447980880737305, "learning_rate": 8.960264691956864e-06, "logits/chosen": 2.7894318103790283, "logits/rejected": 2.8483948707580566, "logps/chosen": -141.15562438964844, "logps/rejected": -158.61134338378906, "loss": 0.7643, "nll_loss": 0.8781980276107788, "rewards/accuracies": 0.6949999928474426, "rewards/chosen": -14.115564346313477, "rewards/margins": 1.7455706596374512, "rewards/rejected": -15.861133575439453, "step": 2600 }, { "epoch": 0.5888888888888889, "grad_norm": 16.72323226928711, "learning_rate": 8.900332619262834e-06, "logits/chosen": 2.8104143142700195, "logits/rejected": 2.882678747177124, "logps/chosen": -140.38174438476562, "logps/rejected": -160.63214111328125, "loss": 0.725, "nll_loss": 0.8860365152359009, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -14.038174629211426, "rewards/margins": 2.0250399112701416, "rewards/rejected": -16.063213348388672, "step": 2650 }, { "epoch": 0.6, "grad_norm": 3.247159957885742, "learning_rate": 8.838933790982612e-06, "logits/chosen": 2.8210253715515137, "logits/rejected": 2.8741745948791504, "logps/chosen": -132.52493286132812, "logps/rejected": -155.1892852783203, "loss": 0.5984, "nll_loss": 0.8526219129562378, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.252492904663086, "rewards/margins": 2.266435384750366, "rewards/rejected": -15.518929481506348, "step": 2700 }, { "epoch": 0.6111111111111112, "grad_norm": 65.30035400390625, "learning_rate": 8.776091296704487e-06, "logits/chosen": 2.758326768875122, "logits/rejected": 2.8138275146484375, "logps/chosen": -137.0075225830078, "logps/rejected": -159.10792541503906, "loss": 0.633, "nll_loss": 0.8668171167373657, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -13.700752258300781, "rewards/margins": 2.2100412845611572, "rewards/rejected": -15.910792350769043, "step": 2750 }, { "epoch": 0.6222222222222222, "grad_norm": 43.83423614501953, "learning_rate": 8.711828768920489e-06, "logits/chosen": 2.7353198528289795, "logits/rejected": 2.77675199508667, "logps/chosen": -138.0487060546875, "logps/rejected": -160.97042846679688, "loss": 0.7606, "nll_loss": 0.8759157657623291, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.804869651794434, "rewards/margins": 2.2921743392944336, "rewards/rejected": -16.097043991088867, "step": 2800 }, { "epoch": 0.6333333333333333, "grad_norm": 30.70613670349121, "learning_rate": 8.646170374139172e-06, "logits/chosen": 2.7556371688842773, "logits/rejected": 2.7869367599487305, "logps/chosen": -138.87327575683594, "logps/rejected": -161.50193786621094, "loss": 0.7397, "nll_loss": 0.889627993106842, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": -13.8873291015625, "rewards/margins": 2.262864828109741, "rewards/rejected": -16.150196075439453, "step": 2850 }, { "epoch": 0.6444444444444445, "grad_norm": 71.25941467285156, "learning_rate": 8.57914080379758e-06, "logits/chosen": 2.7243993282318115, "logits/rejected": 2.7958648204803467, "logps/chosen": -137.00030517578125, "logps/rejected": -166.46339416503906, "loss": 0.5952, "nll_loss": 0.8637996912002563, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -13.700030326843262, "rewards/margins": 2.9463095664978027, "rewards/rejected": -16.646339416503906, "step": 2900 }, { "epoch": 0.6555555555555556, "grad_norm": 2.572660207748413, "learning_rate": 8.510765264975813e-06, "logits/chosen": 2.697694778442383, "logits/rejected": 2.7674074172973633, "logps/chosen": -134.11878967285156, "logps/rejected": -158.1276397705078, "loss": 0.7655, "nll_loss": 0.8479865193367004, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": -13.411879539489746, "rewards/margins": 2.4008872509002686, "rewards/rejected": -15.812766075134277, "step": 2950 }, { "epoch": 0.6666666666666666, "grad_norm": 48.32857894897461, "learning_rate": 8.441069470917664e-06, "logits/chosen": 2.694932222366333, "logits/rejected": 2.7517261505126953, "logps/chosen": -134.69090270996094, "logps/rejected": -150.8953094482422, "loss": 0.7999, "nll_loss": 0.8715799450874329, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": -13.469090461730957, "rewards/margins": 1.6204394102096558, "rewards/rejected": -15.089529991149902, "step": 3000 }, { "epoch": 0.6777777777777778, "grad_norm": 50.64729690551758, "learning_rate": 8.370079631360931e-06, "logits/chosen": 2.689343214035034, "logits/rejected": 2.753621816635132, "logps/chosen": -136.48391723632812, "logps/rejected": -155.56744384765625, "loss": 0.7337, "nll_loss": 0.8551886677742004, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": -13.64838981628418, "rewards/margins": 1.9083553552627563, "rewards/rejected": -15.556746482849121, "step": 3050 }, { "epoch": 0.6888888888888889, "grad_norm": 33.486053466796875, "learning_rate": 8.297822442681e-06, "logits/chosen": 2.677144765853882, "logits/rejected": 2.741197109222412, "logps/chosen": -139.4444122314453, "logps/rejected": -163.82855224609375, "loss": 0.7451, "nll_loss": 0.8538349270820618, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -13.944441795349121, "rewards/margins": 2.438413143157959, "rewards/rejected": -16.382854461669922, "step": 3100 }, { "epoch": 0.7, "grad_norm": 8.641382217407227, "learning_rate": 8.224325077851429e-06, "logits/chosen": 2.652789831161499, "logits/rejected": 2.722318649291992, "logps/chosen": -138.2322235107422, "logps/rejected": -165.62013244628906, "loss": 0.5378, "nll_loss": 0.8370693325996399, "rewards/accuracies": 0.7900000214576721, "rewards/chosen": -13.823221206665039, "rewards/margins": 2.7387890815734863, "rewards/rejected": -16.56201171875, "step": 3150 }, { "epoch": 0.7111111111111111, "grad_norm": 16.82419776916504, "learning_rate": 8.14961517622531e-06, "logits/chosen": 2.539468765258789, "logits/rejected": 2.5970101356506348, "logps/chosen": -150.5495147705078, "logps/rejected": -172.2704315185547, "loss": 0.6274, "nll_loss": 0.8992826342582703, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -15.054952621459961, "rewards/margins": 2.1720917224884033, "rewards/rejected": -17.22704315185547, "step": 3200 }, { "epoch": 0.7222222222222222, "grad_norm": 72.15422821044922, "learning_rate": 8.073720833141234e-06, "logits/chosen": 2.5378000736236572, "logits/rejected": 2.593313694000244, "logps/chosen": -137.90753173828125, "logps/rejected": -161.35955810546875, "loss": 0.637, "nll_loss": 0.8648838996887207, "rewards/accuracies": 0.75, "rewards/chosen": -13.790754318237305, "rewards/margins": 2.345200777053833, "rewards/rejected": -16.135953903198242, "step": 3250 }, { "epoch": 0.7333333333333333, "grad_norm": 9.427265167236328, "learning_rate": 7.996670589357782e-06, "logits/chosen": 2.5012764930725098, "logits/rejected": 2.5697526931762695, "logps/chosen": -140.64195251464844, "logps/rejected": -172.3091583251953, "loss": 0.5146, "nll_loss": 0.878004789352417, "rewards/accuracies": 0.8100000023841858, "rewards/chosen": -14.064196586608887, "rewards/margins": 3.166719913482666, "rewards/rejected": -17.230915069580078, "step": 3300 }, { "epoch": 0.7444444444444445, "grad_norm": 2.961467981338501, "learning_rate": 7.918493420320518e-06, "logits/chosen": 2.459111452102661, "logits/rejected": 2.5447537899017334, "logps/chosen": -140.2351531982422, "logps/rejected": -173.6145477294922, "loss": 0.5418, "nll_loss": 0.8784846663475037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.023514747619629, "rewards/margins": 3.337939977645874, "rewards/rejected": -17.3614559173584, "step": 3350 }, { "epoch": 0.7555555555555555, "grad_norm": 27.382617950439453, "learning_rate": 7.839218725265507e-06, "logits/chosen": 2.460723638534546, "logits/rejected": 2.5178442001342773, "logps/chosen": -136.5524444580078, "logps/rejected": -165.16700744628906, "loss": 0.5596, "nll_loss": 0.884792149066925, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -13.655243873596191, "rewards/margins": 2.8614559173583984, "rewards/rejected": -16.516698837280273, "step": 3400 }, { "epoch": 0.7666666666666667, "grad_norm": 46.12528610229492, "learning_rate": 7.75887631616346e-06, "logits/chosen": 2.435918092727661, "logits/rejected": 2.48390531539917, "logps/chosen": -144.48410034179688, "logps/rejected": -171.04135131835938, "loss": 0.5937, "nll_loss": 0.9030132293701172, "rewards/accuracies": 0.75, "rewards/chosen": -14.448410987854004, "rewards/margins": 2.6557226181030273, "rewards/rejected": -17.10413360595703, "step": 3450 }, { "epoch": 0.7777777777777778, "grad_norm": 73.7654037475586, "learning_rate": 7.677496406508673e-06, "logits/chosen": 2.4464337825775146, "logits/rejected": 2.4917571544647217, "logps/chosen": -138.65414428710938, "logps/rejected": -163.29440307617188, "loss": 0.6605, "nll_loss": 0.8930404186248779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.8654146194458, "rewards/margins": 2.4640254974365234, "rewards/rejected": -16.32944107055664, "step": 3500 }, { "epoch": 0.7888888888888889, "grad_norm": 34.9301872253418, "learning_rate": 7.595109599956978e-06, "logits/chosen": 2.438520669937134, "logits/rejected": 2.5156054496765137, "logps/chosen": -142.00396728515625, "logps/rejected": -170.0238800048828, "loss": 0.4949, "nll_loss": 0.8846253156661987, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": -14.200397491455078, "rewards/margins": 2.801990032196045, "rewards/rejected": -17.002389907836914, "step": 3550 }, { "epoch": 0.8, "grad_norm": 59.70923614501953, "learning_rate": 7.511746878816944e-06, "logits/chosen": 2.409118413925171, "logits/rejected": 2.4729368686676025, "logps/chosen": -142.9822998046875, "logps/rejected": -170.6934356689453, "loss": 0.7865, "nll_loss": 0.8965829610824585, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -14.29823112487793, "rewards/margins": 2.7711145877838135, "rewards/rejected": -17.06934356689453, "step": 3600 }, { "epoch": 0.8111111111111111, "grad_norm": 46.28501510620117, "learning_rate": 7.427439592398707e-06, "logits/chosen": 2.4235284328460693, "logits/rejected": 2.4894955158233643, "logps/chosen": -141.8939971923828, "logps/rejected": -169.57406616210938, "loss": 0.6666, "nll_loss": 0.8588463068008423, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -14.189399719238281, "rewards/margins": 2.7680084705352783, "rewards/rejected": -16.957408905029297, "step": 3650 }, { "epoch": 0.8222222222222222, "grad_norm": 38.428951263427734, "learning_rate": 7.342219445224771e-06, "logits/chosen": 2.4041669368743896, "logits/rejected": 2.4712166786193848, "logps/chosen": -142.94776916503906, "logps/rejected": -177.90609741210938, "loss": 0.5616, "nll_loss": 0.8728026747703552, "rewards/accuracies": 0.8050000071525574, "rewards/chosen": -14.294777870178223, "rewards/margins": 3.4958336353302, "rewards/rejected": -17.790611267089844, "step": 3700 }, { "epoch": 0.8333333333333334, "grad_norm": 44.377906799316406, "learning_rate": 7.256118485107242e-06, "logits/chosen": 2.427722930908203, "logits/rejected": 2.4884796142578125, "logps/chosen": -146.18592834472656, "logps/rejected": -177.53652954101562, "loss": 0.6173, "nll_loss": 0.8623551726341248, "rewards/accuracies": 0.8100000023841858, "rewards/chosen": -14.6185941696167, "rewards/margins": 3.1350598335266113, "rewards/rejected": -17.75365447998047, "step": 3750 }, { "epoch": 0.8444444444444444, "grad_norm": 24.273250579833984, "learning_rate": 7.169169091095949e-06, "logits/chosen": 2.497875452041626, "logits/rejected": 2.5441315174102783, "logps/chosen": -137.83047485351562, "logps/rejected": -169.2275848388672, "loss": 0.5692, "nll_loss": 0.8750566244125366, "rewards/accuracies": 0.7649999856948853, "rewards/chosen": -13.783045768737793, "rewards/margins": 3.1397128105163574, "rewards/rejected": -16.922758102416992, "step": 3800 }, { "epoch": 0.8555555555555555, "grad_norm": 20.51363754272461, "learning_rate": 7.081403961302007e-06, "logits/chosen": 2.4951789379119873, "logits/rejected": 2.5337958335876465, "logps/chosen": -140.47337341308594, "logps/rejected": -170.82681274414062, "loss": 0.5461, "nll_loss": 0.8628194332122803, "rewards/accuracies": 0.8199999928474426, "rewards/chosen": -14.047338485717773, "rewards/margins": 3.035342216491699, "rewards/rejected": -17.082683563232422, "step": 3850 }, { "epoch": 0.8666666666666667, "grad_norm": 1.1138660907745361, "learning_rate": 6.9928561006014035e-06, "logits/chosen": 2.4680285453796387, "logits/rejected": 2.5492372512817383, "logps/chosen": -145.60736083984375, "logps/rejected": -175.27304077148438, "loss": 0.573, "nll_loss": 0.8965364098548889, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -14.560735702514648, "rewards/margins": 2.9665701389312744, "rewards/rejected": -17.527305603027344, "step": 3900 }, { "epoch": 0.8777777777777778, "grad_norm": 42.39490509033203, "learning_rate": 6.903558808223205e-06, "logits/chosen": 2.4792604446411133, "logits/rejected": 2.5629947185516357, "logps/chosen": -144.02230834960938, "logps/rejected": -176.77955627441406, "loss": 0.5887, "nll_loss": 0.8880175948143005, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -14.402230262756348, "rewards/margins": 3.2757256031036377, "rewards/rejected": -17.677955627441406, "step": 3950 }, { "epoch": 0.8888888888888888, "grad_norm": 25.261232376098633, "learning_rate": 6.813545665227086e-06, "logits/chosen": 2.475069284439087, "logits/rejected": 2.548849105834961, "logps/chosen": -149.9359588623047, "logps/rejected": -176.93045043945312, "loss": 0.7649, "nll_loss": 0.9091718196868896, "rewards/accuracies": 0.7049999833106995, "rewards/chosen": -14.993595123291016, "rewards/margins": 2.6994497776031494, "rewards/rejected": -17.693042755126953, "step": 4000 }, { "epoch": 0.9, "grad_norm": 17.662899017333984, "learning_rate": 6.7228505218748555e-06, "logits/chosen": 2.539234161376953, "logits/rejected": 2.605987548828125, "logps/chosen": -144.09429931640625, "logps/rejected": -181.97972106933594, "loss": 0.5815, "nll_loss": 0.9110980033874512, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -14.409429550170898, "rewards/margins": 3.788541555404663, "rewards/rejected": -18.197973251342773, "step": 4050 }, { "epoch": 0.9111111111111111, "grad_norm": 25.98221778869629, "learning_rate": 6.631507484900771e-06, "logits/chosen": 2.529738426208496, "logits/rejected": 2.5699193477630615, "logps/chosen": -146.29800415039062, "logps/rejected": -180.32785034179688, "loss": 0.5622, "nll_loss": 0.8859230875968933, "rewards/accuracies": 0.8050000071525574, "rewards/chosen": -14.629799842834473, "rewards/margins": 3.4029860496520996, "rewards/rejected": -18.032785415649414, "step": 4100 }, { "epoch": 0.9222222222222223, "grad_norm": 17.0459041595459, "learning_rate": 6.53955090468538e-06, "logits/chosen": 2.4142448902130127, "logits/rejected": 2.4958066940307617, "logps/chosen": -143.1546630859375, "logps/rejected": -178.91212463378906, "loss": 0.4652, "nll_loss": 0.8616129159927368, "rewards/accuracies": 0.8199999928474426, "rewards/chosen": -14.315465927124023, "rewards/margins": 3.575746536254883, "rewards/rejected": -17.891212463378906, "step": 4150 }, { "epoch": 0.9333333333333333, "grad_norm": 57.73278045654297, "learning_rate": 6.447015362337758e-06, "logits/chosen": 2.403080940246582, "logits/rejected": 2.448233127593994, "logps/chosen": -149.53823852539062, "logps/rejected": -179.0576934814453, "loss": 0.5992, "nll_loss": 0.9083038568496704, "rewards/accuracies": 0.7950000166893005, "rewards/chosen": -14.953824043273926, "rewards/margins": 2.9519457817077637, "rewards/rejected": -17.90576934814453, "step": 4200 }, { "epoch": 0.9444444444444444, "grad_norm": 56.34907913208008, "learning_rate": 6.3539356566909485e-06, "logits/chosen": 2.3669228553771973, "logits/rejected": 2.445305109024048, "logps/chosen": -143.73602294921875, "logps/rejected": -175.57749938964844, "loss": 0.5794, "nll_loss": 0.8900047540664673, "rewards/accuracies": 0.7799999713897705, "rewards/chosen": -14.373603820800781, "rewards/margins": 3.1841464042663574, "rewards/rejected": -17.557750701904297, "step": 4250 }, { "epoch": 0.9555555555555556, "grad_norm": 30.58467674255371, "learning_rate": 6.26034679121557e-06, "logits/chosen": 2.329667329788208, "logits/rejected": 2.4006171226501465, "logps/chosen": -143.96170043945312, "logps/rejected": -180.5399169921875, "loss": 0.5348, "nll_loss": 0.8790844082832336, "rewards/accuracies": 0.7850000262260437, "rewards/chosen": -14.396170616149902, "rewards/margins": 3.6578216552734375, "rewards/rejected": -18.053993225097656, "step": 4300 }, { "epoch": 0.9666666666666667, "grad_norm": 61.15194320678711, "learning_rate": 6.16628396085642e-06, "logits/chosen": 2.2458531856536865, "logits/rejected": 2.288853883743286, "logps/chosen": -137.1383056640625, "logps/rejected": -164.68406677246094, "loss": 0.5668, "nll_loss": 0.8777892589569092, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.713830947875977, "rewards/margins": 2.7545764446258545, "rewards/rejected": -16.468406677246094, "step": 4350 }, { "epoch": 0.9777777777777777, "grad_norm": 23.967437744140625, "learning_rate": 6.071782538797112e-06, "logits/chosen": 2.1951770782470703, "logits/rejected": 2.296672821044922, "logps/chosen": -136.63027954101562, "logps/rejected": -178.8312530517578, "loss": 0.4923, "nll_loss": 0.8543100953102112, "rewards/accuracies": 0.8149999976158142, "rewards/chosen": -13.6630277633667, "rewards/margins": 4.22009801864624, "rewards/rejected": -17.88312339782715, "step": 4400 }, { "epoch": 0.9888888888888889, "grad_norm": 65.98208618164062, "learning_rate": 5.976878063157653e-06, "logits/chosen": 2.1789865493774414, "logits/rejected": 2.229640007019043, "logps/chosen": -141.5849151611328, "logps/rejected": -166.90452575683594, "loss": 0.647, "nll_loss": 0.8632768392562866, "rewards/accuracies": 0.75, "rewards/chosen": -14.158493995666504, "rewards/margins": 2.5319626331329346, "rewards/rejected": -16.690454483032227, "step": 4450 }, { "epoch": 1.0, "grad_norm": 43.130558013916016, "learning_rate": 5.881606223630029e-06, "logits/chosen": 2.1766304969787598, "logits/rejected": 2.2443294525146484, "logps/chosen": -139.3057098388672, "logps/rejected": -169.23995971679688, "loss": 0.5879, "nll_loss": 0.865101158618927, "rewards/accuracies": 0.7599999904632568, "rewards/chosen": -13.930569648742676, "rewards/margins": 2.9934256076812744, "rewards/rejected": -16.923995971679688, "step": 4500 }, { "epoch": 1.011111111111111, "grad_norm": 14.926958084106445, "learning_rate": 5.7860028480567465e-06, "logits/chosen": 2.163480043411255, "logits/rejected": 2.2339670658111572, "logps/chosen": -134.709716796875, "logps/rejected": -176.6461944580078, "loss": 0.2833, "nll_loss": 0.8595601916313171, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.470972061157227, "rewards/margins": 4.193648338317871, "rewards/rejected": -17.66461944580078, "step": 4550 }, { "epoch": 1.0222222222222221, "grad_norm": 24.16767120361328, "learning_rate": 5.690103888957473e-06, "logits/chosen": 2.0230038166046143, "logits/rejected": 2.0895702838897705, "logps/chosen": -132.3488311767578, "logps/rejected": -175.47560119628906, "loss": 0.287, "nll_loss": 0.8323882222175598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.234881401062012, "rewards/margins": 4.312678337097168, "rewards/rejected": -17.54755973815918, "step": 4600 }, { "epoch": 1.0333333333333334, "grad_norm": 1.5205551385879517, "learning_rate": 5.593945410008742e-06, "logits/chosen": 1.9669561386108398, "logits/rejected": 2.0402841567993164, "logps/chosen": -133.15602111816406, "logps/rejected": -176.46730041503906, "loss": 0.2592, "nll_loss": 0.8528892397880554, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.315601348876953, "rewards/margins": 4.331127166748047, "rewards/rejected": -17.646730422973633, "step": 4650 }, { "epoch": 1.0444444444444445, "grad_norm": 13.740782737731934, "learning_rate": 5.497563572481896e-06, "logits/chosen": 1.963524580001831, "logits/rejected": 2.0396649837493896, "logps/chosen": -141.27101135253906, "logps/rejected": -190.2046356201172, "loss": 0.2665, "nll_loss": 0.8767533898353577, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.12710189819336, "rewards/margins": 4.893362045288086, "rewards/rejected": -19.020465850830078, "step": 4700 }, { "epoch": 1.0555555555555556, "grad_norm": 37.8596076965332, "learning_rate": 5.400994621644294e-06, "logits/chosen": 1.9676355123519897, "logits/rejected": 2.012232780456543, "logps/chosen": -137.65011596679688, "logps/rejected": -187.96646118164062, "loss": 0.2824, "nll_loss": 0.8659433126449585, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": -13.765010833740234, "rewards/margins": 5.0316362380981445, "rewards/rejected": -18.796648025512695, "step": 4750 }, { "epoch": 1.0666666666666667, "grad_norm": 12.414517402648926, "learning_rate": 5.304274873128974e-06, "logits/chosen": 1.9294854402542114, "logits/rejected": 1.9716670513153076, "logps/chosen": -139.77052307128906, "logps/rejected": -189.33340454101562, "loss": 0.2764, "nll_loss": 0.8927189111709595, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -13.977051734924316, "rewards/margins": 4.956289291381836, "rewards/rejected": -18.933340072631836, "step": 4800 }, { "epoch": 1.0777777777777777, "grad_norm": 1.966835379600525, "learning_rate": 5.207440699277798e-06, "logits/chosen": 1.948458194732666, "logits/rejected": 2.0137341022491455, "logps/chosen": -140.36215209960938, "logps/rejected": -186.22996520996094, "loss": 0.2986, "nll_loss": 0.8593480587005615, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -14.036214828491211, "rewards/margins": 4.5867815017700195, "rewards/rejected": -18.622995376586914, "step": 4850 }, { "epoch": 1.0888888888888888, "grad_norm": 15.687790870666504, "learning_rate": 5.1105285154633285e-06, "logits/chosen": 1.8804965019226074, "logits/rejected": 1.9492509365081787, "logps/chosen": -139.7319793701172, "logps/rejected": -180.37353515625, "loss": 0.3067, "nll_loss": 0.8739662766456604, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": -13.973197937011719, "rewards/margins": 4.064157009124756, "rewards/rejected": -18.037355422973633, "step": 4900 }, { "epoch": 1.1, "grad_norm": 2.446437120437622, "learning_rate": 5.0135747663944775e-06, "logits/chosen": 1.8845734596252441, "logits/rejected": 1.9228339195251465, "logps/chosen": -139.14076232910156, "logps/rejected": -188.3610382080078, "loss": 0.2484, "nll_loss": 0.8340281844139099, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -13.91407585144043, "rewards/margins": 4.922028064727783, "rewards/rejected": -18.836103439331055, "step": 4950 }, { "epoch": 1.1111111111111112, "grad_norm": 2.8917882442474365, "learning_rate": 4.916615912411151e-06, "logits/chosen": 1.8671766519546509, "logits/rejected": 1.9540256261825562, "logps/chosen": -142.6185302734375, "logps/rejected": -185.75514221191406, "loss": 0.3711, "nll_loss": 0.8754190802574158, "rewards/accuracies": 0.875, "rewards/chosen": -14.261853218078613, "rewards/margins": 4.313661575317383, "rewards/rejected": -18.575515747070312, "step": 5000 }, { "epoch": 1.1222222222222222, "grad_norm": 26.97482681274414, "learning_rate": 4.819688415773009e-06, "logits/chosen": 1.8861976861953735, "logits/rejected": 1.9495391845703125, "logps/chosen": -151.09506225585938, "logps/rejected": -199.3642120361328, "loss": 0.3729, "nll_loss": 0.9094197750091553, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": -15.109506607055664, "rewards/margins": 4.826915740966797, "rewards/rejected": -19.936424255371094, "step": 5050 }, { "epoch": 1.1333333333333333, "grad_norm": 25.189985275268555, "learning_rate": 4.722828726947493e-06, "logits/chosen": 1.8727545738220215, "logits/rejected": 1.9451488256454468, "logps/chosen": -138.751220703125, "logps/rejected": -189.18695068359375, "loss": 0.2991, "nll_loss": 0.8500177264213562, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": -13.8751220703125, "rewards/margins": 5.043573379516602, "rewards/rejected": -18.9186954498291, "step": 5100 }, { "epoch": 1.1444444444444444, "grad_norm": 16.11264419555664, "learning_rate": 4.626073270902295e-06, "logits/chosen": 1.8480844497680664, "logits/rejected": 1.892039179801941, "logps/chosen": -139.97703552246094, "logps/rejected": -186.30279541015625, "loss": 0.2919, "nll_loss": 0.8379194140434265, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": -13.997703552246094, "rewards/margins": 4.6325764656066895, "rewards/rejected": -18.630279541015625, "step": 5150 }, { "epoch": 1.1555555555555554, "grad_norm": 42.09729766845703, "learning_rate": 4.529458433407429e-06, "logits/chosen": 1.8711543083190918, "logits/rejected": 1.9306063652038574, "logps/chosen": -135.43576049804688, "logps/rejected": -184.95187377929688, "loss": 0.2737, "nll_loss": 0.8510028123855591, "rewards/accuracies": 0.9350000023841858, "rewards/chosen": -13.543575286865234, "rewards/margins": 4.951611518859863, "rewards/rejected": -18.49518585205078, "step": 5200 }, { "epoch": 1.1666666666666667, "grad_norm": 15.656298637390137, "learning_rate": 4.434947336255132e-06, "logits/chosen": 1.8329623937606812, "logits/rejected": 1.8964951038360596, "logps/chosen": -145.45535278320312, "logps/rejected": -198.4934539794922, "loss": 0.3096, "nll_loss": 0.8971713781356812, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -14.54553508758545, "rewards/margins": 5.303811550140381, "rewards/rejected": -19.849348068237305, "step": 5250 }, { "epoch": 1.1777777777777778, "grad_norm": 26.11250877380371, "learning_rate": 4.3387180487143875e-06, "logits/chosen": 1.8482311964035034, "logits/rejected": 1.9194817543029785, "logps/chosen": -137.0039520263672, "logps/rejected": -183.7087860107422, "loss": 0.3277, "nll_loss": 0.8674495816230774, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -13.700395584106445, "rewards/margins": 4.670485019683838, "rewards/rejected": -18.370880126953125, "step": 5300 }, { "epoch": 1.1888888888888889, "grad_norm": 13.96750259399414, "learning_rate": 4.242737442271074e-06, "logits/chosen": 1.8494229316711426, "logits/rejected": 1.8867504596710205, "logps/chosen": -131.7822723388672, "logps/rejected": -174.9169921875, "loss": 0.3183, "nll_loss": 0.8361443877220154, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -13.178228378295898, "rewards/margins": 4.313467979431152, "rewards/rejected": -17.491695404052734, "step": 5350 }, { "epoch": 1.2, "grad_norm": 153.80136108398438, "learning_rate": 4.147041611305952e-06, "logits/chosen": 1.8332164287567139, "logits/rejected": 1.8641010522842407, "logps/chosen": -139.6233367919922, "logps/rejected": -191.58448791503906, "loss": 0.3024, "nll_loss": 0.8834293484687805, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -13.962336540222168, "rewards/margins": 5.196118354797363, "rewards/rejected": -19.1584529876709, "step": 5400 }, { "epoch": 1.211111111111111, "grad_norm": 2.159090042114258, "learning_rate": 4.051666543107377e-06, "logits/chosen": 1.7931147813796997, "logits/rejected": 1.8343441486358643, "logps/chosen": -146.0767822265625, "logps/rejected": -188.94058227539062, "loss": 0.4133, "nll_loss": 0.893923819065094, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.607678413391113, "rewards/margins": 4.286380767822266, "rewards/rejected": -18.894058227539062, "step": 5450 }, { "epoch": 1.2222222222222223, "grad_norm": 49.63187026977539, "learning_rate": 3.956648104337942e-06, "logits/chosen": 1.7636798620224, "logits/rejected": 1.8163806200027466, "logps/chosen": -139.93312072753906, "logps/rejected": -185.8674774169922, "loss": 0.3865, "nll_loss": 0.8775654435157776, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -13.993311882019043, "rewards/margins": 4.593437671661377, "rewards/rejected": -18.586748123168945, "step": 5500 }, { "epoch": 1.2333333333333334, "grad_norm": 1.0250152349472046, "learning_rate": 3.8620220275465014e-06, "logits/chosen": 1.7448630332946777, "logits/rejected": 1.7891240119934082, "logps/chosen": -133.88626098632812, "logps/rejected": -181.3544921875, "loss": 0.3238, "nll_loss": 0.8678062558174133, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": -13.388625144958496, "rewards/margins": 4.746823787689209, "rewards/rejected": -18.135448455810547, "step": 5550 }, { "epoch": 1.2444444444444445, "grad_norm": 14.044897079467773, "learning_rate": 3.767823897730612e-06, "logits/chosen": 1.781459093093872, "logits/rejected": 1.8279938697814941, "logps/chosen": -139.7862548828125, "logps/rejected": -188.6760711669922, "loss": 0.292, "nll_loss": 0.8843945860862732, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -13.97862434387207, "rewards/margins": 4.888982772827148, "rewards/rejected": -18.86760902404785, "step": 5600 }, { "epoch": 1.2555555555555555, "grad_norm": 2.254403591156006, "learning_rate": 3.6740891389544764e-06, "logits/chosen": 1.747799038887024, "logits/rejected": 1.8126707077026367, "logps/chosen": -132.54306030273438, "logps/rejected": -183.2822265625, "loss": 0.267, "nll_loss": 0.813062310218811, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -13.25430679321289, "rewards/margins": 5.073916912078857, "rewards/rejected": -18.328224182128906, "step": 5650 }, { "epoch": 1.2666666666666666, "grad_norm": 2.6067352294921875, "learning_rate": 3.580853001027399e-06, "logits/chosen": 1.775789499282837, "logits/rejected": 1.8279727697372437, "logps/chosen": -135.30917358398438, "logps/rejected": -182.02606201171875, "loss": 0.2933, "nll_loss": 0.8531976938247681, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -13.53091812133789, "rewards/margins": 4.671688079833984, "rewards/rejected": -18.202608108520508, "step": 5700 }, { "epoch": 1.2777777777777777, "grad_norm": 2.927811622619629, "learning_rate": 3.488150546247778e-06, "logits/chosen": 1.762459397315979, "logits/rejected": 1.8294439315795898, "logps/chosen": -138.41099548339844, "logps/rejected": -186.82769775390625, "loss": 0.3345, "nll_loss": 0.8665143847465515, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -13.841099739074707, "rewards/margins": 4.841670513153076, "rewards/rejected": -18.682769775390625, "step": 5750 }, { "epoch": 1.2888888888888888, "grad_norm": 7.556782245635986, "learning_rate": 3.396016636217601e-06, "logits/chosen": 1.7123894691467285, "logits/rejected": 1.7558408975601196, "logps/chosen": -135.84544372558594, "logps/rejected": -183.96510314941406, "loss": 0.2978, "nll_loss": 0.8334808349609375, "rewards/accuracies": 0.9049999713897705, "rewards/chosen": -13.584543228149414, "rewards/margins": 4.811965465545654, "rewards/rejected": -18.396509170532227, "step": 5800 }, { "epoch": 1.3, "grad_norm": 18.42522430419922, "learning_rate": 3.304485918732431e-06, "logits/chosen": 1.744448184967041, "logits/rejected": 1.7827659845352173, "logps/chosen": -133.29783630371094, "logps/rejected": -175.83837890625, "loss": 0.2692, "nll_loss": 0.8538379669189453, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.329782485961914, "rewards/margins": 4.254055976867676, "rewards/rejected": -17.583837509155273, "step": 5850 }, { "epoch": 1.3111111111111111, "grad_norm": 17.906837463378906, "learning_rate": 3.2135928147517803e-06, "logits/chosen": 1.7398605346679688, "logits/rejected": 1.7756789922714233, "logps/chosen": -125.5423812866211, "logps/rejected": -173.01748657226562, "loss": 0.252, "nll_loss": 0.8250210285186768, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.554238319396973, "rewards/margins": 4.747509002685547, "rewards/rejected": -17.301746368408203, "step": 5900 }, { "epoch": 1.3222222222222222, "grad_norm": 48.91613006591797, "learning_rate": 3.123371505454804e-06, "logits/chosen": 1.7379705905914307, "logits/rejected": 1.7974755764007568, "logps/chosen": -144.49952697753906, "logps/rejected": -195.16482543945312, "loss": 0.2891, "nll_loss": 0.8644669055938721, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -14.44995403289795, "rewards/margins": 5.066527843475342, "rewards/rejected": -19.516481399536133, "step": 5950 }, { "epoch": 1.3333333333333333, "grad_norm": 54.523921966552734, "learning_rate": 3.0338559193861434e-06, "logits/chosen": 1.772323489189148, "logits/rejected": 1.800360918045044, "logps/chosen": -138.51698303222656, "logps/rejected": -184.87704467773438, "loss": 0.2834, "nll_loss": 0.8716569542884827, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -13.851698875427246, "rewards/margins": 4.636006832122803, "rewards/rejected": -18.48770523071289, "step": 6000 }, { "epoch": 1.3444444444444446, "grad_norm": 1.0145766735076904, "learning_rate": 2.945079719696802e-06, "logits/chosen": 1.7544074058532715, "logits/rejected": 1.8119055032730103, "logps/chosen": -143.91510009765625, "logps/rejected": -196.64031982421875, "loss": 0.3148, "nll_loss": 0.8662161231040955, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": -14.391510009765625, "rewards/margins": 5.272523403167725, "rewards/rejected": -19.664031982421875, "step": 6050 }, { "epoch": 1.3555555555555556, "grad_norm": 68.27946472167969, "learning_rate": 2.8570762914848016e-06, "logits/chosen": 1.7976576089859009, "logits/rejected": 1.8383375406265259, "logps/chosen": -135.21192932128906, "logps/rejected": -185.3159637451172, "loss": 0.3196, "nll_loss": 0.8509930372238159, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": -13.521193504333496, "rewards/margins": 5.010400295257568, "rewards/rejected": -18.53159523010254, "step": 6100 }, { "epoch": 1.3666666666666667, "grad_norm": 84.2784194946289, "learning_rate": 2.769878729240419e-06, "logits/chosen": 1.7699062824249268, "logits/rejected": 1.8263474702835083, "logps/chosen": -139.66969299316406, "logps/rejected": -180.58229064941406, "loss": 0.3908, "nll_loss": 0.8314890265464783, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": -13.966970443725586, "rewards/margins": 4.0912580490112305, "rewards/rejected": -18.0582275390625, "step": 6150 }, { "epoch": 1.3777777777777778, "grad_norm": 9.088055610656738, "learning_rate": 2.683519824400693e-06, "logits/chosen": 1.8017587661743164, "logits/rejected": 1.8501276969909668, "logps/chosen": -139.66635131835938, "logps/rejected": -182.09388732910156, "loss": 0.3253, "nll_loss": 0.8349488973617554, "rewards/accuracies": 0.9049999713897705, "rewards/chosen": -13.966635704040527, "rewards/margins": 4.242753505706787, "rewards/rejected": -18.209388732910156, "step": 6200 }, { "epoch": 1.3888888888888888, "grad_norm": 1.069444179534912, "learning_rate": 2.5980320530179114e-06, "logits/chosen": 1.7473176717758179, "logits/rejected": 1.7838788032531738, "logps/chosen": -137.1415557861328, "logps/rejected": -193.8188934326172, "loss": 0.2448, "nll_loss": 0.8268473744392395, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -13.714155197143555, "rewards/margins": 5.667733669281006, "rewards/rejected": -19.38188934326172, "step": 6250 }, { "epoch": 1.4, "grad_norm": 33.168540954589844, "learning_rate": 2.5134475635467003e-06, "logits/chosen": 1.8181475400924683, "logits/rejected": 1.8965667486190796, "logps/chosen": -126.3827896118164, "logps/rejected": -179.6707000732422, "loss": 0.2385, "nll_loss": 0.8282772898674011, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -12.63827896118164, "rewards/margins": 5.328790664672852, "rewards/rejected": -17.967069625854492, "step": 6300 }, { "epoch": 1.411111111111111, "grad_norm": 7.906198501586914, "learning_rate": 2.429798164754299e-06, "logits/chosen": 1.8055589199066162, "logits/rejected": 1.8726186752319336, "logps/chosen": -135.39552307128906, "logps/rejected": -196.7256622314453, "loss": 0.1914, "nll_loss": 0.8056724071502686, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -13.539551734924316, "rewards/margins": 6.1330132484436035, "rewards/rejected": -19.672565460205078, "step": 6350 }, { "epoch": 1.4222222222222223, "grad_norm": 107.71742248535156, "learning_rate": 2.3471153137585823e-06, "logits/chosen": 1.8103935718536377, "logits/rejected": 1.8808395862579346, "logps/chosen": -130.2891082763672, "logps/rejected": -185.3134307861328, "loss": 0.228, "nll_loss": 0.8490158319473267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.028907775878906, "rewards/margins": 5.502434730529785, "rewards/rejected": -18.531343460083008, "step": 6400 }, { "epoch": 1.4333333333333333, "grad_norm": 19.482093811035156, "learning_rate": 2.2654301041983267e-06, "logits/chosen": 1.8293110132217407, "logits/rejected": 1.8667221069335938, "logps/chosen": -136.7456512451172, "logps/rejected": -185.40151977539062, "loss": 0.2757, "nll_loss": 0.8289456367492676, "rewards/accuracies": 0.9350000023841858, "rewards/chosen": -13.67456340789795, "rewards/margins": 4.865589141845703, "rewards/rejected": -18.540151596069336, "step": 6450 }, { "epoch": 1.4444444444444444, "grad_norm": 16.981212615966797, "learning_rate": 2.184773254540169e-06, "logits/chosen": 1.8285144567489624, "logits/rejected": 1.880581021308899, "logps/chosen": -132.76669311523438, "logps/rejected": -179.2393798828125, "loss": 0.2871, "nll_loss": 0.8304244875907898, "rewards/accuracies": 0.9049999713897705, "rewards/chosen": -13.276671409606934, "rewards/margins": 4.647269248962402, "rewards/rejected": -17.923938751220703, "step": 6500 }, { "epoch": 1.4555555555555555, "grad_norm": 37.01171112060547, "learning_rate": 2.105175096526645e-06, "logits/chosen": 1.8807213306427002, "logits/rejected": 1.9186040163040161, "logps/chosen": -136.37696838378906, "logps/rejected": -184.046875, "loss": 0.2503, "nll_loss": 0.8423886895179749, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.63769817352295, "rewards/margins": 4.766992092132568, "rewards/rejected": -18.40468978881836, "step": 6550 }, { "epoch": 1.4666666666666668, "grad_norm": 6.618880271911621, "learning_rate": 2.026665563769655e-06, "logits/chosen": 1.8343521356582642, "logits/rejected": 1.8843910694122314, "logps/chosen": -138.26934814453125, "logps/rejected": -187.3291015625, "loss": 0.2373, "nll_loss": 0.8616310358047485, "rewards/accuracies": 0.9350000023841858, "rewards/chosen": -13.826935768127441, "rewards/margins": 4.905975341796875, "rewards/rejected": -18.732912063598633, "step": 6600 }, { "epoch": 1.4777777777777779, "grad_norm": 12.195965766906738, "learning_rate": 1.9492741804936623e-06, "logits/chosen": 1.8284790515899658, "logits/rejected": 1.8807638883590698, "logps/chosen": -131.57156372070312, "logps/rejected": -187.79971313476562, "loss": 0.2616, "nll_loss": 0.8129904270172119, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -13.157155990600586, "rewards/margins": 5.6228156089782715, "rewards/rejected": -18.779972076416016, "step": 6650 }, { "epoch": 1.488888888888889, "grad_norm": 7.610342502593994, "learning_rate": 1.8730300504328436e-06, "logits/chosen": 1.8161823749542236, "logits/rejected": 1.8631372451782227, "logps/chosen": -133.1810302734375, "logps/rejected": -181.60552978515625, "loss": 0.2399, "nll_loss": 0.8207729458808899, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -13.318105697631836, "rewards/margins": 4.8424482345581055, "rewards/rejected": -18.160554885864258, "step": 6700 }, { "epoch": 1.5, "grad_norm": 32.45072555541992, "learning_rate": 1.7979618458863606e-06, "logits/chosen": 1.8359429836273193, "logits/rejected": 1.907293438911438, "logps/chosen": -131.9154815673828, "logps/rejected": -187.42514038085938, "loss": 0.2183, "nll_loss": 0.8129041790962219, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -13.191550254821777, "rewards/margins": 5.550965785980225, "rewards/rejected": -18.742515563964844, "step": 6750 }, { "epoch": 1.511111111111111, "grad_norm": 22.23114585876465, "learning_rate": 1.7240977969358757e-06, "logits/chosen": 1.830859661102295, "logits/rejected": 1.8783316612243652, "logps/chosen": -132.79261779785156, "logps/rejected": -184.66763305664062, "loss": 0.2284, "nll_loss": 0.8487753868103027, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -13.279261589050293, "rewards/margins": 5.187502384185791, "rewards/rejected": -18.466764450073242, "step": 6800 }, { "epoch": 1.5222222222222221, "grad_norm": 6.155124664306641, "learning_rate": 1.6514656808293806e-06, "logits/chosen": 1.828493356704712, "logits/rejected": 1.8834805488586426, "logps/chosen": -139.61106872558594, "logps/rejected": -192.17010498046875, "loss": 0.2673, "nll_loss": 0.8353109955787659, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -13.96110725402832, "rewards/margins": 5.255902290344238, "rewards/rejected": -19.217008590698242, "step": 6850 }, { "epoch": 1.5333333333333332, "grad_norm": 3.335826873779297, "learning_rate": 1.580092811535308e-06, "logits/chosen": 1.8342680931091309, "logits/rejected": 1.8519172668457031, "logps/chosen": -129.4978790283203, "logps/rejected": -181.4263458251953, "loss": 0.231, "nll_loss": 0.8161830306053162, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -12.949789047241211, "rewards/margins": 5.192845344543457, "rewards/rejected": -18.14263343811035, "step": 6900 }, { "epoch": 1.5444444444444443, "grad_norm": 7.718993186950684, "learning_rate": 1.5100060294708647e-06, "logits/chosen": 1.8288401365280151, "logits/rejected": 1.8857090473175049, "logps/chosen": -132.69618225097656, "logps/rejected": -189.85968017578125, "loss": 0.2398, "nll_loss": 0.8371846675872803, "rewards/accuracies": 0.9350000023841858, "rewards/chosen": -13.269618034362793, "rewards/margins": 5.716350078582764, "rewards/rejected": -18.98596954345703, "step": 6950 }, { "epoch": 1.5555555555555556, "grad_norm": 1.7476555109024048, "learning_rate": 1.441231691408444e-06, "logits/chosen": 1.8012443780899048, "logits/rejected": 1.8418114185333252, "logps/chosen": -127.21989440917969, "logps/rejected": -177.24798583984375, "loss": 0.2335, "nll_loss": 0.807786226272583, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -12.721989631652832, "rewards/margins": 5.002810478210449, "rewards/rejected": -17.72480010986328, "step": 7000 }, { "epoch": 1.5666666666666667, "grad_norm": 20.87301254272461, "learning_rate": 1.3737956605639257e-06, "logits/chosen": 1.7706668376922607, "logits/rejected": 1.814083218574524, "logps/chosen": -132.85018920898438, "logps/rejected": -185.0130615234375, "loss": 0.2207, "nll_loss": 0.8018437027931213, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.285017967224121, "rewards/margins": 5.216287136077881, "rewards/rejected": -18.501306533813477, "step": 7050 }, { "epoch": 1.5777777777777777, "grad_norm": 1.5373958349227905, "learning_rate": 1.3077232968705805e-06, "logits/chosen": 1.7630535364151, "logits/rejected": 1.8269288539886475, "logps/chosen": -134.29409790039062, "logps/rejected": -183.10568237304688, "loss": 0.2557, "nll_loss": 0.8308072090148926, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.429411888122559, "rewards/margins": 4.881157398223877, "rewards/rejected": -18.31056785583496, "step": 7100 }, { "epoch": 1.588888888888889, "grad_norm": 14.283437728881836, "learning_rate": 1.243039447442233e-06, "logits/chosen": 1.7852157354354858, "logits/rejected": 1.8574109077453613, "logps/chosen": -133.07196044921875, "logps/rejected": -186.60089111328125, "loss": 0.2501, "nll_loss": 0.8370758891105652, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -13.307194709777832, "rewards/margins": 5.352894306182861, "rewards/rejected": -18.66008949279785, "step": 7150 }, { "epoch": 1.6, "grad_norm": 0.7173064351081848, "learning_rate": 1.1797684372292762e-06, "logits/chosen": 1.7764126062393188, "logits/rejected": 1.8252061605453491, "logps/chosen": -125.63392639160156, "logps/rejected": -180.2544403076172, "loss": 0.1836, "nll_loss": 0.8086937069892883, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -12.563390731811523, "rewards/margins": 5.462050914764404, "rewards/rejected": -18.02544403076172, "step": 7200 }, { "epoch": 1.6111111111111112, "grad_norm": 47.59480285644531, "learning_rate": 1.1179340598710547e-06, "logits/chosen": 1.7794564962387085, "logits/rejected": 1.8320943117141724, "logps/chosen": -129.47637939453125, "logps/rejected": -181.6626434326172, "loss": 0.2172, "nll_loss": 0.81744784116745, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -12.947639465332031, "rewards/margins": 5.218626499176025, "rewards/rejected": -18.1662654876709, "step": 7250 }, { "epoch": 1.6222222222222222, "grad_norm": 6.80866003036499, "learning_rate": 1.057559568748055e-06, "logits/chosen": 1.7940146923065186, "logits/rejected": 1.8295354843139648, "logps/chosen": -128.23739624023438, "logps/rejected": -183.8601531982422, "loss": 0.2243, "nll_loss": 0.8128785490989685, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -12.823739051818848, "rewards/margins": 5.562277793884277, "rewards/rejected": -18.386018753051758, "step": 7300 }, { "epoch": 1.6333333333333333, "grad_norm": 9.21158504486084, "learning_rate": 9.986676682372536e-07, "logits/chosen": 1.7984369993209839, "logits/rejected": 1.8172439336776733, "logps/chosen": -130.9366455078125, "logps/rejected": -186.3476104736328, "loss": 0.2484, "nll_loss": 0.8381571769714355, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -13.093664169311523, "rewards/margins": 5.541100025177002, "rewards/rejected": -18.634765625, "step": 7350 }, { "epoch": 1.6444444444444444, "grad_norm": 3.487398147583008, "learning_rate": 9.412805051739266e-07, "logits/chosen": 1.7885620594024658, "logits/rejected": 1.8500103950500488, "logps/chosen": -128.86256408691406, "logps/rejected": -185.11749267578125, "loss": 0.2083, "nll_loss": 0.8109752535820007, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.88625717163086, "rewards/margins": 5.625493049621582, "rewards/rejected": -18.511751174926758, "step": 7400 }, { "epoch": 1.6555555555555554, "grad_norm": 0.8990471959114075, "learning_rate": 8.854196605231408e-07, "logits/chosen": 1.7629153728485107, "logits/rejected": 1.826473355293274, "logps/chosen": -129.2388916015625, "logps/rejected": -183.47540283203125, "loss": 0.2274, "nll_loss": 0.8169078826904297, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -12.92388916015625, "rewards/margins": 5.423650741577148, "rewards/rejected": -18.3475399017334, "step": 7450 }, { "epoch": 1.6666666666666665, "grad_norm": 10.153523445129395, "learning_rate": 8.311061412640287e-07, "logits/chosen": 1.7779765129089355, "logits/rejected": 1.8294556140899658, "logps/chosen": -129.52830505371094, "logps/rejected": -176.06625366210938, "loss": 0.2234, "nll_loss": 0.8412315845489502, "rewards/accuracies": 0.9649999737739563, "rewards/chosen": -12.952829360961914, "rewards/margins": 4.653792381286621, "rewards/rejected": -17.60662269592285, "step": 7500 }, { "epoch": 1.6777777777777778, "grad_norm": 53.166969299316406, "learning_rate": 7.783603724899258e-07, "logits/chosen": 1.7453587055206299, "logits/rejected": 1.8029614686965942, "logps/chosen": -128.65765380859375, "logps/rejected": -183.56346130371094, "loss": 0.2068, "nll_loss": 0.8003409504890442, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -12.865766525268555, "rewards/margins": 5.490581035614014, "rewards/rejected": -18.356346130371094, "step": 7550 }, { "epoch": 1.6888888888888889, "grad_norm": 3.8732025623321533, "learning_rate": 7.272021897273196e-07, "logits/chosen": 1.7439498901367188, "logits/rejected": 1.800236463546753, "logps/chosen": -131.75717163085938, "logps/rejected": -190.20635986328125, "loss": 0.2288, "nll_loss": 0.8083850741386414, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.175716400146484, "rewards/margins": 5.844921112060547, "rewards/rejected": -19.02063751220703, "step": 7600 }, { "epoch": 1.7, "grad_norm": 6.036092281341553, "learning_rate": 6.776508314765328e-07, "logits/chosen": 1.7502888441085815, "logits/rejected": 1.800950050354004, "logps/chosen": -129.86033630371094, "logps/rejected": -187.9499969482422, "loss": 0.1475, "nll_loss": 0.7846776843070984, "rewards/accuracies": 0.9850000143051147, "rewards/chosen": -12.986032485961914, "rewards/margins": 5.8089680671691895, "rewards/rejected": -18.795001983642578, "step": 7650 }, { "epoch": 1.7111111111111112, "grad_norm": 19.501018524169922, "learning_rate": 6.297249319769016e-07, "logits/chosen": 1.735203504562378, "logits/rejected": 1.7816643714904785, "logps/chosen": -144.02491760253906, "logps/rejected": -195.0731201171875, "loss": 0.2339, "nll_loss": 0.8553965091705322, "rewards/accuracies": 0.9350000023841858, "rewards/chosen": -14.402491569519043, "rewards/margins": 5.1048197746276855, "rewards/rejected": -19.507312774658203, "step": 7700 }, { "epoch": 1.7222222222222223, "grad_norm": 19.679893493652344, "learning_rate": 5.834425141992045e-07, "logits/chosen": 1.7812249660491943, "logits/rejected": 1.8141940832138062, "logps/chosen": -128.50064086914062, "logps/rejected": -181.84490966796875, "loss": 0.2109, "nll_loss": 0.8024736046791077, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -12.850064277648926, "rewards/margins": 5.3344268798828125, "rewards/rejected": -18.184490203857422, "step": 7750 }, { "epoch": 1.7333333333333334, "grad_norm": 2.058384418487549, "learning_rate": 5.388209830679508e-07, "logits/chosen": 1.7586055994033813, "logits/rejected": 1.8227900266647339, "logps/chosen": -131.828369140625, "logps/rejected": -192.03436279296875, "loss": 0.1827, "nll_loss": 0.8194211721420288, "rewards/accuracies": 0.9649999737739563, "rewards/chosen": -13.182836532592773, "rewards/margins": 6.020601272583008, "rewards/rejected": -19.20343780517578, "step": 7800 }, { "epoch": 1.7444444444444445, "grad_norm": 0.7442694902420044, "learning_rate": 4.958771189161149e-07, "logits/chosen": 1.7269538640975952, "logits/rejected": 1.8100186586380005, "logps/chosen": -131.53375244140625, "logps/rejected": -189.80145263671875, "loss": 0.2083, "nll_loss": 0.8247582912445068, "rewards/accuracies": 0.9649999737739563, "rewards/chosen": -13.153375625610352, "rewards/margins": 5.826771259307861, "rewards/rejected": -18.980146408081055, "step": 7850 }, { "epoch": 1.7555555555555555, "grad_norm": 7.250400543212891, "learning_rate": 4.5462707117472914e-07, "logits/chosen": 1.7770702838897705, "logits/rejected": 1.8198394775390625, "logps/chosen": -125.560302734375, "logps/rejected": -182.01446533203125, "loss": 0.1605, "nll_loss": 0.8139318823814392, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -12.5560302734375, "rewards/margins": 5.645419120788574, "rewards/rejected": -18.201448440551758, "step": 7900 }, { "epoch": 1.7666666666666666, "grad_norm": 15.447861671447754, "learning_rate": 4.150863522997456e-07, "logits/chosen": 1.7705377340316772, "logits/rejected": 1.815138578414917, "logps/chosen": -133.81626892089844, "logps/rejected": -185.4794158935547, "loss": 0.1704, "nll_loss": 0.8367331027984619, "rewards/accuracies": 0.9850000143051147, "rewards/chosen": -13.381627082824707, "rewards/margins": 5.166313648223877, "rewards/rejected": -18.547941207885742, "step": 7950 }, { "epoch": 1.7777777777777777, "grad_norm": 74.23778533935547, "learning_rate": 3.772698319384349e-07, "logits/chosen": 1.7581907510757446, "logits/rejected": 1.7949700355529785, "logps/chosen": -127.36304473876953, "logps/rejected": -179.04075622558594, "loss": 0.2312, "nll_loss": 0.821094810962677, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -12.736305236816406, "rewards/margins": 5.1677727699279785, "rewards/rejected": -17.904077529907227, "step": 8000 }, { "epoch": 1.7888888888888888, "grad_norm": 7.166709899902344, "learning_rate": 3.411917313375235e-07, "logits/chosen": 1.7578457593917847, "logits/rejected": 1.8187922239303589, "logps/chosen": -131.8065948486328, "logps/rejected": -185.81988525390625, "loss": 0.1565, "nll_loss": 0.8209772706031799, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -13.180660247802734, "rewards/margins": 5.401330471038818, "rewards/rejected": -18.581989288330078, "step": 8050 }, { "epoch": 1.8, "grad_norm": 20.336841583251953, "learning_rate": 3.068656179951618e-07, "logits/chosen": 1.754990816116333, "logits/rejected": 1.8069425821304321, "logps/chosen": -131.60287475585938, "logps/rejected": -187.40249633789062, "loss": 0.2168, "nll_loss": 0.8227947950363159, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.160287857055664, "rewards/margins": 5.579964637756348, "rewards/rejected": -18.740253448486328, "step": 8100 }, { "epoch": 1.8111111111111111, "grad_norm": 11.0819673538208, "learning_rate": 2.743044005587425e-07, "logits/chosen": 1.739294409751892, "logits/rejected": 1.792691946029663, "logps/chosen": -130.30499267578125, "logps/rejected": -185.04054260253906, "loss": 0.1855, "nll_loss": 0.7876389026641846, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.030498504638672, "rewards/margins": 5.47355842590332, "rewards/rejected": -18.504056930541992, "step": 8150 }, { "epoch": 1.8222222222222222, "grad_norm": 2.490368366241455, "learning_rate": 2.4352032397048584e-07, "logits/chosen": 1.750669002532959, "logits/rejected": 1.807554006576538, "logps/chosen": -133.83702087402344, "logps/rejected": -194.2929229736328, "loss": 0.1909, "nll_loss": 0.8178808689117432, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.383702278137207, "rewards/margins": 6.045591831207275, "rewards/rejected": -19.42929458618164, "step": 8200 }, { "epoch": 1.8333333333333335, "grad_norm": 8.460405349731445, "learning_rate": 2.1452496486262132e-07, "logits/chosen": 1.7462210655212402, "logits/rejected": 1.7936456203460693, "logps/chosen": -135.83560180664062, "logps/rejected": -195.0395050048828, "loss": 0.1749, "nll_loss": 0.8002566695213318, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.583561897277832, "rewards/margins": 5.920388221740723, "rewards/rejected": -19.503948211669922, "step": 8250 }, { "epoch": 1.8444444444444446, "grad_norm": 5.650655269622803, "learning_rate": 1.873292272038868e-07, "logits/chosen": 1.766019582748413, "logits/rejected": 1.7951419353485107, "logps/chosen": -128.2977294921875, "logps/rejected": -184.0165252685547, "loss": 0.2065, "nll_loss": 0.8101325035095215, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -12.82977294921875, "rewards/margins": 5.571878433227539, "rewards/rejected": -18.401653289794922, "step": 8300 }, { "epoch": 1.8555555555555556, "grad_norm": 8.420461654663086, "learning_rate": 1.61943338198991e-07, "logits/chosen": 1.7439825534820557, "logits/rejected": 1.7637810707092285, "logps/chosen": -129.1949005126953, "logps/rejected": -188.9054718017578, "loss": 0.1771, "nll_loss": 0.7909958362579346, "rewards/accuracies": 0.9649999737739563, "rewards/chosen": -12.919489860534668, "rewards/margins": 5.971058368682861, "rewards/rejected": -18.890548706054688, "step": 8350 }, { "epoch": 1.8666666666666667, "grad_norm": 0.8354198932647705, "learning_rate": 1.3837684444258092e-07, "logits/chosen": 1.736147165298462, "logits/rejected": 1.7959703207015991, "logps/chosen": -134.53237915039062, "logps/rejected": -190.03311157226562, "loss": 0.1782, "nll_loss": 0.8283710479736328, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.453238487243652, "rewards/margins": 5.550073623657227, "rewards/rejected": -19.003313064575195, "step": 8400 }, { "epoch": 1.8777777777777778, "grad_norm": 1.552877426147461, "learning_rate": 1.166386083291604e-07, "logits/chosen": 1.7317723035812378, "logits/rejected": 1.7958444356918335, "logps/chosen": -134.1968994140625, "logps/rejected": -194.5653533935547, "loss": 0.1999, "nll_loss": 0.8258512616157532, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -13.41969108581543, "rewards/margins": 6.036843299865723, "rewards/rejected": -19.45653533935547, "step": 8450 }, { "epoch": 1.8888888888888888, "grad_norm": 3.1167664527893066, "learning_rate": 9.673680472030322e-08, "logits/chosen": 1.763848900794983, "logits/rejected": 1.8280714750289917, "logps/chosen": -138.82061767578125, "logps/rejected": -190.98226928710938, "loss": 0.2391, "nll_loss": 0.837114691734314, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -13.882061958312988, "rewards/margins": 5.216164588928223, "rewards/rejected": -19.098228454589844, "step": 8500 }, { "epoch": 1.9, "grad_norm": 6.203932762145996, "learning_rate": 7.86789178704217e-08, "logits/chosen": 1.7559263706207275, "logits/rejected": 1.8025550842285156, "logps/chosen": -135.3566131591797, "logps/rejected": -198.47793579101562, "loss": 0.1985, "nll_loss": 0.8531476855278015, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.535660743713379, "rewards/margins": 6.312133312225342, "rewards/rejected": -19.847793579101562, "step": 8550 }, { "epoch": 1.911111111111111, "grad_norm": 1.6203575134277344, "learning_rate": 6.247173861224753e-08, "logits/chosen": 1.779411792755127, "logits/rejected": 1.8070580959320068, "logps/chosen": -135.18385314941406, "logps/rejected": -197.24835205078125, "loss": 0.1888, "nll_loss": 0.8195974826812744, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.518383979797363, "rewards/margins": 6.206449508666992, "rewards/rejected": -19.724834442138672, "step": 8600 }, { "epoch": 1.9222222222222223, "grad_norm": 2.483865737915039, "learning_rate": 4.8121361803073476e-08, "logits/chosen": 1.7167291641235352, "logits/rejected": 1.7759296894073486, "logps/chosen": -133.6821746826172, "logps/rejected": -194.90081787109375, "loss": 0.1656, "nll_loss": 0.802808940410614, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -13.368217468261719, "rewards/margins": 6.1218647956848145, "rewards/rejected": -19.490083694458008, "step": 8650 }, { "epoch": 1.9333333333333333, "grad_norm": 7.93395471572876, "learning_rate": 3.563318403273119e-08, "logits/chosen": 1.7604470252990723, "logits/rejected": 1.7838407754898071, "logps/chosen": -139.4965057373047, "logps/rejected": -194.57095336914062, "loss": 0.2142, "nll_loss": 0.8445702195167542, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -13.949650764465332, "rewards/margins": 5.507444858551025, "rewards/rejected": -19.457094192504883, "step": 8700 }, { "epoch": 1.9444444444444444, "grad_norm": 5.804969310760498, "learning_rate": 2.501190159415079e-08, "logits/chosen": 1.7850152254104614, "logits/rejected": 1.8449233770370483, "logps/chosen": -136.3233642578125, "logps/rejected": -189.369140625, "loss": 0.22, "nll_loss": 0.8443524241447449, "rewards/accuracies": 0.9549999833106995, "rewards/chosen": -13.632336616516113, "rewards/margins": 5.304579257965088, "rewards/rejected": -18.936914443969727, "step": 8750 }, { "epoch": 1.9555555555555557, "grad_norm": 8.688746452331543, "learning_rate": 1.6261508717278497e-08, "logits/chosen": 1.7537765502929688, "logits/rejected": 1.811083197593689, "logps/chosen": -136.51060485839844, "logps/rejected": -196.0956573486328, "loss": 0.2033, "nll_loss": 0.8331271409988403, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -13.651061058044434, "rewards/margins": 5.958508491516113, "rewards/rejected": -19.609567642211914, "step": 8800 }, { "epoch": 1.9666666666666668, "grad_norm": 11.44411563873291, "learning_rate": 9.38529606701044e-09, "logits/chosen": 1.7702893018722534, "logits/rejected": 1.7988635301589966, "logps/chosen": -128.63967895507812, "logps/rejected": -177.89755249023438, "loss": 0.204, "nll_loss": 0.8193702101707458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -12.863966941833496, "rewards/margins": 4.925789833068848, "rewards/rejected": -17.789758682250977, "step": 8850 }, { "epoch": 1.9777777777777779, "grad_norm": 5.2503485679626465, "learning_rate": 4.385849505708084e-09, "logits/chosen": 1.7600834369659424, "logits/rejected": 1.8281025886535645, "logps/chosen": -127.75049591064453, "logps/rejected": -195.32208251953125, "loss": 0.1846, "nll_loss": 0.7966732978820801, "rewards/accuracies": 0.9449999928474426, "rewards/chosen": -12.775049209594727, "rewards/margins": 6.7571587562561035, "rewards/rejected": -19.532207489013672, "step": 8900 }, { "epoch": 1.988888888888889, "grad_norm": 2.3727266788482666, "learning_rate": 1.265049120761086e-09, "logits/chosen": 1.7509323358535767, "logits/rejected": 1.7904582023620605, "logps/chosen": -132.1265106201172, "logps/rejected": -183.1971435546875, "loss": 0.2223, "nll_loss": 0.8050432801246643, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -13.212652206420898, "rewards/margins": 5.107062816619873, "rewards/rejected": -18.319717407226562, "step": 8950 }, { "epoch": 2.0, "grad_norm": 6.499065399169922, "learning_rate": 2.406851756231454e-11, "logits/chosen": 1.734092354774475, "logits/rejected": 1.7700865268707275, "logps/chosen": -129.91470336914062, "logps/rejected": -186.14447021484375, "loss": 0.1793, "nll_loss": 0.8055033683776855, "rewards/accuracies": 0.9649999737739563, "rewards/chosen": -12.991472244262695, "rewards/margins": 5.622976303100586, "rewards/rejected": -18.61444664001465, "step": 9000 } ], "logging_steps": 50, "max_steps": 9000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }