{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9969257795344753, "eval_steps": 500, "global_step": 4552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00878348704435661, "grad_norm": 86.30626002963122, "learning_rate": 9.980228471001756e-07, "logits/chosen": 0.504931628704071, "logits/rejected": 0.481201171875, "logps/chosen": -248.9499969482422, "logps/rejected": -345.3999938964844, "loss": 0.6632, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.13306502997875214, "rewards/margins": 0.06868477165699005, "rewards/rejected": -0.20163726806640625, "step": 10 }, { "epoch": 0.01756697408871322, "grad_norm": 75.57284540153798, "learning_rate": 9.958260105448154e-07, "logits/chosen": 0.505725085735321, "logits/rejected": 0.4685607850551605, "logps/chosen": -256.75, "logps/rejected": -365.20001220703125, "loss": 0.6182, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.663037121295929, "rewards/margins": 0.3249877989292145, "rewards/rejected": -0.98681640625, "step": 20 }, { "epoch": 0.026350461133069828, "grad_norm": 59.851960022321094, "learning_rate": 9.936291739894551e-07, "logits/chosen": 0.3142150938510895, "logits/rejected": 0.30571287870407104, "logps/chosen": -249.14999389648438, "logps/rejected": -385.1499938964844, "loss": 0.6235, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1501953601837158, "rewards/margins": 0.4761962890625, "rewards/rejected": -1.6257812976837158, "step": 30 }, { "epoch": 0.03513394817742644, "grad_norm": 73.83496374292575, "learning_rate": 9.91432337434095e-07, "logits/chosen": 0.22160644829273224, "logits/rejected": 0.17151793837547302, "logps/chosen": -286.0, "logps/rejected": -397.5, "loss": 0.6507, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.221289038658142, "rewards/margins": 0.45764464139938354, "rewards/rejected": -1.6798827648162842, "step": 40 }, { "epoch": 0.04391743522178305, "grad_norm": 110.3213609013335, "learning_rate": 9.892355008787344e-07, "logits/chosen": 0.18483276665210724, "logits/rejected": 0.2655044496059418, "logps/chosen": -294.6499938964844, "logps/rejected": -409.6499938964844, "loss": 0.5913, "rewards/accuracies": 0.6875, "rewards/chosen": -1.033593773841858, "rewards/margins": 0.5658203363418579, "rewards/rejected": -1.599218726158142, "step": 50 }, { "epoch": 0.052700922266139656, "grad_norm": 83.19057756450724, "learning_rate": 9.870386643233744e-07, "logits/chosen": 0.33186036348342896, "logits/rejected": 0.34326171875, "logps/chosen": -258.125, "logps/rejected": -362.6499938964844, "loss": 0.5869, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7457031011581421, "rewards/margins": 0.5174316167831421, "rewards/rejected": -1.2628905773162842, "step": 60 }, { "epoch": 0.061484409310496264, "grad_norm": 76.79854852441402, "learning_rate": 9.84841827768014e-07, "logits/chosen": 0.4409423768520355, "logits/rejected": 0.35008543729782104, "logps/chosen": -254.60000610351562, "logps/rejected": -349.4750061035156, "loss": 0.6274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.54425048828125, "rewards/margins": 0.39222413301467896, "rewards/rejected": -0.9359375238418579, "step": 70 }, { "epoch": 0.07026789635485288, "grad_norm": 95.8010523516721, "learning_rate": 9.826449912126537e-07, "logits/chosen": 0.513171374797821, "logits/rejected": 0.3669067323207855, "logps/chosen": -262.79998779296875, "logps/rejected": -369.25, "loss": 0.5965, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3936523497104645, "rewards/margins": 0.519848644733429, "rewards/rejected": -0.913037121295929, "step": 80 }, { "epoch": 0.07905138339920949, "grad_norm": 79.02666510409132, "learning_rate": 9.804481546572935e-07, "logits/chosen": 0.42827147245407104, "logits/rejected": 0.41110342741012573, "logps/chosen": -238.8249969482422, "logps/rejected": -335.29998779296875, "loss": 0.6016, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22039183974266052, "rewards/margins": 0.4880615174770355, "rewards/rejected": -0.709551990032196, "step": 90 }, { "epoch": 0.0878348704435661, "grad_norm": 68.87638912001593, "learning_rate": 9.782513181019332e-07, "logits/chosen": 0.4198242127895355, "logits/rejected": 0.38599854707717896, "logps/chosen": -261.3999938964844, "logps/rejected": -336.1000061035156, "loss": 0.5905, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07661132514476776, "rewards/margins": 0.4953857362270355, "rewards/rejected": -0.571826159954071, "step": 100 }, { "epoch": 0.0966183574879227, "grad_norm": 87.28787342152638, "learning_rate": 9.760544815465728e-07, "logits/chosen": 0.4025512635707855, "logits/rejected": 0.3656372129917145, "logps/chosen": -249.5749969482422, "logps/rejected": -403.5, "loss": 0.4731, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.05717773362994194, "rewards/margins": 0.8402343988418579, "rewards/rejected": -0.78271484375, "step": 110 }, { "epoch": 0.10540184453227931, "grad_norm": 49.68811062342779, "learning_rate": 9.738576449912126e-07, "logits/chosen": 0.4788269102573395, "logits/rejected": 0.43415528535842896, "logps/chosen": -272.67498779296875, "logps/rejected": -389.29998779296875, "loss": 0.5488, "rewards/accuracies": 0.6875, "rewards/chosen": -0.63458251953125, "rewards/margins": 0.8232421875, "rewards/rejected": -1.457617163658142, "step": 120 }, { "epoch": 0.11418533157663592, "grad_norm": 72.05085043021134, "learning_rate": 9.716608084358523e-07, "logits/chosen": 0.34930419921875, "logits/rejected": 0.360595703125, "logps/chosen": -286.8999938964844, "logps/rejected": -416.79998779296875, "loss": 0.4959, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9070800542831421, "rewards/margins": 1.156347632408142, "rewards/rejected": -2.063671827316284, "step": 130 }, { "epoch": 0.12296881862099253, "grad_norm": 91.67711344229532, "learning_rate": 9.69463971880492e-07, "logits/chosen": 0.4559082090854645, "logits/rejected": 0.4104247987270355, "logps/chosen": -291.95001220703125, "logps/rejected": -395.45001220703125, "loss": 0.4574, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7964233160018921, "rewards/margins": 1.088476538658142, "rewards/rejected": -1.8855469226837158, "step": 140 }, { "epoch": 0.13175230566534915, "grad_norm": 100.55855221605185, "learning_rate": 9.672671353251316e-07, "logits/chosen": 0.2634124755859375, "logits/rejected": 0.2819274961948395, "logps/chosen": -247.4250030517578, "logps/rejected": -346.29998779296875, "loss": 0.5576, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.621508777141571, "rewards/margins": 0.9619385004043579, "rewards/rejected": -1.582421898841858, "step": 150 }, { "epoch": 0.14053579270970576, "grad_norm": 53.65172771838944, "learning_rate": 9.650702987697716e-07, "logits/chosen": 0.32564085721969604, "logits/rejected": 0.21094055473804474, "logps/chosen": -274.42498779296875, "logps/rejected": -430.1499938964844, "loss": 0.4501, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3674072325229645, "rewards/margins": 1.241308569908142, "rewards/rejected": -1.606835961341858, "step": 160 }, { "epoch": 0.14931927975406237, "grad_norm": 76.84251706588795, "learning_rate": 9.628734622144111e-07, "logits/chosen": 0.37872314453125, "logits/rejected": 0.21410521864891052, "logps/chosen": -260.1499938964844, "logps/rejected": -368.20001220703125, "loss": 0.477, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.02834167517721653, "rewards/margins": 1.10693359375, "rewards/rejected": -1.136254906654358, "step": 170 }, { "epoch": 0.15810276679841898, "grad_norm": 102.06277770991835, "learning_rate": 9.60676625659051e-07, "logits/chosen": 0.4544433653354645, "logits/rejected": 0.26850587129592896, "logps/chosen": -269.79998779296875, "logps/rejected": -390.25, "loss": 0.5608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.44062501192092896, "rewards/margins": 1.1204102039337158, "rewards/rejected": -1.561865210533142, "step": 180 }, { "epoch": 0.16688625384277558, "grad_norm": 64.47210947057408, "learning_rate": 9.584797891036907e-07, "logits/chosen": 0.42279356718063354, "logits/rejected": 0.3376808166503906, "logps/chosen": -340.5, "logps/rejected": -431.79998779296875, "loss": 0.5027, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.596972644329071, "rewards/margins": 1.141992211341858, "rewards/rejected": -1.7384765148162842, "step": 190 }, { "epoch": 0.1756697408871322, "grad_norm": 112.16896901380547, "learning_rate": 9.562829525483304e-07, "logits/chosen": 0.47590333223342896, "logits/rejected": 0.4058471620082855, "logps/chosen": -281.0, "logps/rejected": -408.6499938964844, "loss": 0.482, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.21808472275733948, "rewards/margins": 1.21484375, "rewards/rejected": -1.431787133216858, "step": 200 }, { "epoch": 0.1844532279314888, "grad_norm": 80.51840028670627, "learning_rate": 9.5408611599297e-07, "logits/chosen": 0.41748046875, "logits/rejected": 0.3203491270542145, "logps/chosen": -291.1000061035156, "logps/rejected": -382.1499938964844, "loss": 0.555, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2886718809604645, "rewards/margins": 1.015386939048767, "rewards/rejected": -1.3042480945587158, "step": 210 }, { "epoch": 0.1932367149758454, "grad_norm": 111.7453650521764, "learning_rate": 9.518892794376097e-07, "logits/chosen": 0.34769898653030396, "logits/rejected": 0.2665161192417145, "logps/chosen": -262.95001220703125, "logps/rejected": -370.0, "loss": 0.5997, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23859862983226776, "rewards/margins": 0.857226550579071, "rewards/rejected": -1.0953369140625, "step": 220 }, { "epoch": 0.20202020202020202, "grad_norm": 90.00407895721551, "learning_rate": 9.496924428822495e-07, "logits/chosen": 0.31724244356155396, "logits/rejected": 0.3418945372104645, "logps/chosen": -266.79998779296875, "logps/rejected": -360.29998779296875, "loss": 0.5759, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1005859375, "rewards/margins": 1.0169677734375, "rewards/rejected": -1.117407202720642, "step": 230 }, { "epoch": 0.21080368906455862, "grad_norm": 65.35406408034969, "learning_rate": 9.474956063268892e-07, "logits/chosen": 0.27976685762405396, "logits/rejected": 0.28980714082717896, "logps/chosen": -277.25, "logps/rejected": -387.79998779296875, "loss": 0.5426, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.22409668564796448, "rewards/margins": 1.0364990234375, "rewards/rejected": -0.812255859375, "step": 240 }, { "epoch": 0.21958717610891523, "grad_norm": 46.934689106398565, "learning_rate": 9.45298769771529e-07, "logits/chosen": 0.3211303651332855, "logits/rejected": 0.27087098360061646, "logps/chosen": -264.1499938964844, "logps/rejected": -349.20001220703125, "loss": 0.5322, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.14288024604320526, "rewards/margins": 1.204345703125, "rewards/rejected": -1.0603516101837158, "step": 250 }, { "epoch": 0.22837066315327184, "grad_norm": 88.70698352931073, "learning_rate": 9.431019332161687e-07, "logits/chosen": 0.4539550840854645, "logits/rejected": 0.4018005430698395, "logps/chosen": -244.6999969482422, "logps/rejected": -371.3999938964844, "loss": 0.5586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10993652045726776, "rewards/margins": 1.062963843345642, "rewards/rejected": -0.9530029296875, "step": 260 }, { "epoch": 0.23715415019762845, "grad_norm": 104.88388508507241, "learning_rate": 9.409050966608084e-07, "logits/chosen": 0.37379759550094604, "logits/rejected": 0.3221679627895355, "logps/chosen": -275.57501220703125, "logps/rejected": -380.6000061035156, "loss": 0.5329, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10671386867761612, "rewards/margins": 1.03662109375, "rewards/rejected": -1.1419799327850342, "step": 270 }, { "epoch": 0.24593763724198506, "grad_norm": 63.642487234693135, "learning_rate": 9.387082601054481e-07, "logits/chosen": 0.405426025390625, "logits/rejected": 0.4315429627895355, "logps/chosen": -300.45001220703125, "logps/rejected": -395.0, "loss": 0.494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.21533203125, "rewards/margins": 1.113378882408142, "rewards/rejected": -1.328125, "step": 280 }, { "epoch": 0.2547211242863417, "grad_norm": 90.35617106594229, "learning_rate": 9.365114235500879e-07, "logits/chosen": 0.46123045682907104, "logits/rejected": 0.33272093534469604, "logps/chosen": -261.1000061035156, "logps/rejected": -378.1000061035156, "loss": 0.5335, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.4730468690395355, "rewards/margins": 1.110742211341858, "rewards/rejected": -1.583837866783142, "step": 290 }, { "epoch": 0.2635046113306983, "grad_norm": 80.0442392907133, "learning_rate": 9.343145869947275e-07, "logits/chosen": 0.35919189453125, "logits/rejected": 0.26463621854782104, "logps/chosen": -237.6750030517578, "logps/rejected": -397.6000061035156, "loss": 0.4438, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.650585949420929, "rewards/margins": 1.52777099609375, "rewards/rejected": -2.1792969703674316, "step": 300 }, { "epoch": 0.2722880983750549, "grad_norm": 64.40170030634079, "learning_rate": 9.321177504393673e-07, "logits/chosen": 0.445587158203125, "logits/rejected": 0.34685057401657104, "logps/chosen": -261.04998779296875, "logps/rejected": -389.04998779296875, "loss": 0.5305, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6322692632675171, "rewards/margins": 1.50830078125, "rewards/rejected": -2.140625, "step": 310 }, { "epoch": 0.2810715854194115, "grad_norm": 72.38928863902277, "learning_rate": 9.299209138840069e-07, "logits/chosen": 0.38462525606155396, "logits/rejected": 0.42045897245407104, "logps/chosen": -283.6000061035156, "logps/rejected": -395.79998779296875, "loss": 0.4293, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.722393810749054, "rewards/margins": 1.4466674327850342, "rewards/rejected": -2.170703172683716, "step": 320 }, { "epoch": 0.2898550724637681, "grad_norm": 70.98760478460224, "learning_rate": 9.277240773286467e-07, "logits/chosen": 0.4158691465854645, "logits/rejected": 0.2955688536167145, "logps/chosen": -264.42498779296875, "logps/rejected": -370.1499938964844, "loss": 0.5402, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.59735107421875, "rewards/margins": 1.1348145008087158, "rewards/rejected": -1.731298804283142, "step": 330 }, { "epoch": 0.29863855950812473, "grad_norm": 106.57823304368992, "learning_rate": 9.255272407732864e-07, "logits/chosen": 0.4222168028354645, "logits/rejected": 0.27473145723342896, "logps/chosen": -273.29998779296875, "logps/rejected": -389.54998779296875, "loss": 0.5372, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.45722657442092896, "rewards/margins": 1.470605492591858, "rewards/rejected": -1.927734375, "step": 340 }, { "epoch": 0.30742204655248134, "grad_norm": 73.25457747906178, "learning_rate": 9.233304042179262e-07, "logits/chosen": 0.49915772676467896, "logits/rejected": 0.47480469942092896, "logps/chosen": -288.3999938964844, "logps/rejected": -380.8999938964844, "loss": 0.4837, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.584057629108429, "rewards/margins": 1.1042144298553467, "rewards/rejected": -1.687890648841858, "step": 350 }, { "epoch": 0.31620553359683795, "grad_norm": 87.73830871717917, "learning_rate": 9.211335676625659e-07, "logits/chosen": 0.36259764432907104, "logits/rejected": 0.28267210721969604, "logps/chosen": -299.75, "logps/rejected": -425.79998779296875, "loss": 0.5065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5321899652481079, "rewards/margins": 1.206689476966858, "rewards/rejected": -1.7380859851837158, "step": 360 }, { "epoch": 0.32498902064119456, "grad_norm": 57.83729234738691, "learning_rate": 9.189367311072056e-07, "logits/chosen": 0.37762451171875, "logits/rejected": 0.3880981504917145, "logps/chosen": -274.20001220703125, "logps/rejected": -363.42498779296875, "loss": 0.5513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.17774200439453125, "rewards/margins": 0.9449707269668579, "rewards/rejected": -1.121923804283142, "step": 370 }, { "epoch": 0.33377250768555117, "grad_norm": 84.7285344101326, "learning_rate": 9.167398945518453e-07, "logits/chosen": 0.3119262754917145, "logits/rejected": 0.2841796875, "logps/chosen": -278.3999938964844, "logps/rejected": -404.5, "loss": 0.4963, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1128082275390625, "rewards/margins": 1.295654296875, "rewards/rejected": -1.1838867664337158, "step": 380 }, { "epoch": 0.3425559947299078, "grad_norm": 60.43184329565753, "learning_rate": 9.14543057996485e-07, "logits/chosen": 0.3933044373989105, "logits/rejected": 0.27689820528030396, "logps/chosen": -235.5749969482422, "logps/rejected": -353.79998779296875, "loss": 0.568, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3050537109375, "rewards/margins": 1.087060570716858, "rewards/rejected": -1.39080810546875, "step": 390 }, { "epoch": 0.3513394817742644, "grad_norm": 72.52611282912443, "learning_rate": 9.123462214411247e-07, "logits/chosen": 0.2555908262729645, "logits/rejected": 0.25152587890625, "logps/chosen": -300.5, "logps/rejected": -410.6499938964844, "loss": 0.5278, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6889556646347046, "rewards/margins": 1.2967529296875, "rewards/rejected": -1.9861328601837158, "step": 400 }, { "epoch": 0.360122968818621, "grad_norm": 55.61732812995934, "learning_rate": 9.101493848857645e-07, "logits/chosen": 0.2975524961948395, "logits/rejected": 0.27109986543655396, "logps/chosen": -283.95001220703125, "logps/rejected": -380.70001220703125, "loss": 0.4729, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4813476502895355, "rewards/margins": 1.426367163658142, "rewards/rejected": -1.9074218273162842, "step": 410 }, { "epoch": 0.3689064558629776, "grad_norm": 66.43275567673332, "learning_rate": 9.079525483304041e-07, "logits/chosen": 0.3774566650390625, "logits/rejected": 0.327322393655777, "logps/chosen": -261.7250061035156, "logps/rejected": -354.45001220703125, "loss": 0.544, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.23259887099266052, "rewards/margins": 1.1032226085662842, "rewards/rejected": -1.335046410560608, "step": 420 }, { "epoch": 0.3776899429073342, "grad_norm": 81.99188926538554, "learning_rate": 9.057557117750439e-07, "logits/chosen": 0.385009765625, "logits/rejected": 0.4036498963832855, "logps/chosen": -263.5, "logps/rejected": -356.54998779296875, "loss": 0.6347, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.103607177734375, "rewards/margins": 0.7696288824081421, "rewards/rejected": -0.874267578125, "step": 430 }, { "epoch": 0.3864734299516908, "grad_norm": 84.05808494825982, "learning_rate": 9.035588752196836e-07, "logits/chosen": 0.4146728515625, "logits/rejected": 0.3936523497104645, "logps/chosen": -261.54998779296875, "logps/rejected": -407.6499938964844, "loss": 0.4429, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1259765625, "rewards/margins": 1.2878906726837158, "rewards/rejected": -1.414941430091858, "step": 440 }, { "epoch": 0.3952569169960474, "grad_norm": 98.78080443456875, "learning_rate": 9.013620386643234e-07, "logits/chosen": 0.4496826231479645, "logits/rejected": 0.40058594942092896, "logps/chosen": -316.5, "logps/rejected": -446.70001220703125, "loss": 0.4756, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.611462414264679, "rewards/margins": 1.172998070716858, "rewards/rejected": -1.7830078601837158, "step": 450 }, { "epoch": 0.40404040404040403, "grad_norm": 64.12300336923452, "learning_rate": 8.99165202108963e-07, "logits/chosen": 0.3731750547885895, "logits/rejected": 0.3060058653354645, "logps/chosen": -253.47500610351562, "logps/rejected": -369.75, "loss": 0.4744, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4822753965854645, "rewards/margins": 1.300024390220642, "rewards/rejected": -1.779882788658142, "step": 460 }, { "epoch": 0.41282389108476064, "grad_norm": 76.13136441198672, "learning_rate": 8.969683655536028e-07, "logits/chosen": 0.32374268770217896, "logits/rejected": 0.272735595703125, "logps/chosen": -262.625, "logps/rejected": -360.0, "loss": 0.4774, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.38884276151657104, "rewards/margins": 1.3416016101837158, "rewards/rejected": -1.7322266101837158, "step": 470 }, { "epoch": 0.42160737812911725, "grad_norm": 79.11332389964583, "learning_rate": 8.947715289982425e-07, "logits/chosen": 0.28551024198532104, "logits/rejected": 0.30437010526657104, "logps/chosen": -289.5, "logps/rejected": -400.1499938964844, "loss": 0.5336, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6865234375, "rewards/margins": 1.336816430091858, "rewards/rejected": -2.0230469703674316, "step": 480 }, { "epoch": 0.43039086517347386, "grad_norm": 90.04164898380533, "learning_rate": 8.925746924428822e-07, "logits/chosen": 0.3634704649448395, "logits/rejected": 0.3254150450229645, "logps/chosen": -279.8999938964844, "logps/rejected": -358.1499938964844, "loss": 0.4768, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.4574218690395355, "rewards/margins": 1.381738305091858, "rewards/rejected": -1.839746117591858, "step": 490 }, { "epoch": 0.43917435221783047, "grad_norm": 74.98582547594647, "learning_rate": 8.903778558875219e-07, "logits/chosen": 0.323699951171875, "logits/rejected": 0.27777403593063354, "logps/chosen": -281.67498779296875, "logps/rejected": -429.70001220703125, "loss": 0.5156, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2854980528354645, "rewards/margins": 1.441308617591858, "rewards/rejected": -1.725976586341858, "step": 500 }, { "epoch": 0.4479578392621871, "grad_norm": 83.31338074401668, "learning_rate": 8.881810193321616e-07, "logits/chosen": 0.3398071229457855, "logits/rejected": 0.3562988340854645, "logps/chosen": -258.25, "logps/rejected": -382.0, "loss": 0.5172, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09996338188648224, "rewards/margins": 1.3611328601837158, "rewards/rejected": -1.461084008216858, "step": 510 }, { "epoch": 0.4567413263065437, "grad_norm": 59.62008881250761, "learning_rate": 8.859841827768013e-07, "logits/chosen": 0.5218505859375, "logits/rejected": 0.5107421875, "logps/chosen": -257.32501220703125, "logps/rejected": -355.32501220703125, "loss": 0.5288, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11716308444738388, "rewards/margins": 1.211035132408142, "rewards/rejected": -1.32763671875, "step": 520 }, { "epoch": 0.4655248133509003, "grad_norm": 70.82518291333233, "learning_rate": 8.837873462214412e-07, "logits/chosen": 0.5044189691543579, "logits/rejected": 0.48004150390625, "logps/chosen": -269.42498779296875, "logps/rejected": -384.6000061035156, "loss": 0.513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.06254883110523224, "rewards/margins": 1.2313232421875, "rewards/rejected": -1.293847680091858, "step": 530 }, { "epoch": 0.4743083003952569, "grad_norm": 65.88848785314461, "learning_rate": 8.815905096660808e-07, "logits/chosen": 0.580639660358429, "logits/rejected": 0.4949707090854645, "logps/chosen": -236.9499969482422, "logps/rejected": -345.79998779296875, "loss": 0.483, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12285156548023224, "rewards/margins": 1.074438452720642, "rewards/rejected": -0.95263671875, "step": 540 }, { "epoch": 0.4830917874396135, "grad_norm": 84.80182057429442, "learning_rate": 8.793936731107206e-07, "logits/chosen": 0.5431152582168579, "logits/rejected": 0.47700196504592896, "logps/chosen": -235.14999389648438, "logps/rejected": -362.3999938964844, "loss": 0.4513, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04042968899011612, "rewards/margins": 1.296972632408142, "rewards/rejected": -1.255615234375, "step": 550 }, { "epoch": 0.4918752744839701, "grad_norm": 88.31678525686402, "learning_rate": 8.771968365553602e-07, "logits/chosen": 0.5618652105331421, "logits/rejected": 0.5098876953125, "logps/chosen": -272.5, "logps/rejected": -377.5, "loss": 0.4983, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.24355468153953552, "rewards/margins": 1.386816382408142, "rewards/rejected": -1.6306641101837158, "step": 560 }, { "epoch": 0.5006587615283268, "grad_norm": 81.79872241084223, "learning_rate": 8.75e-07, "logits/chosen": 0.3651260435581207, "logits/rejected": 0.3791870176792145, "logps/chosen": -281.1499938964844, "logps/rejected": -369.6499938964844, "loss": 0.5421, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3820434510707855, "rewards/margins": 1.097509741783142, "rewards/rejected": -1.480371117591858, "step": 570 }, { "epoch": 0.5094422485726834, "grad_norm": 61.254349116513914, "learning_rate": 8.728031634446396e-07, "logits/chosen": 0.37919920682907104, "logits/rejected": 0.28413695096969604, "logps/chosen": -283.5, "logps/rejected": -410.79998779296875, "loss": 0.5571, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.15074463188648224, "rewards/margins": 1.5600097179412842, "rewards/rejected": -1.7091796398162842, "step": 580 }, { "epoch": 0.51822573561704, "grad_norm": 52.16193823969323, "learning_rate": 8.706063268892794e-07, "logits/chosen": 0.29414063692092896, "logits/rejected": 0.24357299506664276, "logps/chosen": -232.10000610351562, "logps/rejected": -382.1000061035156, "loss": 0.4546, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.02094726637005806, "rewards/margins": 1.6032226085662842, "rewards/rejected": -1.62445068359375, "step": 590 }, { "epoch": 0.5270092226613966, "grad_norm": 159.8024434954629, "learning_rate": 8.684094903339191e-07, "logits/chosen": 0.269195556640625, "logits/rejected": 0.24223633110523224, "logps/chosen": -286.82501220703125, "logps/rejected": -414.1000061035156, "loss": 0.5466, "rewards/accuracies": 0.75, "rewards/chosen": -0.261880487203598, "rewards/margins": 1.4085204601287842, "rewards/rejected": -1.6699707508087158, "step": 600 }, { "epoch": 0.5357927097057532, "grad_norm": 114.97479520171112, "learning_rate": 8.662126537785588e-07, "logits/chosen": 0.26755982637405396, "logits/rejected": 0.30859678983688354, "logps/chosen": -215.47500610351562, "logps/rejected": -316.3999938964844, "loss": 0.5409, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.100830078125, "rewards/margins": 1.0963134765625, "rewards/rejected": -1.197998046875, "step": 610 }, { "epoch": 0.5445761967501098, "grad_norm": 59.170457169259784, "learning_rate": 8.640158172231986e-07, "logits/chosen": 0.39715576171875, "logits/rejected": 0.31226807832717896, "logps/chosen": -241.5, "logps/rejected": -386.25, "loss": 0.4359, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12224121391773224, "rewards/margins": 1.7302734851837158, "rewards/rejected": -1.8525390625, "step": 620 }, { "epoch": 0.5533596837944664, "grad_norm": 66.93822693617777, "learning_rate": 8.618189806678383e-07, "logits/chosen": 0.24461975693702698, "logits/rejected": 0.20645752549171448, "logps/chosen": -275.0, "logps/rejected": -387.3500061035156, "loss": 0.4727, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.25587159395217896, "rewards/margins": 1.3369140625, "rewards/rejected": -1.5925781726837158, "step": 630 }, { "epoch": 0.562143170838823, "grad_norm": 53.52795758045956, "learning_rate": 8.59622144112478e-07, "logits/chosen": 0.20697021484375, "logits/rejected": 0.26872557401657104, "logps/chosen": -247.64999389648438, "logps/rejected": -373.75, "loss": 0.5504, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.27363282442092896, "rewards/margins": 1.2175781726837158, "rewards/rejected": -1.491796851158142, "step": 640 }, { "epoch": 0.5709266578831796, "grad_norm": 69.48656135643135, "learning_rate": 8.574253075571178e-07, "logits/chosen": 0.28449708223342896, "logits/rejected": 0.39190673828125, "logps/chosen": -246.0500030517578, "logps/rejected": -406.1000061035156, "loss": 0.5412, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15899047255516052, "rewards/margins": 1.248803734779358, "rewards/rejected": -1.409082055091858, "step": 650 }, { "epoch": 0.5797101449275363, "grad_norm": 68.75343467997828, "learning_rate": 8.552284710017574e-07, "logits/chosen": 0.3454833924770355, "logits/rejected": 0.35661619901657104, "logps/chosen": -292.32501220703125, "logps/rejected": -426.75, "loss": 0.4102, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3541015684604645, "rewards/margins": 1.7896239757537842, "rewards/rejected": -2.142773389816284, "step": 660 }, { "epoch": 0.5884936319718929, "grad_norm": 72.3767274783966, "learning_rate": 8.530316344463972e-07, "logits/chosen": 0.38081663846969604, "logits/rejected": 0.3572143614292145, "logps/chosen": -268.1499938964844, "logps/rejected": -365.20001220703125, "loss": 0.5357, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.324655145406723, "rewards/margins": 1.162744164466858, "rewards/rejected": -1.4882323741912842, "step": 670 }, { "epoch": 0.5972771190162495, "grad_norm": 81.40655334992947, "learning_rate": 8.508347978910368e-07, "logits/chosen": 0.3809814453125, "logits/rejected": 0.2562011778354645, "logps/chosen": -253.6750030517578, "logps/rejected": -368.20001220703125, "loss": 0.53, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3208068907260895, "rewards/margins": 1.302880883216858, "rewards/rejected": -1.6228516101837158, "step": 680 }, { "epoch": 0.6060606060606061, "grad_norm": 84.09957046864274, "learning_rate": 8.486379613356766e-07, "logits/chosen": 0.4454589784145355, "logits/rejected": 0.2889160215854645, "logps/chosen": -269.45001220703125, "logps/rejected": -387.54998779296875, "loss": 0.575, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2535034120082855, "rewards/margins": 1.0268065929412842, "rewards/rejected": -1.278222680091858, "step": 690 }, { "epoch": 0.6148440931049627, "grad_norm": 71.24254047588092, "learning_rate": 8.464411247803162e-07, "logits/chosen": 0.22578124701976776, "logits/rejected": 0.24399414658546448, "logps/chosen": -290.29998779296875, "logps/rejected": -411.3500061035156, "loss": 0.4896, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.02321777306497097, "rewards/margins": 1.379492163658142, "rewards/rejected": -1.401025414466858, "step": 700 }, { "epoch": 0.6236275801493193, "grad_norm": 61.51711219826457, "learning_rate": 8.442442882249561e-07, "logits/chosen": 0.3981567323207855, "logits/rejected": 0.2588745057582855, "logps/chosen": -251.0749969482422, "logps/rejected": -358.3999938964844, "loss": 0.4301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.009472656063735485, "rewards/margins": 1.478417992591858, "rewards/rejected": -1.487329125404358, "step": 710 }, { "epoch": 0.6324110671936759, "grad_norm": 97.45075080482356, "learning_rate": 8.420474516695958e-07, "logits/chosen": 0.36164551973342896, "logits/rejected": 0.2941741943359375, "logps/chosen": -266.8500061035156, "logps/rejected": -414.875, "loss": 0.5336, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.47001951932907104, "rewards/margins": 1.479882836341858, "rewards/rejected": -1.9490234851837158, "step": 720 }, { "epoch": 0.6411945542380325, "grad_norm": 71.5202434090484, "learning_rate": 8.398506151142355e-07, "logits/chosen": 0.33540040254592896, "logits/rejected": 0.24340209364891052, "logps/chosen": -283.6000061035156, "logps/rejected": -386.04998779296875, "loss": 0.4855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.35863035917282104, "rewards/margins": 1.4435546398162842, "rewards/rejected": -1.8025391101837158, "step": 730 }, { "epoch": 0.6499780412823891, "grad_norm": 85.44754367037828, "learning_rate": 8.376537785588752e-07, "logits/chosen": 0.2869857847690582, "logits/rejected": 0.2581543028354645, "logps/chosen": -283.25, "logps/rejected": -383.8500061035156, "loss": 0.4957, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.529614269733429, "rewards/margins": 1.456152319908142, "rewards/rejected": -1.9851562976837158, "step": 740 }, { "epoch": 0.6587615283267457, "grad_norm": 38.966351252085836, "learning_rate": 8.354569420035149e-07, "logits/chosen": 0.2989257872104645, "logits/rejected": 0.30454713106155396, "logps/chosen": -299.70001220703125, "logps/rejected": -374.29998779296875, "loss": 0.6308, "rewards/accuracies": 0.625, "rewards/chosen": -0.4601074159145355, "rewards/margins": 1.2113037109375, "rewards/rejected": -1.671972632408142, "step": 750 }, { "epoch": 0.6675450153711023, "grad_norm": 52.48282041093325, "learning_rate": 8.332601054481546e-07, "logits/chosen": 0.35906982421875, "logits/rejected": 0.16847534477710724, "logps/chosen": -288.29998779296875, "logps/rejected": -407.29998779296875, "loss": 0.4511, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.07607422024011612, "rewards/margins": 1.3205077648162842, "rewards/rejected": -1.3957030773162842, "step": 760 }, { "epoch": 0.6763285024154589, "grad_norm": 110.75547320097759, "learning_rate": 8.310632688927944e-07, "logits/chosen": 0.384521484375, "logits/rejected": 0.36524659395217896, "logps/chosen": -251.0500030517578, "logps/rejected": -358.8999938964844, "loss": 0.4772, "rewards/accuracies": 0.78125, "rewards/chosen": 0.17294922471046448, "rewards/margins": 1.538671851158142, "rewards/rejected": -1.366601586341858, "step": 770 }, { "epoch": 0.6851119894598156, "grad_norm": 84.84629982844571, "learning_rate": 8.28866432337434e-07, "logits/chosen": 0.3001464903354645, "logits/rejected": 0.24505615234375, "logps/chosen": -254.39999389648438, "logps/rejected": -394.5, "loss": 0.5167, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.089630126953125, "rewards/margins": 1.7184569835662842, "rewards/rejected": -1.8083007335662842, "step": 780 }, { "epoch": 0.6938954765041722, "grad_norm": 41.4181363204328, "learning_rate": 8.266695957820738e-07, "logits/chosen": 0.32066649198532104, "logits/rejected": 0.290374755859375, "logps/chosen": -244.1750030517578, "logps/rejected": -342.29998779296875, "loss": 0.4522, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.08376464992761612, "rewards/margins": 1.42333984375, "rewards/rejected": -1.3399658203125, "step": 790 }, { "epoch": 0.7026789635485288, "grad_norm": 47.25422109587412, "learning_rate": 8.244727592267134e-07, "logits/chosen": 0.2536254823207855, "logits/rejected": 0.12650147080421448, "logps/chosen": -252.85000610351562, "logps/rejected": -357.3500061035156, "loss": 0.5478, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14219971001148224, "rewards/margins": 1.4521973133087158, "rewards/rejected": -1.5940430164337158, "step": 800 }, { "epoch": 0.7114624505928854, "grad_norm": 113.44349673713964, "learning_rate": 8.222759226713533e-07, "logits/chosen": 0.213226318359375, "logits/rejected": 0.170928955078125, "logps/chosen": -249.9499969482422, "logps/rejected": -358.3999938964844, "loss": 0.4937, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.29841309785842896, "rewards/margins": 1.6212890148162842, "rewards/rejected": -1.919824242591858, "step": 810 }, { "epoch": 0.720245937637242, "grad_norm": 68.44720002650934, "learning_rate": 8.200790861159929e-07, "logits/chosen": 0.30836182832717896, "logits/rejected": 0.2982116639614105, "logps/chosen": -290.75, "logps/rejected": -428.3999938964844, "loss": 0.4865, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4919677674770355, "rewards/margins": 1.6305663585662842, "rewards/rejected": -2.1225829124450684, "step": 820 }, { "epoch": 0.7290294246815986, "grad_norm": 88.68484542641849, "learning_rate": 8.178822495606327e-07, "logits/chosen": 0.3367065489292145, "logits/rejected": 0.17885056138038635, "logps/chosen": -240.85000610351562, "logps/rejected": -348.1000061035156, "loss": 0.4998, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06321410834789276, "rewards/margins": 1.385961890220642, "rewards/rejected": -1.4510924816131592, "step": 830 }, { "epoch": 0.7378129117259552, "grad_norm": 113.35940696341727, "learning_rate": 8.156854130052724e-07, "logits/chosen": 0.355804443359375, "logits/rejected": 0.2767272889614105, "logps/chosen": -259.20001220703125, "logps/rejected": -355.04998779296875, "loss": 0.5223, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.19244384765625, "rewards/margins": 1.162139892578125, "rewards/rejected": -0.9696594476699829, "step": 840 }, { "epoch": 0.7465963987703118, "grad_norm": 50.707008909919395, "learning_rate": 8.134885764499121e-07, "logits/chosen": 0.3296661376953125, "logits/rejected": 0.345114141702652, "logps/chosen": -249.0500030517578, "logps/rejected": -331.70001220703125, "loss": 0.5292, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.17262573540210724, "rewards/margins": 1.0563232898712158, "rewards/rejected": -0.8844238519668579, "step": 850 }, { "epoch": 0.7553798858146684, "grad_norm": 54.51345618787359, "learning_rate": 8.112917398945518e-07, "logits/chosen": 0.3970222473144531, "logits/rejected": 0.28850096464157104, "logps/chosen": -235.375, "logps/rejected": -350.20001220703125, "loss": 0.4415, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.2743164002895355, "rewards/margins": 1.4638671875, "rewards/rejected": -1.18896484375, "step": 860 }, { "epoch": 0.764163372859025, "grad_norm": 96.10012867301216, "learning_rate": 8.090949033391915e-07, "logits/chosen": 0.22058716416358948, "logits/rejected": 0.21052245795726776, "logps/chosen": -262.7749938964844, "logps/rejected": -361.5, "loss": 0.5722, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3714843690395355, "rewards/margins": 1.1228516101837158, "rewards/rejected": -1.494042992591858, "step": 870 }, { "epoch": 0.7729468599033816, "grad_norm": 73.68132274237811, "learning_rate": 8.068980667838312e-07, "logits/chosen": 0.3888610899448395, "logits/rejected": 0.167724609375, "logps/chosen": -230.10000610351562, "logps/rejected": -372.1000061035156, "loss": 0.6914, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.45307618379592896, "rewards/margins": 1.407446265220642, "rewards/rejected": -1.8585937023162842, "step": 880 }, { "epoch": 0.7817303469477382, "grad_norm": 61.616181332759744, "learning_rate": 8.04701230228471e-07, "logits/chosen": 0.32768553495407104, "logits/rejected": 0.23428955674171448, "logps/chosen": -258.875, "logps/rejected": -369.8999938964844, "loss": 0.4235, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.23674316704273224, "rewards/margins": 1.5515625476837158, "rewards/rejected": -1.7898437976837158, "step": 890 }, { "epoch": 0.7905138339920948, "grad_norm": 76.39156801678632, "learning_rate": 8.025043936731107e-07, "logits/chosen": 0.3125244081020355, "logits/rejected": 0.17890015244483948, "logps/chosen": -264.70001220703125, "logps/rejected": -354.29998779296875, "loss": 0.5575, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.614410400390625, "rewards/margins": 1.3477051258087158, "rewards/rejected": -1.962011694908142, "step": 900 }, { "epoch": 0.7992973210364515, "grad_norm": 88.12673401140376, "learning_rate": 8.003075571177505e-07, "logits/chosen": 0.3246215879917145, "logits/rejected": 0.23576660454273224, "logps/chosen": -275.75, "logps/rejected": -369.75, "loss": 0.48, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5657714605331421, "rewards/margins": 1.4255859851837158, "rewards/rejected": -1.992578148841858, "step": 910 }, { "epoch": 0.8080808080808081, "grad_norm": 108.13980345599695, "learning_rate": 7.981107205623901e-07, "logits/chosen": 0.2906738221645355, "logits/rejected": 0.23848876357078552, "logps/chosen": -299.04998779296875, "logps/rejected": -371.8999938964844, "loss": 0.4856, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.6021728515625, "rewards/margins": 1.4636719226837158, "rewards/rejected": -2.0638670921325684, "step": 920 }, { "epoch": 0.8168642951251647, "grad_norm": 61.37192073111895, "learning_rate": 7.959138840070299e-07, "logits/chosen": 0.2937072813510895, "logits/rejected": 0.23980101943016052, "logps/chosen": -290.1499938964844, "logps/rejected": -415.1499938964844, "loss": 0.4251, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.4466308653354645, "rewards/margins": 1.7791016101837158, "rewards/rejected": -2.223437547683716, "step": 930 }, { "epoch": 0.8256477821695213, "grad_norm": 40.26464128196594, "learning_rate": 7.937170474516695e-07, "logits/chosen": 0.2821044921875, "logits/rejected": 0.28074342012405396, "logps/chosen": -248.39999389648438, "logps/rejected": -361.70001220703125, "loss": 0.4398, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.28559571504592896, "rewards/margins": 1.7460448741912842, "rewards/rejected": -2.0326170921325684, "step": 940 }, { "epoch": 0.8344312692138779, "grad_norm": 53.77019636081402, "learning_rate": 7.915202108963093e-07, "logits/chosen": 0.34534913301467896, "logits/rejected": 0.2879882752895355, "logps/chosen": -279.8500061035156, "logps/rejected": -396.70001220703125, "loss": 0.4873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.526867687702179, "rewards/margins": 1.556249976158142, "rewards/rejected": -2.08203125, "step": 950 }, { "epoch": 0.8432147562582345, "grad_norm": 75.72796662497927, "learning_rate": 7.89323374340949e-07, "logits/chosen": 0.27079468965530396, "logits/rejected": 0.21390075981616974, "logps/chosen": -275.375, "logps/rejected": -388.45001220703125, "loss": 0.5787, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.489990234375, "rewards/margins": 1.2587401866912842, "rewards/rejected": -1.7498047351837158, "step": 960 }, { "epoch": 0.8519982433025911, "grad_norm": 67.90907046528764, "learning_rate": 7.871265377855887e-07, "logits/chosen": 0.314453125, "logits/rejected": 0.22438964247703552, "logps/chosen": -248.5, "logps/rejected": -390.75, "loss": 0.5109, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3794006407260895, "rewards/margins": 1.437109351158142, "rewards/rejected": -1.8166992664337158, "step": 970 }, { "epoch": 0.8607817303469477, "grad_norm": 90.56077429898126, "learning_rate": 7.849297012302284e-07, "logits/chosen": 0.3433593809604645, "logits/rejected": 0.3183837831020355, "logps/chosen": -286.1499938964844, "logps/rejected": -375.95001220703125, "loss": 0.4593, "rewards/accuracies": 0.78125, "rewards/chosen": -0.20551757514476776, "rewards/margins": 1.37744140625, "rewards/rejected": -1.582421898841858, "step": 980 }, { "epoch": 0.8695652173913043, "grad_norm": 92.73388781228336, "learning_rate": 7.827328646748682e-07, "logits/chosen": 0.306906133890152, "logits/rejected": 0.29322510957717896, "logps/chosen": -286.95001220703125, "logps/rejected": -401.70001220703125, "loss": 0.5537, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.24338379502296448, "rewards/margins": 1.2459716796875, "rewards/rejected": -1.49072265625, "step": 990 }, { "epoch": 0.8783487044356609, "grad_norm": 57.50791934147996, "learning_rate": 7.805360281195079e-07, "logits/chosen": 0.29716795682907104, "logits/rejected": 0.15861816704273224, "logps/chosen": -282.79998779296875, "logps/rejected": -429.8500061035156, "loss": 0.4068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10789795219898224, "rewards/margins": 1.935156226158142, "rewards/rejected": -2.041796922683716, "step": 1000 }, { "epoch": 0.8871321914800175, "grad_norm": 66.39139310646738, "learning_rate": 7.783391915641477e-07, "logits/chosen": 0.36488646268844604, "logits/rejected": 0.22176513075828552, "logps/chosen": -261.7250061035156, "logps/rejected": -345.1000061035156, "loss": 0.5196, "rewards/accuracies": 0.75, "rewards/chosen": -0.14448241889476776, "rewards/margins": 1.4163086414337158, "rewards/rejected": -1.55908203125, "step": 1010 }, { "epoch": 0.8959156785243741, "grad_norm": 47.57616796025895, "learning_rate": 7.761423550087873e-07, "logits/chosen": 0.3269409239292145, "logits/rejected": 0.247894287109375, "logps/chosen": -293.3500061035156, "logps/rejected": -434.54998779296875, "loss": 0.5313, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3236328065395355, "rewards/margins": 1.5017578601837158, "rewards/rejected": -1.826562523841858, "step": 1020 }, { "epoch": 0.9046991655687308, "grad_norm": 72.34948021171432, "learning_rate": 7.739455184534271e-07, "logits/chosen": 0.2937866151332855, "logits/rejected": 0.27376556396484375, "logps/chosen": -309.3999938964844, "logps/rejected": -424.3999938964844, "loss": 0.4566, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.45698243379592896, "rewards/margins": 1.636816382408142, "rewards/rejected": -2.094531297683716, "step": 1030 }, { "epoch": 0.9134826526130874, "grad_norm": 153.4455733354812, "learning_rate": 7.717486818980667e-07, "logits/chosen": 0.36506348848342896, "logits/rejected": 0.23938599228858948, "logps/chosen": -245.5500030517578, "logps/rejected": -364.75, "loss": 0.5096, "rewards/accuracies": 0.75, "rewards/chosen": -0.30021971464157104, "rewards/margins": 1.5763671398162842, "rewards/rejected": -1.8757812976837158, "step": 1040 }, { "epoch": 0.922266139657444, "grad_norm": 63.77775905264279, "learning_rate": 7.695518453427065e-07, "logits/chosen": 0.330221563577652, "logits/rejected": 0.34270018339157104, "logps/chosen": -233.6999969482422, "logps/rejected": -395.54998779296875, "loss": 0.4914, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.14591065049171448, "rewards/margins": 1.64208984375, "rewards/rejected": -1.7861328125, "step": 1050 }, { "epoch": 0.9310496267018006, "grad_norm": 98.73010490009788, "learning_rate": 7.673550087873461e-07, "logits/chosen": 0.393585205078125, "logits/rejected": 0.19044189155101776, "logps/chosen": -297.125, "logps/rejected": -414.8999938964844, "loss": 0.5291, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3367981016635895, "rewards/margins": 1.486669898033142, "rewards/rejected": -1.82421875, "step": 1060 }, { "epoch": 0.9398331137461572, "grad_norm": 71.08329804883088, "learning_rate": 7.651581722319859e-07, "logits/chosen": 0.36182862520217896, "logits/rejected": 0.28608399629592896, "logps/chosen": -233.8249969482422, "logps/rejected": -381.8999938964844, "loss": 0.4808, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14846190810203552, "rewards/margins": 1.712890625, "rewards/rejected": -1.861914038658142, "step": 1070 }, { "epoch": 0.9486166007905138, "grad_norm": 63.39663110726449, "learning_rate": 7.629613356766256e-07, "logits/chosen": 0.4180908203125, "logits/rejected": 0.4419189393520355, "logps/chosen": -257.3999938964844, "logps/rejected": -355.20001220703125, "loss": 0.472, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.005963134579360485, "rewards/margins": 1.429101586341858, "rewards/rejected": -1.4344818592071533, "step": 1080 }, { "epoch": 0.9574000878348704, "grad_norm": 90.29584133199342, "learning_rate": 7.607644991212654e-07, "logits/chosen": 0.478271484375, "logits/rejected": 0.4160522520542145, "logps/chosen": -264.54998779296875, "logps/rejected": -395.75, "loss": 0.4188, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13417358696460724, "rewards/margins": 1.6453125476837158, "rewards/rejected": -1.7797362804412842, "step": 1090 }, { "epoch": 0.966183574879227, "grad_norm": 56.11268415384194, "learning_rate": 7.585676625659051e-07, "logits/chosen": 0.39582520723342896, "logits/rejected": 0.35059815645217896, "logps/chosen": -280.79998779296875, "logps/rejected": -407.0, "loss": 0.4903, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.008679199032485485, "rewards/margins": 1.618554711341858, "rewards/rejected": -1.626953125, "step": 1100 }, { "epoch": 0.9749670619235836, "grad_norm": 80.80024763861566, "learning_rate": 7.563708260105448e-07, "logits/chosen": 0.3339294493198395, "logits/rejected": 0.24503478407859802, "logps/chosen": -257.0, "logps/rejected": -392.8500061035156, "loss": 0.4714, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12353515625, "rewards/margins": 1.8201415538787842, "rewards/rejected": -1.941992163658142, "step": 1110 }, { "epoch": 0.9837505489679402, "grad_norm": 83.13024725691754, "learning_rate": 7.541739894551845e-07, "logits/chosen": 0.31227415800094604, "logits/rejected": 0.26469725370407104, "logps/chosen": -262.92498779296875, "logps/rejected": -341.3999938964844, "loss": 0.4506, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0002563476446084678, "rewards/margins": 1.28564453125, "rewards/rejected": -1.2862427234649658, "step": 1120 }, { "epoch": 0.9925340360122968, "grad_norm": 117.67159347513096, "learning_rate": 7.519771528998243e-07, "logits/chosen": 0.18920287489891052, "logits/rejected": 0.2196044921875, "logps/chosen": -301.04998779296875, "logps/rejected": -403.95001220703125, "loss": 0.6184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3028320372104645, "rewards/margins": 1.379492163658142, "rewards/rejected": -1.683496117591858, "step": 1130 }, { "epoch": 1.0008783487044357, "grad_norm": 20.440874192820978, "learning_rate": 7.497803163444639e-07, "logits/chosen": 0.34066611528396606, "logits/rejected": 0.23922890424728394, "logps/chosen": -274.4210510253906, "logps/rejected": -388.4210510253906, "loss": 0.4029, "rewards/accuracies": 0.8026315569877625, "rewards/chosen": 0.026084497570991516, "rewards/margins": 2.0867598056793213, "rewards/rejected": -2.0608551502227783, "step": 1140 }, { "epoch": 1.0096618357487923, "grad_norm": 15.809921052901133, "learning_rate": 7.475834797891037e-07, "logits/chosen": 0.299560546875, "logits/rejected": 0.3299667239189148, "logps/chosen": -246.0, "logps/rejected": -378.45001220703125, "loss": 0.1049, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.887451171875, "rewards/margins": 3.78515625, "rewards/rejected": -2.897265672683716, "step": 1150 }, { "epoch": 1.018445322793149, "grad_norm": 17.066901952935996, "learning_rate": 7.453866432337433e-07, "logits/chosen": 0.19952392578125, "logits/rejected": 0.20375366508960724, "logps/chosen": -220.77499389648438, "logps/rejected": -382.6499938964844, "loss": 0.0935, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6580566167831421, "rewards/margins": 4.216406345367432, "rewards/rejected": -3.555859327316284, "step": 1160 }, { "epoch": 1.0272288098375055, "grad_norm": 17.78025040617728, "learning_rate": 7.431898066783831e-07, "logits/chosen": 0.06718139350414276, "logits/rejected": 0.01658935472369194, "logps/chosen": -263.54998779296875, "logps/rejected": -391.54998779296875, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 0.013354492373764515, "rewards/margins": 5.010937690734863, "rewards/rejected": -5.0, "step": 1170 }, { "epoch": 1.0360122968818621, "grad_norm": 12.529885874001959, "learning_rate": 7.409929701230228e-07, "logits/chosen": -0.04084472730755806, "logits/rejected": 0.0020629882346838713, "logps/chosen": -267.25, "logps/rejected": -425.6000061035156, "loss": 0.0838, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.565747082233429, "rewards/margins": 5.733593940734863, "rewards/rejected": -6.299218654632568, "step": 1180 }, { "epoch": 1.0447957839262187, "grad_norm": 17.033216134908038, "learning_rate": 7.387961335676626e-07, "logits/chosen": 0.07882080227136612, "logits/rejected": -0.08107910305261612, "logps/chosen": -271.07501220703125, "logps/rejected": -429.8999938964844, "loss": 0.071, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6080688238143921, "rewards/margins": 5.735937595367432, "rewards/rejected": -6.346875190734863, "step": 1190 }, { "epoch": 1.0535792709705754, "grad_norm": 26.055487091444334, "learning_rate": 7.365992970123023e-07, "logits/chosen": -0.05823059007525444, "logits/rejected": 0.02796020545065403, "logps/chosen": -275.8999938964844, "logps/rejected": -445.6000061035156, "loss": 0.0729, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.43598634004592896, "rewards/margins": 5.590624809265137, "rewards/rejected": -6.024218559265137, "step": 1200 }, { "epoch": 1.062362758014932, "grad_norm": 15.084125021296416, "learning_rate": 7.34402460456942e-07, "logits/chosen": -0.12774047255516052, "logits/rejected": -0.11796875298023224, "logps/chosen": -267.7749938964844, "logps/rejected": -387.29998779296875, "loss": 0.0691, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.32890623807907104, "rewards/margins": 5.6171875, "rewards/rejected": -5.951562404632568, "step": 1210 }, { "epoch": 1.0711462450592886, "grad_norm": 7.8731257934057615, "learning_rate": 7.322056239015817e-07, "logits/chosen": -0.0428466796875, "logits/rejected": -0.11500243842601776, "logps/chosen": -253.60000610351562, "logps/rejected": -411.8999938964844, "loss": 0.0756, "rewards/accuracies": 0.96875, "rewards/chosen": -0.21232910454273224, "rewards/margins": 5.545312404632568, "rewards/rejected": -5.7578125, "step": 1220 }, { "epoch": 1.0799297321036452, "grad_norm": 15.328425920240678, "learning_rate": 7.300087873462214e-07, "logits/chosen": -0.11522521823644638, "logits/rejected": -0.15517883002758026, "logps/chosen": -286.3999938964844, "logps/rejected": -429.8999938964844, "loss": 0.0799, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7025970220565796, "rewards/margins": 5.014062404632568, "rewards/rejected": -5.717187404632568, "step": 1230 }, { "epoch": 1.0887132191480018, "grad_norm": 16.655009446483806, "learning_rate": 7.278119507908611e-07, "logits/chosen": -0.06629028171300888, "logits/rejected": -0.06909789890050888, "logps/chosen": -284.8500061035156, "logps/rejected": -418.79998779296875, "loss": 0.0742, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.8167694211006165, "rewards/margins": 5.428124904632568, "rewards/rejected": -6.245312690734863, "step": 1240 }, { "epoch": 1.0974967061923584, "grad_norm": 6.528924409084525, "learning_rate": 7.256151142355009e-07, "logits/chosen": -0.25098878145217896, "logits/rejected": -0.34846192598342896, "logps/chosen": -272.79998779296875, "logps/rejected": -459.1499938964844, "loss": 0.0523, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.073828101158142, "rewards/margins": 6.098437309265137, "rewards/rejected": -7.172656059265137, "step": 1250 }, { "epoch": 1.106280193236715, "grad_norm": 11.104439176916763, "learning_rate": 7.234182776801405e-07, "logits/chosen": -0.09501342475414276, "logits/rejected": -0.225067138671875, "logps/chosen": -264.7250061035156, "logps/rejected": -460.0, "loss": 0.0703, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5216796398162842, "rewards/margins": 6.979687690734863, "rewards/rejected": -8.504687309265137, "step": 1260 }, { "epoch": 1.1150636802810716, "grad_norm": 14.68445284352973, "learning_rate": 7.212214411247804e-07, "logits/chosen": -0.18807372450828552, "logits/rejected": -0.3263916075229645, "logps/chosen": -274.5249938964844, "logps/rejected": -439.70001220703125, "loss": 0.0652, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6147949695587158, "rewards/margins": 6.998437404632568, "rewards/rejected": -8.607812881469727, "step": 1270 }, { "epoch": 1.1238471673254282, "grad_norm": 6.756012226566608, "learning_rate": 7.1902460456942e-07, "logits/chosen": -0.17550048232078552, "logits/rejected": -0.19269409775733948, "logps/chosen": -302.9750061035156, "logps/rejected": -418.29998779296875, "loss": 0.0742, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.240014672279358, "rewards/margins": 6.211718559265137, "rewards/rejected": -7.446875095367432, "step": 1280 }, { "epoch": 1.1326306543697848, "grad_norm": 14.056513520133173, "learning_rate": 7.168277680140598e-07, "logits/chosen": -0.08388976752758026, "logits/rejected": -0.12659302353858948, "logps/chosen": -282.17498779296875, "logps/rejected": -415.0, "loss": 0.0676, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.162695288658142, "rewards/margins": 5.2265625, "rewards/rejected": -6.385937690734863, "step": 1290 }, { "epoch": 1.1414141414141414, "grad_norm": 27.124407129659946, "learning_rate": 7.146309314586994e-07, "logits/chosen": 0.07841797173023224, "logits/rejected": -0.06027831882238388, "logps/chosen": -260.54998779296875, "logps/rejected": -397.29998779296875, "loss": 0.0699, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.689013659954071, "rewards/margins": 5.721875190734863, "rewards/rejected": -6.41015625, "step": 1300 }, { "epoch": 1.150197628458498, "grad_norm": 17.60175967974828, "learning_rate": 7.124340949033392e-07, "logits/chosen": 0.02457275427877903, "logits/rejected": -0.05143127590417862, "logps/chosen": -240.97500610351562, "logps/rejected": -374.0, "loss": 0.0722, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.668200671672821, "rewards/margins": 5.4296875, "rewards/rejected": -6.099999904632568, "step": 1310 }, { "epoch": 1.1589811155028547, "grad_norm": 19.65028790170859, "learning_rate": 7.102372583479789e-07, "logits/chosen": 0.009417724795639515, "logits/rejected": -0.14219054579734802, "logps/chosen": -299.95001220703125, "logps/rejected": -482.29998779296875, "loss": 0.0725, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.139184594154358, "rewards/margins": 6.243750095367432, "rewards/rejected": -7.385937690734863, "step": 1320 }, { "epoch": 1.1677646025472113, "grad_norm": 15.368772158984585, "learning_rate": 7.080404217926186e-07, "logits/chosen": -0.13981933891773224, "logits/rejected": -0.17585448920726776, "logps/chosen": -257.3999938964844, "logps/rejected": -418.75, "loss": 0.0715, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0472900867462158, "rewards/margins": 6.157812595367432, "rewards/rejected": -7.207812309265137, "step": 1330 }, { "epoch": 1.1765480895915679, "grad_norm": 1.2120818023261295, "learning_rate": 7.058435852372583e-07, "logits/chosen": -0.21288451552391052, "logits/rejected": -0.23605041205883026, "logps/chosen": -292.25, "logps/rejected": -477.20001220703125, "loss": 0.0678, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.542089819908142, "rewards/margins": 6.553124904632568, "rewards/rejected": -8.09375, "step": 1340 }, { "epoch": 1.1853315766359245, "grad_norm": 11.77363741711944, "learning_rate": 7.03646748681898e-07, "logits/chosen": -0.3181091248989105, "logits/rejected": -0.37370604276657104, "logps/chosen": -273.8500061035156, "logps/rejected": -420.75, "loss": 0.0637, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.913671851158142, "rewards/margins": 6.172656059265137, "rewards/rejected": -8.088281631469727, "step": 1350 }, { "epoch": 1.194115063680281, "grad_norm": 17.52212340805691, "learning_rate": 7.014499121265377e-07, "logits/chosen": -0.38044434785842896, "logits/rejected": -0.35929566621780396, "logps/chosen": -311.45001220703125, "logps/rejected": -441.95001220703125, "loss": 0.0748, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9500000476837158, "rewards/margins": 6.485156059265137, "rewards/rejected": -8.432812690734863, "step": 1360 }, { "epoch": 1.2028985507246377, "grad_norm": 19.977518906613696, "learning_rate": 6.992530755711776e-07, "logits/chosen": -0.13048096001148224, "logits/rejected": -0.333740234375, "logps/chosen": -247.5749969482422, "logps/rejected": -423.8999938964844, "loss": 0.0837, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.012500047683716, "rewards/margins": 6.432812690734863, "rewards/rejected": -8.448437690734863, "step": 1370 }, { "epoch": 1.2116820377689943, "grad_norm": 6.793643051438152, "learning_rate": 6.970562390158172e-07, "logits/chosen": -0.3622192442417145, "logits/rejected": -0.25664061307907104, "logps/chosen": -303.8500061035156, "logps/rejected": -457.79998779296875, "loss": 0.0559, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6754882335662842, "rewards/margins": 6.185937404632568, "rewards/rejected": -7.861718654632568, "step": 1380 }, { "epoch": 1.220465524813351, "grad_norm": 16.509188205886872, "learning_rate": 6.94859402460457e-07, "logits/chosen": -0.20051269233226776, "logits/rejected": -0.19052734971046448, "logps/chosen": -306.29998779296875, "logps/rejected": -516.4000244140625, "loss": 0.0598, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0311706066131592, "rewards/margins": 6.682812690734863, "rewards/rejected": -7.712500095367432, "step": 1390 }, { "epoch": 1.2292490118577075, "grad_norm": 9.69148697623928, "learning_rate": 6.926625659050966e-07, "logits/chosen": -0.2977661192417145, "logits/rejected": -0.30290526151657104, "logps/chosen": -274.54998779296875, "logps/rejected": -412.75, "loss": 0.0551, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1038086414337158, "rewards/margins": 5.912499904632568, "rewards/rejected": -7.01953125, "step": 1400 }, { "epoch": 1.2380324989020641, "grad_norm": 9.123323358093556, "learning_rate": 6.904657293497364e-07, "logits/chosen": -0.01696624793112278, "logits/rejected": -0.2503418028354645, "logps/chosen": -290.1499938964844, "logps/rejected": -455.6000061035156, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7819335460662842, "rewards/margins": 6.560937404632568, "rewards/rejected": -8.340624809265137, "step": 1410 }, { "epoch": 1.2468159859464207, "grad_norm": 9.760602922827763, "learning_rate": 6.88268892794376e-07, "logits/chosen": -0.3173584043979645, "logits/rejected": -0.30096131563186646, "logps/chosen": -296.25, "logps/rejected": -451.5, "loss": 0.0624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1832642555236816, "rewards/margins": 6.259375095367432, "rewards/rejected": -8.443750381469727, "step": 1420 }, { "epoch": 1.2555994729907773, "grad_norm": 18.754482321447604, "learning_rate": 6.860720562390158e-07, "logits/chosen": -0.33122557401657104, "logits/rejected": -0.3621826171875, "logps/chosen": -287.6499938964844, "logps/rejected": -410.3999938964844, "loss": 0.0538, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.823339819908142, "rewards/margins": 6.673437595367432, "rewards/rejected": -8.498437881469727, "step": 1430 }, { "epoch": 1.264382960035134, "grad_norm": 11.343588628478456, "learning_rate": 6.838752196836555e-07, "logits/chosen": -0.3287719786167145, "logits/rejected": -0.34299927949905396, "logps/chosen": -269.1499938964844, "logps/rejected": -423.29998779296875, "loss": 0.0715, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4979248046875, "rewards/margins": 6.659375190734863, "rewards/rejected": -8.168749809265137, "step": 1440 }, { "epoch": 1.2731664470794906, "grad_norm": 24.617025557603018, "learning_rate": 6.816783831282952e-07, "logits/chosen": -0.22059936821460724, "logits/rejected": -0.19501495361328125, "logps/chosen": -278.5625, "logps/rejected": -441.0, "loss": 0.0894, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5487792491912842, "rewards/margins": 6.139843940734863, "rewards/rejected": -7.693749904632568, "step": 1450 }, { "epoch": 1.2819499341238472, "grad_norm": 13.83317928559678, "learning_rate": 6.79481546572935e-07, "logits/chosen": -0.09210205078125, "logits/rejected": -0.3416992127895355, "logps/chosen": -249.6999969482422, "logps/rejected": -409.1000061035156, "loss": 0.0712, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.244042992591858, "rewards/margins": 6.108593940734863, "rewards/rejected": -7.348437309265137, "step": 1460 }, { "epoch": 1.2907334211682038, "grad_norm": 21.344121421336983, "learning_rate": 6.772847100175747e-07, "logits/chosen": -0.33955079317092896, "logits/rejected": -0.423126220703125, "logps/chosen": -273.6499938964844, "logps/rejected": -420.1000061035156, "loss": 0.0682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8367187976837158, "rewards/margins": 5.910937309265137, "rewards/rejected": -7.748437404632568, "step": 1470 }, { "epoch": 1.2995169082125604, "grad_norm": 9.303741065361331, "learning_rate": 6.750878734622144e-07, "logits/chosen": -0.22593994438648224, "logits/rejected": -0.38630372285842896, "logps/chosen": -317.70001220703125, "logps/rejected": -511.29998779296875, "loss": 0.0487, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.328320264816284, "rewards/margins": 7.346875190734863, "rewards/rejected": -9.670312881469727, "step": 1480 }, { "epoch": 1.308300395256917, "grad_norm": 5.4591998485492015, "learning_rate": 6.728910369068542e-07, "logits/chosen": -0.07097168266773224, "logits/rejected": -0.3295226991176605, "logps/chosen": -278.5, "logps/rejected": -463.70001220703125, "loss": 0.0579, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.21484375, "rewards/margins": 7.603125095367432, "rewards/rejected": -9.8203125, "step": 1490 }, { "epoch": 1.3170838823012736, "grad_norm": 34.7390373457084, "learning_rate": 6.706942003514938e-07, "logits/chosen": -0.3793701231479645, "logits/rejected": -0.4449218809604645, "logps/chosen": -277.3999938964844, "logps/rejected": -409.6499938964844, "loss": 0.0844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.756982445716858, "rewards/margins": 6.83203125, "rewards/rejected": -8.6015625, "step": 1500 }, { "epoch": 1.3258673693456302, "grad_norm": 33.20650659108437, "learning_rate": 6.684973637961336e-07, "logits/chosen": -0.20389404892921448, "logits/rejected": -0.2959960997104645, "logps/chosen": -280.5, "logps/rejected": -499.04998779296875, "loss": 0.0598, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6301758289337158, "rewards/margins": 7.534375190734863, "rewards/rejected": -9.168749809265137, "step": 1510 }, { "epoch": 1.3346508563899868, "grad_norm": 28.559561108587047, "learning_rate": 6.663005272407732e-07, "logits/chosen": -0.24488525092601776, "logits/rejected": -0.27906495332717896, "logps/chosen": -302.125, "logps/rejected": -477.8999938964844, "loss": 0.0449, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.461035132408142, "rewards/margins": 6.6796875, "rewards/rejected": -8.142187118530273, "step": 1520 }, { "epoch": 1.3434343434343434, "grad_norm": 2.216012422365027, "learning_rate": 6.64103690685413e-07, "logits/chosen": -0.30253905057907104, "logits/rejected": -0.3541809022426605, "logps/chosen": -266.6000061035156, "logps/rejected": -436.25, "loss": 0.0643, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.0862305164337158, "rewards/margins": 6.129687309265137, "rewards/rejected": -7.228125095367432, "step": 1530 }, { "epoch": 1.3522178304787, "grad_norm": 21.825386855829453, "learning_rate": 6.619068541300526e-07, "logits/chosen": -0.20488281548023224, "logits/rejected": -0.2882018983364105, "logps/chosen": -297.1499938964844, "logps/rejected": -477.70001220703125, "loss": 0.068, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1096923351287842, "rewards/margins": 6.871874809265137, "rewards/rejected": -7.978125095367432, "step": 1540 }, { "epoch": 1.3610013175230566, "grad_norm": 7.084091279178667, "learning_rate": 6.597100175746925e-07, "logits/chosen": -0.20550537109375, "logits/rejected": -0.35065919160842896, "logps/chosen": -281.5, "logps/rejected": -478.8999938964844, "loss": 0.0685, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.453125, "rewards/margins": 6.81640625, "rewards/rejected": -8.271875381469727, "step": 1550 }, { "epoch": 1.3697848045674132, "grad_norm": 11.9527319002209, "learning_rate": 6.575131810193322e-07, "logits/chosen": -0.36662596464157104, "logits/rejected": -0.4238296449184418, "logps/chosen": -275.1000061035156, "logps/rejected": -451.70001220703125, "loss": 0.0666, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.51806640625, "rewards/margins": 6.529687404632568, "rewards/rejected": -8.045312881469727, "step": 1560 }, { "epoch": 1.3785682916117699, "grad_norm": 35.407819790700955, "learning_rate": 6.553163444639719e-07, "logits/chosen": -0.20588989555835724, "logits/rejected": -0.29914551973342896, "logps/chosen": -245.1999969482422, "logps/rejected": -407.20001220703125, "loss": 0.0746, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.448602318763733, "rewards/margins": 6.22265625, "rewards/rejected": -7.673437595367432, "step": 1570 }, { "epoch": 1.3873517786561265, "grad_norm": 20.950088886265757, "learning_rate": 6.531195079086116e-07, "logits/chosen": -0.19101563096046448, "logits/rejected": -0.35041505098342896, "logps/chosen": -269.29998779296875, "logps/rejected": -445.5, "loss": 0.086, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5352904796600342, "rewards/margins": 6.359375, "rewards/rejected": -7.893750190734863, "step": 1580 }, { "epoch": 1.396135265700483, "grad_norm": 23.91004471788852, "learning_rate": 6.509226713532513e-07, "logits/chosen": -0.326019287109375, "logits/rejected": -0.3315490782260895, "logps/chosen": -303.54998779296875, "logps/rejected": -474.5, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -1.564208984375, "rewards/margins": 6.353125095367432, "rewards/rejected": -7.920312404632568, "step": 1590 }, { "epoch": 1.4049187527448397, "grad_norm": 8.6811438075454, "learning_rate": 6.48725834797891e-07, "logits/chosen": -0.08583831787109375, "logits/rejected": -0.24260254204273224, "logps/chosen": -279.25, "logps/rejected": -483.54998779296875, "loss": 0.0536, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0837891101837158, "rewards/margins": 6.376562595367432, "rewards/rejected": -7.454687595367432, "step": 1600 }, { "epoch": 1.4137022397891963, "grad_norm": 37.85515362014508, "learning_rate": 6.465289982425308e-07, "logits/chosen": -0.17780761420726776, "logits/rejected": -0.2599243223667145, "logps/chosen": -254.25, "logps/rejected": -435.6499938964844, "loss": 0.075, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4656250476837158, "rewards/margins": 6.319531440734863, "rewards/rejected": -7.785937309265137, "step": 1610 }, { "epoch": 1.422485726833553, "grad_norm": 64.75795808453968, "learning_rate": 6.443321616871704e-07, "logits/chosen": -0.19268798828125, "logits/rejected": -0.21155396103858948, "logps/chosen": -250.10000610351562, "logps/rejected": -426.6499938964844, "loss": 0.1135, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.054101586341858, "rewards/margins": 6.091406345367432, "rewards/rejected": -7.142968654632568, "step": 1620 }, { "epoch": 1.4312692138779095, "grad_norm": 54.0595271885039, "learning_rate": 6.421353251318102e-07, "logits/chosen": -0.10397949069738388, "logits/rejected": -0.25519102811813354, "logps/chosen": -310.75, "logps/rejected": -483.20001220703125, "loss": 0.048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1675841808319092, "rewards/margins": 6.8046875, "rewards/rejected": -7.9765625, "step": 1630 }, { "epoch": 1.4400527009222661, "grad_norm": 32.12921523240987, "learning_rate": 6.399384885764498e-07, "logits/chosen": -0.19061279296875, "logits/rejected": -0.19354248046875, "logps/chosen": -265.32501220703125, "logps/rejected": -440.70001220703125, "loss": 0.0822, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.864086925983429, "rewards/margins": 6.203906059265137, "rewards/rejected": -7.064062595367432, "step": 1640 }, { "epoch": 1.4488361879666227, "grad_norm": 6.279056749705282, "learning_rate": 6.377416520210897e-07, "logits/chosen": -0.1363983154296875, "logits/rejected": -0.19752807915210724, "logps/chosen": -254.39999389648438, "logps/rejected": -429.45001220703125, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": -0.6716552972793579, "rewards/margins": 6.310937404632568, "rewards/rejected": -6.979687690734863, "step": 1650 }, { "epoch": 1.4576196750109793, "grad_norm": 9.780437624302703, "learning_rate": 6.355448154657293e-07, "logits/chosen": -0.22934570908546448, "logits/rejected": -0.3731933534145355, "logps/chosen": -289.42498779296875, "logps/rejected": -454.75, "loss": 0.0554, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7667175531387329, "rewards/margins": 6.532812595367432, "rewards/rejected": -7.301562309265137, "step": 1660 }, { "epoch": 1.466403162055336, "grad_norm": 41.19869957130925, "learning_rate": 6.333479789103691e-07, "logits/chosen": -0.34282225370407104, "logits/rejected": -0.360595703125, "logps/chosen": -280.20001220703125, "logps/rejected": -443.54998779296875, "loss": 0.0839, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8349243402481079, "rewards/margins": 6.296875, "rewards/rejected": -7.128125190734863, "step": 1670 }, { "epoch": 1.4751866490996925, "grad_norm": 27.512796407371994, "learning_rate": 6.311511423550088e-07, "logits/chosen": -0.208099365234375, "logits/rejected": -0.33576661348342896, "logps/chosen": -266.1000061035156, "logps/rejected": -430.3999938964844, "loss": 0.0891, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1465332508087158, "rewards/margins": 6.453125, "rewards/rejected": -7.610156059265137, "step": 1680 }, { "epoch": 1.4839701361440492, "grad_norm": 17.634359606766974, "learning_rate": 6.289543057996485e-07, "logits/chosen": -0.21291503310203552, "logits/rejected": -0.26719969511032104, "logps/chosen": -316.67498779296875, "logps/rejected": -482.20001220703125, "loss": 0.0448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.870410144329071, "rewards/margins": 6.216406345367432, "rewards/rejected": -7.090624809265137, "step": 1690 }, { "epoch": 1.4927536231884058, "grad_norm": 10.72599781964671, "learning_rate": 6.267574692442882e-07, "logits/chosen": -0.18828125298023224, "logits/rejected": -0.2911376953125, "logps/chosen": -318.3999938964844, "logps/rejected": -459.1000061035156, "loss": 0.0476, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1233336925506592, "rewards/margins": 6.271093845367432, "rewards/rejected": -7.395312309265137, "step": 1700 }, { "epoch": 1.5015371102327624, "grad_norm": 4.86877982176947, "learning_rate": 6.245606326889279e-07, "logits/chosen": -0.230621337890625, "logits/rejected": -0.3639160096645355, "logps/chosen": -278.7250061035156, "logps/rejected": -430.8500061035156, "loss": 0.0492, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.858154296875, "rewards/margins": 6.800000190734863, "rewards/rejected": -7.659375190734863, "step": 1710 }, { "epoch": 1.510320597277119, "grad_norm": 10.3103680423214, "learning_rate": 6.223637961335676e-07, "logits/chosen": -0.13945922255516052, "logits/rejected": -0.33256834745407104, "logps/chosen": -264.5249938964844, "logps/rejected": -442.20001220703125, "loss": 0.0514, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.317968726158142, "rewards/margins": 6.515625, "rewards/rejected": -7.829687595367432, "step": 1720 }, { "epoch": 1.5191040843214756, "grad_norm": 33.30831896280916, "learning_rate": 6.201669595782074e-07, "logits/chosen": -0.3582519590854645, "logits/rejected": -0.4204345643520355, "logps/chosen": -266.6499938964844, "logps/rejected": -442.79998779296875, "loss": 0.0927, "rewards/accuracies": 0.96875, "rewards/chosen": -1.393164038658142, "rewards/margins": 6.807812690734863, "rewards/rejected": -8.203125, "step": 1730 }, { "epoch": 1.5278875713658322, "grad_norm": 10.48831295198915, "learning_rate": 6.179701230228471e-07, "logits/chosen": -0.2739151120185852, "logits/rejected": -0.3834228515625, "logps/chosen": -291.04998779296875, "logps/rejected": -460.0, "loss": 0.0401, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5134766101837158, "rewards/margins": 6.8125, "rewards/rejected": -8.324999809265137, "step": 1740 }, { "epoch": 1.5366710584101888, "grad_norm": 28.44790537655264, "learning_rate": 6.157732864674869e-07, "logits/chosen": -0.263681024312973, "logits/rejected": -0.31634521484375, "logps/chosen": -272.6000061035156, "logps/rejected": -409.6000061035156, "loss": 0.0464, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.259130835533142, "rewards/margins": 6.196093559265137, "rewards/rejected": -7.453125, "step": 1750 }, { "epoch": 1.5454545454545454, "grad_norm": 12.1783606941607, "learning_rate": 6.135764499121265e-07, "logits/chosen": -0.23885802924633026, "logits/rejected": -0.21274414658546448, "logps/chosen": -271.6499938964844, "logps/rejected": -478.5, "loss": 0.0677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.491479516029358, "rewards/margins": 6.487500190734863, "rewards/rejected": -7.9765625, "step": 1760 }, { "epoch": 1.554238032498902, "grad_norm": 25.760898101192193, "learning_rate": 6.113796133567663e-07, "logits/chosen": -0.23751983046531677, "logits/rejected": -0.3150634765625, "logps/chosen": -283.5, "logps/rejected": -401.25, "loss": 0.064, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0866577625274658, "rewards/margins": 6.682812690734863, "rewards/rejected": -7.771874904632568, "step": 1770 }, { "epoch": 1.5630215195432586, "grad_norm": 21.83874295358568, "learning_rate": 6.091827768014059e-07, "logits/chosen": -0.291107177734375, "logits/rejected": -0.28076171875, "logps/chosen": -260.0249938964844, "logps/rejected": -406.29998779296875, "loss": 0.1179, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2273437976837158, "rewards/margins": 6.453125, "rewards/rejected": -7.676562309265137, "step": 1780 }, { "epoch": 1.5718050065876152, "grad_norm": 28.017355409897075, "learning_rate": 6.069859402460457e-07, "logits/chosen": -0.07996825873851776, "logits/rejected": -0.266937255859375, "logps/chosen": -274.17498779296875, "logps/rejected": -441.6000061035156, "loss": 0.0409, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6071288585662842, "rewards/margins": 6.893750190734863, "rewards/rejected": -8.499218940734863, "step": 1790 }, { "epoch": 1.5805884936319718, "grad_norm": 19.21711722246913, "learning_rate": 6.047891036906854e-07, "logits/chosen": -0.29957884550094604, "logits/rejected": -0.3034912049770355, "logps/chosen": -268.8500061035156, "logps/rejected": -481.20001220703125, "loss": 0.0596, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0208983421325684, "rewards/margins": 7.490624904632568, "rewards/rejected": -9.5, "step": 1800 }, { "epoch": 1.5893719806763285, "grad_norm": 41.33941602545988, "learning_rate": 6.025922671353251e-07, "logits/chosen": -0.33576661348342896, "logits/rejected": -0.39427489042282104, "logps/chosen": -281.95001220703125, "logps/rejected": -450.6000061035156, "loss": 0.0732, "rewards/accuracies": 0.96875, "rewards/chosen": -2.297607421875, "rewards/margins": 6.764062404632568, "rewards/rejected": -9.067187309265137, "step": 1810 }, { "epoch": 1.598155467720685, "grad_norm": 43.17272331675636, "learning_rate": 6.003954305799648e-07, "logits/chosen": -0.29521483182907104, "logits/rejected": -0.38325196504592896, "logps/chosen": -298.95001220703125, "logps/rejected": -505.3999938964844, "loss": 0.0642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0953125953674316, "rewards/margins": 7.699999809265137, "rewards/rejected": -9.787500381469727, "step": 1820 }, { "epoch": 1.6069389547650417, "grad_norm": 10.110563653921048, "learning_rate": 5.981985940246046e-07, "logits/chosen": -0.246337890625, "logits/rejected": -0.3450073301792145, "logps/chosen": -284.25, "logps/rejected": -461.6000061035156, "loss": 0.0527, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.784277319908142, "rewards/margins": 7.279687404632568, "rewards/rejected": -9.059374809265137, "step": 1830 }, { "epoch": 1.6157224418093983, "grad_norm": 30.75240525883354, "learning_rate": 5.960017574692443e-07, "logits/chosen": -0.17563477158546448, "logits/rejected": -0.26221925020217896, "logps/chosen": -248.35000610351562, "logps/rejected": -444.20001220703125, "loss": 0.0803, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.427099585533142, "rewards/margins": 6.821875095367432, "rewards/rejected": -8.256250381469727, "step": 1840 }, { "epoch": 1.6245059288537549, "grad_norm": 11.195555020626573, "learning_rate": 5.938049209138841e-07, "logits/chosen": -0.2042236328125, "logits/rejected": -0.25086671113967896, "logps/chosen": -290.07501220703125, "logps/rejected": -421.6499938964844, "loss": 0.0915, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4860351085662842, "rewards/margins": 6.305468559265137, "rewards/rejected": -7.7890625, "step": 1850 }, { "epoch": 1.6332894158981115, "grad_norm": 15.296352579591929, "learning_rate": 5.916080843585237e-07, "logits/chosen": -0.22900390625, "logits/rejected": -0.242828369140625, "logps/chosen": -294.7749938964844, "logps/rejected": -469.8999938964844, "loss": 0.0659, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8419921398162842, "rewards/margins": 6.643750190734863, "rewards/rejected": -8.478124618530273, "step": 1860 }, { "epoch": 1.642072902942468, "grad_norm": 29.76007000526832, "learning_rate": 5.894112478031635e-07, "logits/chosen": -0.13191528618335724, "logits/rejected": -0.22902831435203552, "logps/chosen": -276.20001220703125, "logps/rejected": -463.3999938964844, "loss": 0.0975, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8875000476837158, "rewards/margins": 6.528124809265137, "rewards/rejected": -8.420312881469727, "step": 1870 }, { "epoch": 1.6508563899868247, "grad_norm": 16.976895262740946, "learning_rate": 5.872144112478031e-07, "logits/chosen": -0.16134949028491974, "logits/rejected": -0.2365264892578125, "logps/chosen": -282.6499938964844, "logps/rejected": -441.3999938964844, "loss": 0.0605, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9391601085662842, "rewards/margins": 6.387499809265137, "rewards/rejected": -8.321874618530273, "step": 1880 }, { "epoch": 1.6596398770311813, "grad_norm": 9.943306364203947, "learning_rate": 5.850175746924429e-07, "logits/chosen": -0.09112854301929474, "logits/rejected": -0.13474121689796448, "logps/chosen": -263.375, "logps/rejected": -433.79998779296875, "loss": 0.0876, "rewards/accuracies": 0.96875, "rewards/chosen": -1.57476806640625, "rewards/margins": 6.479687690734863, "rewards/rejected": -8.0546875, "step": 1890 }, { "epoch": 1.668423364075538, "grad_norm": 12.512193753735103, "learning_rate": 5.828207381370825e-07, "logits/chosen": -0.19772949814796448, "logits/rejected": -0.21556396782398224, "logps/chosen": -272.8999938964844, "logps/rejected": -456.20001220703125, "loss": 0.0832, "rewards/accuracies": 0.96875, "rewards/chosen": -1.587133765220642, "rewards/margins": 6.170312404632568, "rewards/rejected": -7.748437404632568, "step": 1900 }, { "epoch": 1.6772068511198945, "grad_norm": 29.218015851485863, "learning_rate": 5.806239015817222e-07, "logits/chosen": -0.21563720703125, "logits/rejected": -0.2637573182582855, "logps/chosen": -291.04998779296875, "logps/rejected": -438.75, "loss": 0.0448, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.501135230064392, "rewards/margins": 6.9609375, "rewards/rejected": -8.475000381469727, "step": 1910 }, { "epoch": 1.6859903381642511, "grad_norm": 17.379628164789032, "learning_rate": 5.78427065026362e-07, "logits/chosen": -0.15725097060203552, "logits/rejected": -0.29433900117874146, "logps/chosen": -271.2250061035156, "logps/rejected": -458.0, "loss": 0.0636, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9705078601837158, "rewards/margins": 6.868750095367432, "rewards/rejected": -8.8515625, "step": 1920 }, { "epoch": 1.6947738252086078, "grad_norm": 12.143153502900619, "learning_rate": 5.762302284710018e-07, "logits/chosen": -0.11627197265625, "logits/rejected": -0.28874510526657104, "logps/chosen": -282.20001220703125, "logps/rejected": -454.70001220703125, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -2.0650391578674316, "rewards/margins": 7.171875, "rewards/rejected": -9.232812881469727, "step": 1930 }, { "epoch": 1.7035573122529644, "grad_norm": 8.110903038749546, "learning_rate": 5.740333919156415e-07, "logits/chosen": -0.21812744438648224, "logits/rejected": -0.29472655057907104, "logps/chosen": -316.2749938964844, "logps/rejected": -510.20001220703125, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3316407203674316, "rewards/margins": 7.743750095367432, "rewards/rejected": -10.082812309265137, "step": 1940 }, { "epoch": 1.712340799297321, "grad_norm": 22.519635032203876, "learning_rate": 5.718365553602812e-07, "logits/chosen": -0.05684814602136612, "logits/rejected": -0.272552490234375, "logps/chosen": -278.8999938964844, "logps/rejected": -457.5, "loss": 0.05, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6071288585662842, "rewards/margins": 7.160937309265137, "rewards/rejected": -8.760937690734863, "step": 1950 }, { "epoch": 1.7211242863416776, "grad_norm": 19.420131360558422, "learning_rate": 5.696397188049209e-07, "logits/chosen": -0.30647581815719604, "logits/rejected": -0.23797607421875, "logps/chosen": -328.0, "logps/rejected": -463.29998779296875, "loss": 0.0632, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.24365234375, "rewards/margins": 7.088281154632568, "rewards/rejected": -9.340624809265137, "step": 1960 }, { "epoch": 1.7299077733860342, "grad_norm": 42.10323399780937, "learning_rate": 5.674428822495607e-07, "logits/chosen": -0.22131958603858948, "logits/rejected": -0.18203124403953552, "logps/chosen": -273.6000061035156, "logps/rejected": -446.8999938964844, "loss": 0.0511, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.625146508216858, "rewards/margins": 6.754687309265137, "rewards/rejected": -8.381250381469727, "step": 1970 }, { "epoch": 1.7386912604303908, "grad_norm": 11.906931158276118, "learning_rate": 5.652460456942003e-07, "logits/chosen": -0.17329712212085724, "logits/rejected": -0.27037352323532104, "logps/chosen": -263.625, "logps/rejected": -424.0, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": -1.1536376476287842, "rewards/margins": 6.706250190734863, "rewards/rejected": -7.859375, "step": 1980 }, { "epoch": 1.7474747474747474, "grad_norm": 42.60252927762306, "learning_rate": 5.6304920913884e-07, "logits/chosen": -0.2701416015625, "logits/rejected": -0.2963195741176605, "logps/chosen": -272.5, "logps/rejected": -475.29998779296875, "loss": 0.0826, "rewards/accuracies": 0.96875, "rewards/chosen": -1.648828148841858, "rewards/margins": 7.020312309265137, "rewards/rejected": -8.665624618530273, "step": 1990 }, { "epoch": 1.756258234519104, "grad_norm": 34.551987531663606, "learning_rate": 5.608523725834797e-07, "logits/chosen": -0.3094116151332855, "logits/rejected": -0.3245300352573395, "logps/chosen": -263.1499938964844, "logps/rejected": -425.54998779296875, "loss": 0.0788, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7099120616912842, "rewards/margins": 6.839062690734863, "rewards/rejected": -8.550000190734863, "step": 2000 }, { "epoch": 1.7650417215634606, "grad_norm": 4.409246957708579, "learning_rate": 5.586555360281194e-07, "logits/chosen": -0.24200439453125, "logits/rejected": -0.30987244844436646, "logps/chosen": -296.75, "logps/rejected": -490.0, "loss": 0.0434, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.733496069908142, "rewards/margins": 7.449999809265137, "rewards/rejected": -9.189062118530273, "step": 2010 }, { "epoch": 1.7738252086078172, "grad_norm": 29.98373603989161, "learning_rate": 5.564586994727593e-07, "logits/chosen": -0.33769530057907104, "logits/rejected": -0.2633728086948395, "logps/chosen": -284.25, "logps/rejected": -419.0, "loss": 0.0967, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7285645008087158, "rewards/margins": 6.336718559265137, "rewards/rejected": -8.059374809265137, "step": 2020 }, { "epoch": 1.7826086956521738, "grad_norm": 17.498635500604873, "learning_rate": 5.54261862917399e-07, "logits/chosen": -0.14859619736671448, "logits/rejected": -0.29289549589157104, "logps/chosen": -278.6000061035156, "logps/rejected": -457.1000061035156, "loss": 0.0483, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.933691382408142, "rewards/margins": 7.296875, "rewards/rejected": -9.240625381469727, "step": 2030 }, { "epoch": 1.7913921826965304, "grad_norm": 10.85984494152549, "learning_rate": 5.520650263620387e-07, "logits/chosen": -0.2789306640625, "logits/rejected": -0.29945603013038635, "logps/chosen": -286.04998779296875, "logps/rejected": -443.5, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -2.0191407203674316, "rewards/margins": 6.880468845367432, "rewards/rejected": -8.903124809265137, "step": 2040 }, { "epoch": 1.800175669740887, "grad_norm": 13.618895075010798, "learning_rate": 5.498681898066783e-07, "logits/chosen": -0.2585205137729645, "logits/rejected": -0.29619139432907104, "logps/chosen": -306.9750061035156, "logps/rejected": -475.5, "loss": 0.0598, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.12939453125, "rewards/margins": 7.3359375, "rewards/rejected": -9.467187881469727, "step": 2050 }, { "epoch": 1.8089591567852437, "grad_norm": 9.890069001970772, "learning_rate": 5.476713532513181e-07, "logits/chosen": -0.3307739198207855, "logits/rejected": -0.3159118592739105, "logps/chosen": -276.79998779296875, "logps/rejected": -407.70001220703125, "loss": 0.104, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.548730492591858, "rewards/margins": 6.508593559265137, "rewards/rejected": -8.056249618530273, "step": 2060 }, { "epoch": 1.8177426438296003, "grad_norm": 18.628806405874762, "learning_rate": 5.454745166959577e-07, "logits/chosen": -0.3272766172885895, "logits/rejected": -0.3149658143520355, "logps/chosen": -307.8500061035156, "logps/rejected": -495.6000061035156, "loss": 0.0665, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.740136742591858, "rewards/margins": 7.540625095367432, "rewards/rejected": -9.284375190734863, "step": 2070 }, { "epoch": 1.8265261308739569, "grad_norm": 14.894526429657923, "learning_rate": 5.432776801405975e-07, "logits/chosen": -0.26066285371780396, "logits/rejected": -0.24038085341453552, "logps/chosen": -293.45001220703125, "logps/rejected": -443.29998779296875, "loss": 0.0718, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6868164539337158, "rewards/margins": 6.1875, "rewards/rejected": -7.878125190734863, "step": 2080 }, { "epoch": 1.8353096179183135, "grad_norm": 8.759595253555503, "learning_rate": 5.410808435852372e-07, "logits/chosen": -0.24942627549171448, "logits/rejected": -0.3677734434604645, "logps/chosen": -277.3500061035156, "logps/rejected": -463.6499938964844, "loss": 0.0656, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.300268530845642, "rewards/margins": 6.775000095367432, "rewards/rejected": -8.076562881469727, "step": 2090 }, { "epoch": 1.84409310496267, "grad_norm": 22.8549120329734, "learning_rate": 5.388840070298769e-07, "logits/chosen": -0.2662597596645355, "logits/rejected": -0.29798585176467896, "logps/chosen": -279.125, "logps/rejected": -465.79998779296875, "loss": 0.0706, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5525391101837158, "rewards/margins": 6.612500190734863, "rewards/rejected": -8.167187690734863, "step": 2100 }, { "epoch": 1.8528765920070267, "grad_norm": 12.296941623423278, "learning_rate": 5.366871704745168e-07, "logits/chosen": -0.25340574979782104, "logits/rejected": -0.313720703125, "logps/chosen": -276.3999938964844, "logps/rejected": -434.45001220703125, "loss": 0.0642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3004882335662842, "rewards/margins": 6.880468845367432, "rewards/rejected": -8.1796875, "step": 2110 }, { "epoch": 1.8616600790513833, "grad_norm": 7.607909606644861, "learning_rate": 5.344903339191564e-07, "logits/chosen": -0.1822509765625, "logits/rejected": -0.34035950899124146, "logps/chosen": -283.5249938964844, "logps/rejected": -460.20001220703125, "loss": 0.0436, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0501952171325684, "rewards/margins": 7.116406440734863, "rewards/rejected": -9.162500381469727, "step": 2120 }, { "epoch": 1.87044356609574, "grad_norm": 40.36606706660167, "learning_rate": 5.322934973637961e-07, "logits/chosen": -0.2871765196323395, "logits/rejected": -0.32252198457717896, "logps/chosen": -289.1000061035156, "logps/rejected": -475.70001220703125, "loss": 0.0726, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0289549827575684, "rewards/margins": 7.296875, "rewards/rejected": -9.326562881469727, "step": 2130 }, { "epoch": 1.8792270531400965, "grad_norm": 10.650359338395834, "learning_rate": 5.300966608084358e-07, "logits/chosen": -0.16315917670726776, "logits/rejected": -0.2775558531284332, "logps/chosen": -300.79998779296875, "logps/rejected": -456.3999938964844, "loss": 0.0352, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.854711890220642, "rewards/margins": 7.354687690734863, "rewards/rejected": -9.2109375, "step": 2140 }, { "epoch": 1.8880105401844531, "grad_norm": 34.44593089777574, "learning_rate": 5.278998242530755e-07, "logits/chosen": -0.21055908501148224, "logits/rejected": -0.28326416015625, "logps/chosen": -306.6499938964844, "logps/rejected": -450.5, "loss": 0.0541, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7317626476287842, "rewards/margins": 7.1484375, "rewards/rejected": -8.870312690734863, "step": 2150 }, { "epoch": 1.8967940272288097, "grad_norm": 40.91575454650661, "learning_rate": 5.257029876977153e-07, "logits/chosen": -0.17325440049171448, "logits/rejected": -0.19817809760570526, "logps/chosen": -277.75, "logps/rejected": -466.6000061035156, "loss": 0.0566, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1877930164337158, "rewards/margins": 6.832812309265137, "rewards/rejected": -8.020312309265137, "step": 2160 }, { "epoch": 1.9055775142731664, "grad_norm": 23.456059379222147, "learning_rate": 5.235061511423549e-07, "logits/chosen": -0.14934691786766052, "logits/rejected": -0.29953306913375854, "logps/chosen": -271.75, "logps/rejected": -418.70001220703125, "loss": 0.0593, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.9826904535293579, "rewards/margins": 6.568749904632568, "rewards/rejected": -7.553124904632568, "step": 2170 }, { "epoch": 1.914361001317523, "grad_norm": 32.401503388910854, "learning_rate": 5.213093145869947e-07, "logits/chosen": -0.19694824516773224, "logits/rejected": -0.14257201552391052, "logps/chosen": -284.3500061035156, "logps/rejected": -432.8999938964844, "loss": 0.0748, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.153662085533142, "rewards/margins": 6.417187690734863, "rewards/rejected": -7.573437690734863, "step": 2180 }, { "epoch": 1.9231444883618796, "grad_norm": 6.346525651661966, "learning_rate": 5.191124780316343e-07, "logits/chosen": -0.13695068657398224, "logits/rejected": -0.3593994081020355, "logps/chosen": -255.1750030517578, "logps/rejected": -450.6000061035156, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.925366222858429, "rewards/margins": 6.642187595367432, "rewards/rejected": -7.568749904632568, "step": 2190 }, { "epoch": 1.9319279754062362, "grad_norm": 15.903306098258518, "learning_rate": 5.169156414762741e-07, "logits/chosen": -0.28913575410842896, "logits/rejected": -0.23695984482765198, "logps/chosen": -342.0, "logps/rejected": -499.70001220703125, "loss": 0.0698, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5720703601837158, "rewards/margins": 7.359375, "rewards/rejected": -8.932812690734863, "step": 2200 }, { "epoch": 1.9407114624505928, "grad_norm": 35.42245972286577, "learning_rate": 5.147188049209139e-07, "logits/chosen": -0.28959351778030396, "logits/rejected": -0.16972656548023224, "logps/chosen": -247.9250030517578, "logps/rejected": -401.0, "loss": 0.0694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.003027319908142, "rewards/margins": 6.614062309265137, "rewards/rejected": -7.615624904632568, "step": 2210 }, { "epoch": 1.9494949494949494, "grad_norm": 56.91620042907813, "learning_rate": 5.125219683655536e-07, "logits/chosen": -0.31341552734375, "logits/rejected": -0.2879882752895355, "logps/chosen": -270.54998779296875, "logps/rejected": -456.70001220703125, "loss": 0.0613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.246252417564392, "rewards/margins": 7.322656154632568, "rewards/rejected": -8.571874618530273, "step": 2220 }, { "epoch": 1.958278436539306, "grad_norm": 208.46311258462507, "learning_rate": 5.103251318101933e-07, "logits/chosen": -0.26039427518844604, "logits/rejected": -0.2665649354457855, "logps/chosen": -254.10000610351562, "logps/rejected": -423.3999938964844, "loss": 0.0907, "rewards/accuracies": 0.96875, "rewards/chosen": -1.402929663658142, "rewards/margins": 6.973437309265137, "rewards/rejected": -8.370312690734863, "step": 2230 }, { "epoch": 1.9670619235836626, "grad_norm": 29.610917098572084, "learning_rate": 5.08128295254833e-07, "logits/chosen": -0.22614745795726776, "logits/rejected": -0.19291992485523224, "logps/chosen": -284.79998779296875, "logps/rejected": -469.29998779296875, "loss": 0.062, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.75732421875, "rewards/margins": 7.310937404632568, "rewards/rejected": -9.065625190734863, "step": 2240 }, { "epoch": 1.9758454106280192, "grad_norm": 6.407726409800503, "learning_rate": 5.059314586994727e-07, "logits/chosen": -0.3466552793979645, "logits/rejected": -0.4720703065395355, "logps/chosen": -322.3999938964844, "logps/rejected": -501.6000061035156, "loss": 0.0641, "rewards/accuracies": 0.96875, "rewards/chosen": -2.317187547683716, "rewards/margins": 7.65625, "rewards/rejected": -9.971875190734863, "step": 2250 }, { "epoch": 1.9846288976723758, "grad_norm": 14.470384792852883, "learning_rate": 5.037346221441124e-07, "logits/chosen": -0.24239501357078552, "logits/rejected": -0.36579591035842896, "logps/chosen": -256.45001220703125, "logps/rejected": -490.79998779296875, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.29534912109375, "rewards/margins": 7.396874904632568, "rewards/rejected": -9.689062118530273, "step": 2260 }, { "epoch": 1.9934123847167324, "grad_norm": 88.35000507527698, "learning_rate": 5.015377855887521e-07, "logits/chosen": -0.447601318359375, "logits/rejected": -0.4158935546875, "logps/chosen": -266.3999938964844, "logps/rejected": -443.20001220703125, "loss": 0.055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.142285108566284, "rewards/margins": 7.129687309265137, "rewards/rejected": -9.284375190734863, "step": 2270 }, { "epoch": 2.0017566974088714, "grad_norm": 0.7889346910303661, "learning_rate": 4.993409490333919e-07, "logits/chosen": -0.35148540139198303, "logits/rejected": -0.39224404096603394, "logps/chosen": -265.6842041015625, "logps/rejected": -432.8947448730469, "loss": 0.0685, "rewards/accuracies": 0.9605262875556946, "rewards/chosen": -2.503392219543457, "rewards/margins": 7.025493621826172, "rewards/rejected": -9.523026466369629, "step": 2280 }, { "epoch": 2.010540184453228, "grad_norm": 0.44318006547541117, "learning_rate": 4.971441124780316e-07, "logits/chosen": -0.380767822265625, "logits/rejected": -0.423583984375, "logps/chosen": -292.0, "logps/rejected": -471.3999938964844, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -2.2754883766174316, "rewards/margins": 8.350000381469727, "rewards/rejected": -10.634374618530273, "step": 2290 }, { "epoch": 2.0193236714975846, "grad_norm": 20.11385912877608, "learning_rate": 4.949472759226713e-07, "logits/chosen": -0.4604125916957855, "logits/rejected": -0.5492187738418579, "logps/chosen": -309.5, "logps/rejected": -495.70001220703125, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -2.747753858566284, "rewards/margins": 8.839062690734863, "rewards/rejected": -11.595312118530273, "step": 2300 }, { "epoch": 2.0281071585419412, "grad_norm": 11.150688183502542, "learning_rate": 4.92750439367311e-07, "logits/chosen": -0.4310058653354645, "logits/rejected": -0.5071777105331421, "logps/chosen": -294.5, "logps/rejected": -498.0, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -3.250781297683716, "rewards/margins": 10.096875190734863, "rewards/rejected": -13.334375381469727, "step": 2310 }, { "epoch": 2.036890645586298, "grad_norm": 1.0988846311883766, "learning_rate": 4.905536028119508e-07, "logits/chosen": -0.28662109375, "logits/rejected": -0.46318358182907104, "logps/chosen": -319.25, "logps/rejected": -538.2000122070312, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.88671875, "rewards/margins": 9.784375190734863, "rewards/rejected": -13.668749809265137, "step": 2320 }, { "epoch": 2.0456741326306545, "grad_norm": 4.3135053085449355, "learning_rate": 4.883567662565905e-07, "logits/chosen": -0.555249035358429, "logits/rejected": -0.56982421875, "logps/chosen": -299.20001220703125, "logps/rejected": -470.29998779296875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -3.0872559547424316, "rewards/margins": 9.964062690734863, "rewards/rejected": -13.040624618530273, "step": 2330 }, { "epoch": 2.054457619675011, "grad_norm": 1.1307354313034275, "learning_rate": 4.861599297012302e-07, "logits/chosen": -0.36292725801467896, "logits/rejected": -0.3811096251010895, "logps/chosen": -318.95001220703125, "logps/rejected": -562.5999755859375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.700390577316284, "rewards/margins": 10.495312690734863, "rewards/rejected": -14.190625190734863, "step": 2340 }, { "epoch": 2.0632411067193677, "grad_norm": 0.44764720545428005, "learning_rate": 4.839630931458699e-07, "logits/chosen": -0.12214355170726776, "logits/rejected": -0.3350585997104645, "logps/chosen": -287.5, "logps/rejected": -506.70001220703125, "loss": 0.0085, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2652344703674316, "rewards/margins": 10.298437118530273, "rewards/rejected": -13.556249618530273, "step": 2350 }, { "epoch": 2.0720245937637243, "grad_norm": 0.7959245998758013, "learning_rate": 4.817662565905096e-07, "logits/chosen": -0.36467283964157104, "logits/rejected": -0.391357421875, "logps/chosen": -283.04998779296875, "logps/rejected": -447.3999938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.823437452316284, "rewards/margins": 9.157812118530273, "rewards/rejected": -11.978124618530273, "step": 2360 }, { "epoch": 2.080808080808081, "grad_norm": 1.2843403690539308, "learning_rate": 4.795694200351494e-07, "logits/chosen": -0.504467785358429, "logits/rejected": -0.4788818359375, "logps/chosen": -330.6499938964844, "logps/rejected": -539.5999755859375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.585156202316284, "rewards/margins": 10.643750190734863, "rewards/rejected": -14.225000381469727, "step": 2370 }, { "epoch": 2.0895915678524375, "grad_norm": 0.40937235905389235, "learning_rate": 4.77372583479789e-07, "logits/chosen": -0.2598632872104645, "logits/rejected": -0.526049792766571, "logps/chosen": -301.70001220703125, "logps/rejected": -508.20001220703125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.8531250953674316, "rewards/margins": 10.515625, "rewards/rejected": -14.368749618530273, "step": 2380 }, { "epoch": 2.098375054896794, "grad_norm": 0.5936368886307941, "learning_rate": 4.751757469244288e-07, "logits/chosen": -0.35673826932907104, "logits/rejected": -0.411865234375, "logps/chosen": -312.79998779296875, "logps/rejected": -495.20001220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.9945311546325684, "rewards/margins": 9.845312118530273, "rewards/rejected": -13.846875190734863, "step": 2390 }, { "epoch": 2.1071585419411507, "grad_norm": 1.869026634273845, "learning_rate": 4.729789103690685e-07, "logits/chosen": -0.36480712890625, "logits/rejected": -0.47137451171875, "logps/chosen": -265.75, "logps/rejected": -520.2000122070312, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -3.657421827316284, "rewards/margins": 10.792187690734863, "rewards/rejected": -14.449999809265137, "step": 2400 }, { "epoch": 2.1159420289855073, "grad_norm": 0.5543320401968966, "learning_rate": 4.707820738137082e-07, "logits/chosen": -0.4366210997104645, "logits/rejected": -0.506054699420929, "logps/chosen": -295.29998779296875, "logps/rejected": -485.3999938964844, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -3.4501953125, "rewards/margins": 9.7734375, "rewards/rejected": -13.232812881469727, "step": 2410 }, { "epoch": 2.124725516029864, "grad_norm": 4.870672961743894, "learning_rate": 4.68585237258348e-07, "logits/chosen": -0.370849609375, "logits/rejected": -0.4851440489292145, "logps/chosen": -280.95001220703125, "logps/rejected": -477.95001220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.5189452171325684, "rewards/margins": 9.801562309265137, "rewards/rejected": -13.318750381469727, "step": 2420 }, { "epoch": 2.1335090030742205, "grad_norm": 8.544811879159859, "learning_rate": 4.663884007029877e-07, "logits/chosen": -0.46983641386032104, "logits/rejected": -0.4407104551792145, "logps/chosen": -315.20001220703125, "logps/rejected": -514.4000244140625, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2999510765075684, "rewards/margins": 9.798437118530273, "rewards/rejected": -13.106249809265137, "step": 2430 }, { "epoch": 2.142292490118577, "grad_norm": 16.437166896661743, "learning_rate": 4.641915641476274e-07, "logits/chosen": -0.2838378846645355, "logits/rejected": -0.2813781797885895, "logps/chosen": -299.45001220703125, "logps/rejected": -486.0, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.526562452316284, "rewards/margins": 9.668749809265137, "rewards/rejected": -13.203125, "step": 2440 }, { "epoch": 2.1510759771629338, "grad_norm": 15.659158496835714, "learning_rate": 4.619947275922671e-07, "logits/chosen": -0.44011229276657104, "logits/rejected": -0.47636717557907104, "logps/chosen": -310.1000061035156, "logps/rejected": -511.29998779296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -4.126562595367432, "rewards/margins": 10.176562309265137, "rewards/rejected": -14.3125, "step": 2450 }, { "epoch": 2.1598594642072904, "grad_norm": 0.36667682444371485, "learning_rate": 4.5979789103690687e-07, "logits/chosen": -0.27360838651657104, "logits/rejected": -0.438232421875, "logps/chosen": -309.54998779296875, "logps/rejected": -507.8999938964844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -4.141015529632568, "rewards/margins": 10.056249618530273, "rewards/rejected": -14.193750381469727, "step": 2460 }, { "epoch": 2.168642951251647, "grad_norm": 0.4043541946325297, "learning_rate": 4.576010544815466e-07, "logits/chosen": -0.544921875, "logits/rejected": -0.539599597454071, "logps/chosen": -307.1499938964844, "logps/rejected": -526.4000244140625, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.096093654632568, "rewards/margins": 10.0859375, "rewards/rejected": -14.1875, "step": 2470 }, { "epoch": 2.1774264382960036, "grad_norm": 1.1252701324841667, "learning_rate": 4.554042179261863e-07, "logits/chosen": -0.3878173828125, "logits/rejected": -0.7210937738418579, "logps/chosen": -328.20001220703125, "logps/rejected": -483.20001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.36328125, "rewards/margins": 10.735937118530273, "rewards/rejected": -15.09375, "step": 2480 }, { "epoch": 2.18620992534036, "grad_norm": 1.4978297266152667, "learning_rate": 4.53207381370826e-07, "logits/chosen": -0.47945863008499146, "logits/rejected": -0.5327392816543579, "logps/chosen": -320.70001220703125, "logps/rejected": -530.5, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.118359565734863, "rewards/margins": 10.399999618530273, "rewards/rejected": -14.528124809265137, "step": 2490 }, { "epoch": 2.194993412384717, "grad_norm": 0.5475226962807528, "learning_rate": 4.510105448154657e-07, "logits/chosen": -0.4013915956020355, "logits/rejected": -0.4706054627895355, "logps/chosen": -312.0, "logps/rejected": -505.29998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -4.532812595367432, "rewards/margins": 9.623437881469727, "rewards/rejected": -14.15625, "step": 2500 }, { "epoch": 2.2037768994290734, "grad_norm": 3.1936738027184752, "learning_rate": 4.4881370826010546e-07, "logits/chosen": -0.5706542730331421, "logits/rejected": -0.4527343809604645, "logps/chosen": -314.6000061035156, "logps/rejected": -523.5, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.640625, "rewards/margins": 10.425000190734863, "rewards/rejected": -15.068750381469727, "step": 2510 }, { "epoch": 2.21256038647343, "grad_norm": 1.4280106403329011, "learning_rate": 4.4661687170474517e-07, "logits/chosen": -0.44926756620407104, "logits/rejected": -0.594921886920929, "logps/chosen": -313.04998779296875, "logps/rejected": -528.4000244140625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -3.8890624046325684, "rewards/margins": 10.576562881469727, "rewards/rejected": -14.46875, "step": 2520 }, { "epoch": 2.2213438735177866, "grad_norm": 0.9721544631680744, "learning_rate": 4.444200351493849e-07, "logits/chosen": -0.6512695550918579, "logits/rejected": -0.664306640625, "logps/chosen": -271.1499938964844, "logps/rejected": -489.8999938964844, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.8734374046325684, "rewards/margins": 10.234375, "rewards/rejected": -14.115625381469727, "step": 2530 }, { "epoch": 2.2301273605621432, "grad_norm": 0.5866674421514149, "learning_rate": 4.422231985940246e-07, "logits/chosen": -0.4952636659145355, "logits/rejected": -0.5534607172012329, "logps/chosen": -322.70001220703125, "logps/rejected": -521.0999755859375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.440625190734863, "rewards/margins": 10.514062881469727, "rewards/rejected": -14.956250190734863, "step": 2540 }, { "epoch": 2.2389108476065, "grad_norm": 5.768426354747632, "learning_rate": 4.400263620386643e-07, "logits/chosen": -0.42822265625, "logits/rejected": -0.605419933795929, "logps/chosen": -306.1000061035156, "logps/rejected": -544.2000122070312, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -4.20703125, "rewards/margins": 11.071874618530273, "rewards/rejected": -15.262499809265137, "step": 2550 }, { "epoch": 2.2476943346508564, "grad_norm": 0.3492466502427124, "learning_rate": 4.3782952548330405e-07, "logits/chosen": -0.433401495218277, "logits/rejected": -0.5804687738418579, "logps/chosen": -335.45001220703125, "logps/rejected": -540.2000122070312, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -3.984375, "rewards/margins": 10.350000381469727, "rewards/rejected": -14.331250190734863, "step": 2560 }, { "epoch": 2.256477821695213, "grad_norm": 2.51095390553696, "learning_rate": 4.3563268892794376e-07, "logits/chosen": -0.613903820514679, "logits/rejected": -0.704296886920929, "logps/chosen": -351.70001220703125, "logps/rejected": -588.0999755859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.067968845367432, "rewards/margins": 11.459375381469727, "rewards/rejected": -15.521875381469727, "step": 2570 }, { "epoch": 2.2652613087395697, "grad_norm": 0.8029842101565244, "learning_rate": 4.3343585237258347e-07, "logits/chosen": -0.3885498046875, "logits/rejected": -0.49272459745407104, "logps/chosen": -302.45001220703125, "logps/rejected": -517.9000244140625, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.046777248382568, "rewards/margins": 10.776562690734863, "rewards/rejected": -14.821874618530273, "step": 2580 }, { "epoch": 2.2740447957839263, "grad_norm": 4.067239180624968, "learning_rate": 4.312390158172232e-07, "logits/chosen": -0.395477294921875, "logits/rejected": -0.5022948980331421, "logps/chosen": -359.8999938964844, "logps/rejected": -566.2000122070312, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.972265720367432, "rewards/margins": 11.8125, "rewards/rejected": -16.78125, "step": 2590 }, { "epoch": 2.282828282828283, "grad_norm": 4.536070940109926, "learning_rate": 4.2904217926186293e-07, "logits/chosen": -0.5258544683456421, "logits/rejected": -0.5191894769668579, "logps/chosen": -297.25, "logps/rejected": -489.8500061035156, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -4.5794677734375, "rewards/margins": 10.646875381469727, "rewards/rejected": -15.225000381469727, "step": 2600 }, { "epoch": 2.2916117698726395, "grad_norm": 1.5136963199564475, "learning_rate": 4.2684534270650264e-07, "logits/chosen": -0.26636964082717896, "logits/rejected": -0.5439208745956421, "logps/chosen": -311.75, "logps/rejected": -514.0, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -4.912499904632568, "rewards/margins": 11.6640625, "rewards/rejected": -16.571874618530273, "step": 2610 }, { "epoch": 2.300395256916996, "grad_norm": 0.6763103683968481, "learning_rate": 4.2464850615114235e-07, "logits/chosen": -0.3052124083042145, "logits/rejected": -0.48039549589157104, "logps/chosen": -286.25, "logps/rejected": -506.1499938964844, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.401953220367432, "rewards/margins": 10.53125, "rewards/rejected": -14.934374809265137, "step": 2620 }, { "epoch": 2.3091787439613527, "grad_norm": 14.370122333207242, "learning_rate": 4.2245166959578206e-07, "logits/chosen": -0.441162109375, "logits/rejected": -0.549755871295929, "logps/chosen": -276.75, "logps/rejected": -425.04998779296875, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -4.025390625, "rewards/margins": 9.448437690734863, "rewards/rejected": -13.475000381469727, "step": 2630 }, { "epoch": 2.3179622310057093, "grad_norm": 4.233928913814668, "learning_rate": 4.2025483304042177e-07, "logits/chosen": -0.4903320372104645, "logits/rejected": -0.5772460699081421, "logps/chosen": -313.07501220703125, "logps/rejected": -523.7000122070312, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -4.025000095367432, "rewards/margins": 10.112500190734863, "rewards/rejected": -14.121874809265137, "step": 2640 }, { "epoch": 2.326745718050066, "grad_norm": 0.763033801499235, "learning_rate": 4.180579964850615e-07, "logits/chosen": -0.514453113079071, "logits/rejected": -0.629382312297821, "logps/chosen": -302.8500061035156, "logps/rejected": -503.8500061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.5347657203674316, "rewards/margins": 10.262499809265137, "rewards/rejected": -13.787500381469727, "step": 2650 }, { "epoch": 2.3355292050944225, "grad_norm": 5.762147012917278, "learning_rate": 4.1586115992970123e-07, "logits/chosen": -0.48380738496780396, "logits/rejected": -0.5455566644668579, "logps/chosen": -308.29998779296875, "logps/rejected": -482.1000061035156, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.4053711891174316, "rewards/margins": 9.942187309265137, "rewards/rejected": -13.346875190734863, "step": 2660 }, { "epoch": 2.344312692138779, "grad_norm": 2.4881151746358507, "learning_rate": 4.1366432337434094e-07, "logits/chosen": -0.5485595464706421, "logits/rejected": -0.525714099407196, "logps/chosen": -309.1000061035156, "logps/rejected": -488.8999938964844, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.640625, "rewards/margins": 9.643750190734863, "rewards/rejected": -13.278124809265137, "step": 2670 }, { "epoch": 2.3530961791831357, "grad_norm": 58.69275654036956, "learning_rate": 4.1146748681898065e-07, "logits/chosen": -0.48490601778030396, "logits/rejected": -0.6314452886581421, "logps/chosen": -297.25, "logps/rejected": -505.29998779296875, "loss": 0.0076, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.16796875, "rewards/margins": 10.7265625, "rewards/rejected": -14.890625, "step": 2680 }, { "epoch": 2.3618796662274923, "grad_norm": 12.656219703063748, "learning_rate": 4.0927065026362036e-07, "logits/chosen": -0.53778076171875, "logits/rejected": -0.6039062738418579, "logps/chosen": -346.54998779296875, "logps/rejected": -575.0999755859375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -4.4921875, "rewards/margins": 10.920312881469727, "rewards/rejected": -15.425000190734863, "step": 2690 }, { "epoch": 2.370663153271849, "grad_norm": 0.7795449130086656, "learning_rate": 4.070738137082601e-07, "logits/chosen": -0.5100768804550171, "logits/rejected": -0.4996581971645355, "logps/chosen": -316.29998779296875, "logps/rejected": -496.8999938964844, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -3.8003907203674316, "rewards/margins": 10.653124809265137, "rewards/rejected": -14.459375381469727, "step": 2700 }, { "epoch": 2.3794466403162056, "grad_norm": 2.261109880679019, "learning_rate": 4.048769771528998e-07, "logits/chosen": -0.47001951932907104, "logits/rejected": -0.578717052936554, "logps/chosen": -336.7250061035156, "logps/rejected": -508.5, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.123437404632568, "rewards/margins": 9.839062690734863, "rewards/rejected": -13.96875, "step": 2710 }, { "epoch": 2.388230127360562, "grad_norm": 5.719944447044475, "learning_rate": 4.0268014059753953e-07, "logits/chosen": -0.3439880311489105, "logits/rejected": -0.42902833223342896, "logps/chosen": -316.79998779296875, "logps/rejected": -502.29998779296875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -4.150586128234863, "rewards/margins": 10.59375, "rewards/rejected": -14.728124618530273, "step": 2720 }, { "epoch": 2.397013614404919, "grad_norm": 1.2458880985395626, "learning_rate": 4.0048330404217924e-07, "logits/chosen": -0.23670653998851776, "logits/rejected": -0.539599597454071, "logps/chosen": -317.07501220703125, "logps/rejected": -507.3999938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.51953125, "rewards/margins": 11.09375, "rewards/rejected": -15.615625381469727, "step": 2730 }, { "epoch": 2.4057971014492754, "grad_norm": 5.910522347498881, "learning_rate": 3.98286467486819e-07, "logits/chosen": -0.21577759087085724, "logits/rejected": -0.610607922077179, "logps/chosen": -265.1499938964844, "logps/rejected": -512.4000244140625, "loss": 0.0112, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.009081840515137, "rewards/margins": 11.660937309265137, "rewards/rejected": -15.681249618530273, "step": 2740 }, { "epoch": 2.414580588493632, "grad_norm": 20.40111739402005, "learning_rate": 3.960896309314587e-07, "logits/chosen": -0.345703125, "logits/rejected": -0.46687012910842896, "logps/chosen": -274.1000061035156, "logps/rejected": -507.70001220703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.211718559265137, "rewards/margins": 11.046875, "rewards/rejected": -15.262499809265137, "step": 2750 }, { "epoch": 2.4233640755379886, "grad_norm": 0.43153327097547867, "learning_rate": 3.938927943760984e-07, "logits/chosen": -0.4475341737270355, "logits/rejected": -0.548632800579071, "logps/chosen": -267.8999938964844, "logps/rejected": -474.0, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -3.5054688453674316, "rewards/margins": 10.339062690734863, "rewards/rejected": -13.856249809265137, "step": 2760 }, { "epoch": 2.432147562582345, "grad_norm": 0.9498182600870315, "learning_rate": 3.916959578207381e-07, "logits/chosen": -0.3609252870082855, "logits/rejected": -0.4793945252895355, "logps/chosen": -307.0, "logps/rejected": -489.0, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -3.7865233421325684, "rewards/margins": 10.251562118530273, "rewards/rejected": -14.034375190734863, "step": 2770 }, { "epoch": 2.440931049626702, "grad_norm": 1.0299505700556804, "learning_rate": 3.8949912126537783e-07, "logits/chosen": -0.4979492127895355, "logits/rejected": -0.5478271245956421, "logps/chosen": -307.6499938964844, "logps/rejected": -510.0, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.018359184265137, "rewards/margins": 10.875, "rewards/rejected": -14.893750190734863, "step": 2780 }, { "epoch": 2.4497145366710584, "grad_norm": 0.15800974532151343, "learning_rate": 3.873022847100176e-07, "logits/chosen": -0.42308348417282104, "logits/rejected": -0.48405760526657104, "logps/chosen": -264.6499938964844, "logps/rejected": -502.70001220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.8125, "rewards/margins": 10.893750190734863, "rewards/rejected": -14.706250190734863, "step": 2790 }, { "epoch": 2.458498023715415, "grad_norm": 6.409068745659451, "learning_rate": 3.851054481546573e-07, "logits/chosen": -0.43876951932907104, "logits/rejected": -0.55712890625, "logps/chosen": -309.1499938964844, "logps/rejected": -511.79998779296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -5.116406440734863, "rewards/margins": 11.390625, "rewards/rejected": -16.518749237060547, "step": 2800 }, { "epoch": 2.4672815107597716, "grad_norm": 12.173677949498884, "learning_rate": 3.82908611599297e-07, "logits/chosen": -0.290322870016098, "logits/rejected": -0.5130981206893921, "logps/chosen": -344.20001220703125, "logps/rejected": -548.7000122070312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -5.154687404632568, "rewards/margins": 10.559374809265137, "rewards/rejected": -15.709375381469727, "step": 2810 }, { "epoch": 2.4760649978041283, "grad_norm": 0.3390635550342184, "learning_rate": 3.807117750439367e-07, "logits/chosen": 0.03293456882238388, "logits/rejected": -0.3926757872104645, "logps/chosen": -280.75, "logps/rejected": -528.2999877929688, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -4.193749904632568, "rewards/margins": 11.584375381469727, "rewards/rejected": -15.778124809265137, "step": 2820 }, { "epoch": 2.484848484848485, "grad_norm": 1.2102646435832705, "learning_rate": 3.785149384885764e-07, "logits/chosen": -0.3826049864292145, "logits/rejected": -0.42817384004592896, "logps/chosen": -339.6499938964844, "logps/rejected": -534.9500122070312, "loss": 0.0265, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.449804782867432, "rewards/margins": 11.0703125, "rewards/rejected": -15.521875381469727, "step": 2830 }, { "epoch": 2.4936319718928415, "grad_norm": 2.3287746892850394, "learning_rate": 3.763181019332162e-07, "logits/chosen": -0.39851075410842896, "logits/rejected": -0.575390636920929, "logps/chosen": -297.54998779296875, "logps/rejected": -532.7000122070312, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.564062595367432, "rewards/margins": 11.550000190734863, "rewards/rejected": -16.109375, "step": 2840 }, { "epoch": 2.502415458937198, "grad_norm": 3.243008722011686, "learning_rate": 3.741212653778559e-07, "logits/chosen": -0.26646119356155396, "logits/rejected": -0.3801025450229645, "logps/chosen": -324.8999938964844, "logps/rejected": -540.2000122070312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.680859565734863, "rewards/margins": 11.981249809265137, "rewards/rejected": -16.674999237060547, "step": 2850 }, { "epoch": 2.5111989459815547, "grad_norm": 1.7752236593787043, "learning_rate": 3.719244288224956e-07, "logits/chosen": -0.397705078125, "logits/rejected": -0.4431518614292145, "logps/chosen": -318.5, "logps/rejected": -555.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.407812595367432, "rewards/margins": 10.532812118530273, "rewards/rejected": -14.949999809265137, "step": 2860 }, { "epoch": 2.5199824330259113, "grad_norm": 1.656683419660172, "learning_rate": 3.697275922671353e-07, "logits/chosen": -0.3946533203125, "logits/rejected": -0.39750975370407104, "logps/chosen": -265.5, "logps/rejected": -498.70001220703125, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.543750047683716, "rewards/margins": 10.631250381469727, "rewards/rejected": -14.168749809265137, "step": 2870 }, { "epoch": 2.528765920070268, "grad_norm": 20.696891476326872, "learning_rate": 3.6753075571177507e-07, "logits/chosen": -0.36040037870407104, "logits/rejected": -0.41484373807907104, "logps/chosen": -322.7749938964844, "logps/rejected": -501.8999938964844, "loss": 0.0128, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8501954078674316, "rewards/margins": 9.721875190734863, "rewards/rejected": -13.571874618530273, "step": 2880 }, { "epoch": 2.5375494071146245, "grad_norm": 0.886082962655527, "learning_rate": 3.653339191564148e-07, "logits/chosen": -0.17131957411766052, "logits/rejected": -0.4134277403354645, "logps/chosen": -289.79998779296875, "logps/rejected": -487.6000061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.01953125, "rewards/margins": 10.850000381469727, "rewards/rejected": -14.865625381469727, "step": 2890 }, { "epoch": 2.546332894158981, "grad_norm": 4.238037124318268, "learning_rate": 3.631370826010545e-07, "logits/chosen": -0.45598143339157104, "logits/rejected": -0.536773681640625, "logps/chosen": -296.20001220703125, "logps/rejected": -509.70001220703125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -4.434374809265137, "rewards/margins": 10.876562118530273, "rewards/rejected": -15.300000190734863, "step": 2900 }, { "epoch": 2.5551163812033377, "grad_norm": 3.0015532373290443, "learning_rate": 3.609402460456942e-07, "logits/chosen": -0.09440918266773224, "logits/rejected": -0.29802244901657104, "logps/chosen": -309.70001220703125, "logps/rejected": -498.29998779296875, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -4.056323051452637, "rewards/margins": 10.235937118530273, "rewards/rejected": -14.274999618530273, "step": 2910 }, { "epoch": 2.5638998682476943, "grad_norm": 2.528811211959182, "learning_rate": 3.587434094903339e-07, "logits/chosen": -0.36384278535842896, "logits/rejected": -0.4359130859375, "logps/chosen": -307.92498779296875, "logps/rejected": -503.6000061035156, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9273438453674316, "rewards/margins": 10.4765625, "rewards/rejected": -14.409375190734863, "step": 2920 }, { "epoch": 2.572683355292051, "grad_norm": 2.3808458617416632, "learning_rate": 3.5654657293497366e-07, "logits/chosen": -0.3380371034145355, "logits/rejected": -0.538330078125, "logps/chosen": -293.25, "logps/rejected": -557.0, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -4.787499904632568, "rewards/margins": 11.5234375, "rewards/rejected": -16.306249618530273, "step": 2930 }, { "epoch": 2.5814668423364076, "grad_norm": 27.576277256441358, "learning_rate": 3.5434973637961337e-07, "logits/chosen": -0.4428466856479645, "logits/rejected": -0.5040649175643921, "logps/chosen": -318.6499938964844, "logps/rejected": -531.0, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -4.626172065734863, "rewards/margins": 11.434374809265137, "rewards/rejected": -16.056249618530273, "step": 2940 }, { "epoch": 2.590250329380764, "grad_norm": 3.7528957899214577, "learning_rate": 3.521528998242531e-07, "logits/chosen": -0.6123901605606079, "logits/rejected": -0.59716796875, "logps/chosen": -285.54998779296875, "logps/rejected": -483.0, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -4.217968940734863, "rewards/margins": 10.862500190734863, "rewards/rejected": -15.078125, "step": 2950 }, { "epoch": 2.5990338164251208, "grad_norm": 1.4994333296288798, "learning_rate": 3.499560632688928e-07, "logits/chosen": -0.39741212129592896, "logits/rejected": -0.4402832090854645, "logps/chosen": -280.8500061035156, "logps/rejected": -468.29998779296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.57861328125, "rewards/margins": 10.243749618530273, "rewards/rejected": -13.828125, "step": 2960 }, { "epoch": 2.6078173034694774, "grad_norm": 12.541725056456663, "learning_rate": 3.477592267135325e-07, "logits/chosen": -0.4484008848667145, "logits/rejected": -0.6004883050918579, "logps/chosen": -278.1499938964844, "logps/rejected": -503.6000061035156, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5152344703674316, "rewards/margins": 10.467187881469727, "rewards/rejected": -13.987500190734863, "step": 2970 }, { "epoch": 2.616600790513834, "grad_norm": 5.004953868580532, "learning_rate": 3.4556239015817225e-07, "logits/chosen": -0.5730956792831421, "logits/rejected": -0.5693603754043579, "logps/chosen": -290.20001220703125, "logps/rejected": -473.3500061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.8511719703674316, "rewards/margins": 10.260937690734863, "rewards/rejected": -14.109375, "step": 2980 }, { "epoch": 2.6253842775581906, "grad_norm": 2.061998460415306, "learning_rate": 3.4336555360281196e-07, "logits/chosen": -0.3398681581020355, "logits/rejected": -0.511004626750946, "logps/chosen": -343.8999938964844, "logps/rejected": -573.2999877929688, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -5.212500095367432, "rewards/margins": 11.681249618530273, "rewards/rejected": -16.890625, "step": 2990 }, { "epoch": 2.634167764602547, "grad_norm": 9.764671463642909, "learning_rate": 3.4116871704745167e-07, "logits/chosen": -0.3751464784145355, "logits/rejected": -0.47285157442092896, "logps/chosen": -311.8500061035156, "logps/rejected": -511.79998779296875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -4.645312309265137, "rewards/margins": 10.515625, "rewards/rejected": -15.165624618530273, "step": 3000 }, { "epoch": 2.642951251646904, "grad_norm": 5.295918352062501, "learning_rate": 3.389718804920914e-07, "logits/chosen": -0.34174805879592896, "logits/rejected": -0.5933837890625, "logps/chosen": -313.20001220703125, "logps/rejected": -541.7000122070312, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.809374809265137, "rewards/margins": 11.115625381469727, "rewards/rejected": -15.915624618530273, "step": 3010 }, { "epoch": 2.6517347386912604, "grad_norm": 5.873870724252253, "learning_rate": 3.3677504393673114e-07, "logits/chosen": -0.5889037847518921, "logits/rejected": -0.6572510004043579, "logps/chosen": -317.20001220703125, "logps/rejected": -546.7000122070312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.73046875, "rewards/margins": 11.662500381469727, "rewards/rejected": -16.387500762939453, "step": 3020 }, { "epoch": 2.660518225735617, "grad_norm": 0.7644653864675504, "learning_rate": 3.3457820738137084e-07, "logits/chosen": -0.4492431581020355, "logits/rejected": -0.4465576112270355, "logps/chosen": -297.79998779296875, "logps/rejected": -509.1000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.252539157867432, "rewards/margins": 11.756250381469727, "rewards/rejected": -16.012500762939453, "step": 3030 }, { "epoch": 2.6693017127799736, "grad_norm": 0.9257951476084167, "learning_rate": 3.3238137082601055e-07, "logits/chosen": -0.47760009765625, "logits/rejected": -0.659375011920929, "logps/chosen": -346.95001220703125, "logps/rejected": -597.2999877929688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -5.917187690734863, "rewards/margins": 11.53125, "rewards/rejected": -17.453125, "step": 3040 }, { "epoch": 2.6780851998243302, "grad_norm": 0.5399374455796954, "learning_rate": 3.3018453427065026e-07, "logits/chosen": -0.5033935308456421, "logits/rejected": -0.61279296875, "logps/chosen": -346.5, "logps/rejected": -532.9000244140625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.9375, "rewards/margins": 10.987500190734863, "rewards/rejected": -15.909375190734863, "step": 3050 }, { "epoch": 2.686868686868687, "grad_norm": 0.2560300614518703, "learning_rate": 3.2798769771528997e-07, "logits/chosen": -0.45380860567092896, "logits/rejected": -0.631030261516571, "logps/chosen": -330.70001220703125, "logps/rejected": -541.4000244140625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -4.6708984375, "rewards/margins": 11.78125, "rewards/rejected": -16.459375381469727, "step": 3060 }, { "epoch": 2.6956521739130435, "grad_norm": 91.48705138354713, "learning_rate": 3.2579086115992973e-07, "logits/chosen": -0.4126953184604645, "logits/rejected": -0.6199706792831421, "logps/chosen": -304.29998779296875, "logps/rejected": -527.5999755859375, "loss": 0.0125, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.907031059265137, "rewards/margins": 11.228124618530273, "rewards/rejected": -16.134374618530273, "step": 3070 }, { "epoch": 2.7044356609574, "grad_norm": 14.64637064610325, "learning_rate": 3.2359402460456944e-07, "logits/chosen": -0.5648910403251648, "logits/rejected": -0.5831664800643921, "logps/chosen": -340.5, "logps/rejected": -540.0, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -4.522656440734863, "rewards/margins": 10.778124809265137, "rewards/rejected": -15.287500381469727, "step": 3080 }, { "epoch": 2.7132191480017567, "grad_norm": 3.2718331679664545, "learning_rate": 3.2139718804920914e-07, "logits/chosen": -0.5290893316268921, "logits/rejected": -0.5699218511581421, "logps/chosen": -308.8500061035156, "logps/rejected": -520.9000244140625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.353906154632568, "rewards/margins": 10.589062690734863, "rewards/rejected": -14.928125381469727, "step": 3090 }, { "epoch": 2.7220026350461133, "grad_norm": 2.1500285754823687, "learning_rate": 3.1920035149384885e-07, "logits/chosen": -0.3724121153354645, "logits/rejected": -0.45599669218063354, "logps/chosen": -273.1000061035156, "logps/rejected": -484.79998779296875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -4.213281154632568, "rewards/margins": 10.100000381469727, "rewards/rejected": -14.337499618530273, "step": 3100 }, { "epoch": 2.73078612209047, "grad_norm": 1.3575472143265013, "learning_rate": 3.1700351493848856e-07, "logits/chosen": -0.40112000703811646, "logits/rejected": -0.5287231206893921, "logps/chosen": -326.20001220703125, "logps/rejected": -569.0999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.142968654632568, "rewards/margins": 12.090624809265137, "rewards/rejected": -16.212499618530273, "step": 3110 }, { "epoch": 2.7395696091348265, "grad_norm": 1.0312123093061838, "learning_rate": 3.148066783831283e-07, "logits/chosen": -0.4745727479457855, "logits/rejected": -0.620898425579071, "logps/chosen": -319.8999938964844, "logps/rejected": -503.45001220703125, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.528906345367432, "rewards/margins": 10.771875381469727, "rewards/rejected": -15.300000190734863, "step": 3120 }, { "epoch": 2.748353096179183, "grad_norm": 0.6871905330276499, "learning_rate": 3.1260984182776803e-07, "logits/chosen": -0.4146972596645355, "logits/rejected": -0.567944347858429, "logps/chosen": -302.42498779296875, "logps/rejected": -495.70001220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.281640529632568, "rewards/margins": 11.314062118530273, "rewards/rejected": -15.587499618530273, "step": 3130 }, { "epoch": 2.7571365832235397, "grad_norm": 10.682426915121416, "learning_rate": 3.1041300527240773e-07, "logits/chosen": -0.20505371689796448, "logits/rejected": -0.5253967046737671, "logps/chosen": -327.5, "logps/rejected": -579.7999877929688, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.353125095367432, "rewards/margins": 12.431249618530273, "rewards/rejected": -16.787500381469727, "step": 3140 }, { "epoch": 2.7659200702678963, "grad_norm": 0.9752346164858108, "learning_rate": 3.0821616871704744e-07, "logits/chosen": -0.5622192621231079, "logits/rejected": -0.6981445550918579, "logps/chosen": -284.70001220703125, "logps/rejected": -530.7999877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.678515434265137, "rewards/margins": 11.75, "rewards/rejected": -16.428125381469727, "step": 3150 }, { "epoch": 2.774703557312253, "grad_norm": 0.42877503439067033, "learning_rate": 3.060193321616872e-07, "logits/chosen": -0.5245605707168579, "logits/rejected": -0.592480480670929, "logps/chosen": -350.25, "logps/rejected": -546.7000122070312, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -4.925000190734863, "rewards/margins": 11.073437690734863, "rewards/rejected": -15.996874809265137, "step": 3160 }, { "epoch": 2.7834870443566095, "grad_norm": 2.5028506890591067, "learning_rate": 3.038224956063269e-07, "logits/chosen": -0.4533935487270355, "logits/rejected": -0.640673816204071, "logps/chosen": -286.6499938964844, "logps/rejected": -510.8500061035156, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.190625190734863, "rewards/margins": 12.076562881469727, "rewards/rejected": -16.259374618530273, "step": 3170 }, { "epoch": 2.792270531400966, "grad_norm": 10.252978480967812, "learning_rate": 3.016256590509666e-07, "logits/chosen": -0.44575196504592896, "logits/rejected": -0.5320068597793579, "logps/chosen": -284.0, "logps/rejected": -518.4000244140625, "loss": 0.0174, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.594531059265137, "rewards/margins": 11.037500381469727, "rewards/rejected": -15.634374618530273, "step": 3180 }, { "epoch": 2.8010540184453228, "grad_norm": 7.848864819561687, "learning_rate": 2.994288224956063e-07, "logits/chosen": -0.36247557401657104, "logits/rejected": -0.7054687738418579, "logps/chosen": -271.04998779296875, "logps/rejected": -485.1000061035156, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -4.048437595367432, "rewards/margins": 10.453125, "rewards/rejected": -14.496874809265137, "step": 3190 }, { "epoch": 2.8098375054896794, "grad_norm": 3.0225472325442757, "learning_rate": 2.9723198594024603e-07, "logits/chosen": -0.3984130918979645, "logits/rejected": -0.581591784954071, "logps/chosen": -275.29998779296875, "logps/rejected": -528.9000244140625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.544140815734863, "rewards/margins": 11.425000190734863, "rewards/rejected": -15.96875, "step": 3200 }, { "epoch": 2.818620992534036, "grad_norm": 3.088898109652684, "learning_rate": 2.950351493848858e-07, "logits/chosen": -0.698559582233429, "logits/rejected": -0.779101550579071, "logps/chosen": -298.54998779296875, "logps/rejected": -465.29998779296875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -4.942968845367432, "rewards/margins": 10.142187118530273, "rewards/rejected": -15.084375381469727, "step": 3210 }, { "epoch": 2.8274044795783926, "grad_norm": 2.0980672733307566, "learning_rate": 2.928383128295255e-07, "logits/chosen": -0.6407226324081421, "logits/rejected": -0.665209949016571, "logps/chosen": -337.6000061035156, "logps/rejected": -561.7000122070312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.978125095367432, "rewards/margins": 11.475000381469727, "rewards/rejected": -16.456249237060547, "step": 3220 }, { "epoch": 2.836187966622749, "grad_norm": 1.1844741272397061, "learning_rate": 2.906414762741652e-07, "logits/chosen": -0.504223644733429, "logits/rejected": -0.54864501953125, "logps/chosen": -308.79998779296875, "logps/rejected": -508.3999938964844, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -4.219531059265137, "rewards/margins": 10.784375190734863, "rewards/rejected": -14.981249809265137, "step": 3230 }, { "epoch": 2.844971453667106, "grad_norm": 9.914496662647146, "learning_rate": 2.884446397188049e-07, "logits/chosen": -0.4930664002895355, "logits/rejected": -0.709277331829071, "logps/chosen": -311.70001220703125, "logps/rejected": -538.9000244140625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -4.564843654632568, "rewards/margins": 11.625, "rewards/rejected": -16.193750381469727, "step": 3240 }, { "epoch": 2.8537549407114624, "grad_norm": 15.229303576757754, "learning_rate": 2.862478031634446e-07, "logits/chosen": -0.4490966796875, "logits/rejected": -0.4404296875, "logps/chosen": -290.8999938964844, "logps/rejected": -505.79998779296875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.3125, "rewards/margins": 9.951562881469727, "rewards/rejected": -14.25, "step": 3250 }, { "epoch": 2.862538427755819, "grad_norm": 1.3236118558852696, "learning_rate": 2.840509666080844e-07, "logits/chosen": -0.5537353754043579, "logits/rejected": -0.6509765386581421, "logps/chosen": -340.95001220703125, "logps/rejected": -543.2000122070312, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.659765720367432, "rewards/margins": 10.412500381469727, "rewards/rejected": -15.074999809265137, "step": 3260 }, { "epoch": 2.8713219148001756, "grad_norm": 2.5734892571720893, "learning_rate": 2.818541300527241e-07, "logits/chosen": -0.5249999761581421, "logits/rejected": -0.624316394329071, "logps/chosen": -263.3500061035156, "logps/rejected": -481.20001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.950000047683716, "rewards/margins": 10.765625, "rewards/rejected": -14.725000381469727, "step": 3270 }, { "epoch": 2.8801054018445322, "grad_norm": 11.304099440055024, "learning_rate": 2.796572934973638e-07, "logits/chosen": -0.4389404356479645, "logits/rejected": -0.49687498807907104, "logps/chosen": -314.45001220703125, "logps/rejected": -490.0, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.178808689117432, "rewards/margins": 10.743749618530273, "rewards/rejected": -14.918749809265137, "step": 3280 }, { "epoch": 2.888888888888889, "grad_norm": 1.6728224655167592, "learning_rate": 2.774604569420035e-07, "logits/chosen": -0.38946533203125, "logits/rejected": -0.4373535215854645, "logps/chosen": -298.75, "logps/rejected": -498.6000061035156, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -4.462109565734863, "rewards/margins": 10.800000190734863, "rewards/rejected": -15.253125190734863, "step": 3290 }, { "epoch": 2.8976723759332454, "grad_norm": 0.25274380122781387, "learning_rate": 2.7526362038664327e-07, "logits/chosen": -0.33930665254592896, "logits/rejected": -0.4881347715854645, "logps/chosen": -280.25, "logps/rejected": -483.6000061035156, "loss": 0.0094, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.532812595367432, "rewards/margins": 11.109375, "rewards/rejected": -15.640625, "step": 3300 }, { "epoch": 2.906455862977602, "grad_norm": 0.1544532541104994, "learning_rate": 2.73066783831283e-07, "logits/chosen": -0.4963440001010895, "logits/rejected": -0.598583996295929, "logps/chosen": -324.25, "logps/rejected": -555.2999877929688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.901953220367432, "rewards/margins": 12.265625, "rewards/rejected": -17.15625, "step": 3310 }, { "epoch": 2.9152393500219587, "grad_norm": 0.5066800286376364, "learning_rate": 2.708699472759227e-07, "logits/chosen": -0.4068359434604645, "logits/rejected": -0.4866943359375, "logps/chosen": -337.1499938964844, "logps/rejected": -487.1000061035156, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -4.911718845367432, "rewards/margins": 9.856249809265137, "rewards/rejected": -14.765625, "step": 3320 }, { "epoch": 2.9240228370663153, "grad_norm": 1.1063603089083296, "learning_rate": 2.686731107205624e-07, "logits/chosen": -0.572583019733429, "logits/rejected": -0.5684570074081421, "logps/chosen": -299.1499938964844, "logps/rejected": -510.20001220703125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -4.404687404632568, "rewards/margins": 10.746874809265137, "rewards/rejected": -15.140625, "step": 3330 }, { "epoch": 2.932806324110672, "grad_norm": 6.280125749340474, "learning_rate": 2.664762741652021e-07, "logits/chosen": -0.45579832792282104, "logits/rejected": -0.650195300579071, "logps/chosen": -284.5, "logps/rejected": -492.5, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.497265815734863, "rewards/margins": 10.837499618530273, "rewards/rejected": -15.328125, "step": 3340 }, { "epoch": 2.9415898111550285, "grad_norm": 0.3801076899570813, "learning_rate": 2.6427943760984186e-07, "logits/chosen": -0.39887696504592896, "logits/rejected": -0.5736328363418579, "logps/chosen": -326.6499938964844, "logps/rejected": -584.0999755859375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -5.340624809265137, "rewards/margins": 11.899999618530273, "rewards/rejected": -17.243749618530273, "step": 3350 }, { "epoch": 2.950373298199385, "grad_norm": 0.9652969878837827, "learning_rate": 2.6208260105448157e-07, "logits/chosen": -0.4768310487270355, "logits/rejected": -0.5660156011581421, "logps/chosen": -315.8500061035156, "logps/rejected": -536.4000244140625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.846875190734863, "rewards/margins": 11.146875381469727, "rewards/rejected": -16.0, "step": 3360 }, { "epoch": 2.9591567852437417, "grad_norm": 36.565213674625845, "learning_rate": 2.598857644991213e-07, "logits/chosen": -0.5627685785293579, "logits/rejected": -0.640332043170929, "logps/chosen": -320.79998779296875, "logps/rejected": -534.9000244140625, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -5.392187595367432, "rewards/margins": 11.270312309265137, "rewards/rejected": -16.671875, "step": 3370 }, { "epoch": 2.9679402722880983, "grad_norm": 2.216574940291986, "learning_rate": 2.57688927943761e-07, "logits/chosen": -0.4574951231479645, "logits/rejected": -0.779589831829071, "logps/chosen": -379.29998779296875, "logps/rejected": -572.5, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.489843845367432, "rewards/margins": 11.893750190734863, "rewards/rejected": -17.390625, "step": 3380 }, { "epoch": 2.976723759332455, "grad_norm": 0.40925350045993514, "learning_rate": 2.5549209138840064e-07, "logits/chosen": -0.4876464903354645, "logits/rejected": -0.66064453125, "logps/chosen": -313.6000061035156, "logps/rejected": -549.2000122070312, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.717968940734863, "rewards/margins": 11.903124809265137, "rewards/rejected": -16.612499237060547, "step": 3390 }, { "epoch": 2.9855072463768115, "grad_norm": 11.517510635696361, "learning_rate": 2.5329525483304045e-07, "logits/chosen": -0.3590087890625, "logits/rejected": -0.46257323026657104, "logps/chosen": -277.3999938964844, "logps/rejected": -503.29998779296875, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.45703125, "rewards/margins": 10.392187118530273, "rewards/rejected": -14.84375, "step": 3400 }, { "epoch": 2.994290733421168, "grad_norm": 19.942890864371623, "learning_rate": 2.5109841827768016e-07, "logits/chosen": -0.4447021484375, "logits/rejected": -0.5888305902481079, "logps/chosen": -301.1000061035156, "logps/rejected": -481.6000061035156, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.153124809265137, "rewards/margins": 10.885937690734863, "rewards/rejected": -15.043749809265137, "step": 3410 }, { "epoch": 3.002635046113307, "grad_norm": 0.0587504755490277, "learning_rate": 2.489015817223198e-07, "logits/chosen": -0.5388054847717285, "logits/rejected": -0.6298571228981018, "logps/chosen": -331.15789794921875, "logps/rejected": -612.4210815429688, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.9884867668151855, "rewards/margins": 12.6875, "rewards/rejected": -17.6842098236084, "step": 3420 }, { "epoch": 3.0114185331576637, "grad_norm": 0.07012424769062564, "learning_rate": 2.467047451669596e-07, "logits/chosen": -0.4456115663051605, "logits/rejected": -0.6268066167831421, "logps/chosen": -299.07501220703125, "logps/rejected": -523.0999755859375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.291796684265137, "rewards/margins": 11.84375, "rewards/rejected": -16.143749237060547, "step": 3430 }, { "epoch": 3.0202020202020203, "grad_norm": 0.04367284449026095, "learning_rate": 2.445079086115993e-07, "logits/chosen": -0.55078125, "logits/rejected": -0.530322253704071, "logps/chosen": -310.0, "logps/rejected": -525.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.585546970367432, "rewards/margins": 12.165624618530273, "rewards/rejected": -16.762500762939453, "step": 3440 }, { "epoch": 3.028985507246377, "grad_norm": 0.03518193849443363, "learning_rate": 2.42311072056239e-07, "logits/chosen": -0.5939239263534546, "logits/rejected": -0.636474609375, "logps/chosen": -336.8500061035156, "logps/rejected": -556.7000122070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.836718559265137, "rewards/margins": 12.96875, "rewards/rejected": -17.799999237060547, "step": 3450 }, { "epoch": 3.0377689942907335, "grad_norm": 0.7216912034197285, "learning_rate": 2.401142355008787e-07, "logits/chosen": -0.4346557557582855, "logits/rejected": -0.67578125, "logps/chosen": -326.0, "logps/rejected": -526.7999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.220312595367432, "rewards/margins": 12.378125190734863, "rewards/rejected": -17.590625762939453, "step": 3460 }, { "epoch": 3.04655248133509, "grad_norm": 3.3135553052952833, "learning_rate": 2.3791739894551843e-07, "logits/chosen": -0.3838745057582855, "logits/rejected": -0.597705066204071, "logps/chosen": -311.75, "logps/rejected": -543.5999755859375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.508593559265137, "rewards/margins": 12.681249618530273, "rewards/rejected": -18.168750762939453, "step": 3470 }, { "epoch": 3.0553359683794468, "grad_norm": 0.17238004378534122, "learning_rate": 2.3572056239015817e-07, "logits/chosen": -0.4697509706020355, "logits/rejected": -0.6309570074081421, "logps/chosen": -287.0, "logps/rejected": -509.6499938964844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -4.296191215515137, "rewards/margins": 12.300000190734863, "rewards/rejected": -16.596874237060547, "step": 3480 }, { "epoch": 3.0641194554238034, "grad_norm": 2.804610449463222, "learning_rate": 2.3352372583479788e-07, "logits/chosen": -0.4029479920864105, "logits/rejected": -0.6070801019668579, "logps/chosen": -281.75, "logps/rejected": -484.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.346093654632568, "rewards/margins": 11.149999618530273, "rewards/rejected": -15.509374618530273, "step": 3490 }, { "epoch": 3.07290294246816, "grad_norm": 0.057149876301900195, "learning_rate": 2.313268892794376e-07, "logits/chosen": -0.6165100336074829, "logits/rejected": -0.7337890863418579, "logps/chosen": -305.8999938964844, "logps/rejected": -534.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.8486328125, "rewards/margins": 12.353124618530273, "rewards/rejected": -17.215625762939453, "step": 3500 }, { "epoch": 3.0816864295125166, "grad_norm": 0.22811537325384781, "learning_rate": 2.2913005272407732e-07, "logits/chosen": -0.4875244200229645, "logits/rejected": -0.5455566644668579, "logps/chosen": -340.70001220703125, "logps/rejected": -562.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.987500190734863, "rewards/margins": 12.168749809265137, "rewards/rejected": -18.146875381469727, "step": 3510 }, { "epoch": 3.090469916556873, "grad_norm": 0.021343716137999312, "learning_rate": 2.2693321616871705e-07, "logits/chosen": -0.3146118223667145, "logits/rejected": -0.514385998249054, "logps/chosen": -311.29998779296875, "logps/rejected": -541.2999877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.885595798492432, "rewards/margins": 13.100000381469727, "rewards/rejected": -17.990625381469727, "step": 3520 }, { "epoch": 3.09925340360123, "grad_norm": 0.12417879870075779, "learning_rate": 2.2473637961335676e-07, "logits/chosen": -0.344482421875, "logits/rejected": -0.4461914002895355, "logps/chosen": -289.54998779296875, "logps/rejected": -564.4000244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.400000095367432, "rewards/margins": 13.162500381469727, "rewards/rejected": -18.546875, "step": 3530 }, { "epoch": 3.1080368906455864, "grad_norm": 0.2018047665978749, "learning_rate": 2.2253954305799647e-07, "logits/chosen": -0.37446290254592896, "logits/rejected": -0.3543457090854645, "logps/chosen": -317.04998779296875, "logps/rejected": -555.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.058984279632568, "rewards/margins": 13.403124809265137, "rewards/rejected": -18.459375381469727, "step": 3540 }, { "epoch": 3.116820377689943, "grad_norm": 0.5062026766405988, "learning_rate": 2.203427065026362e-07, "logits/chosen": -0.27659910917282104, "logits/rejected": -0.4387756288051605, "logps/chosen": -295.6000061035156, "logps/rejected": -549.7000122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.474218845367432, "rewards/margins": 13.118749618530273, "rewards/rejected": -18.59375, "step": 3550 }, { "epoch": 3.1256038647342996, "grad_norm": 14.703379176269507, "learning_rate": 2.181458699472759e-07, "logits/chosen": -0.46748048067092896, "logits/rejected": -0.640185534954071, "logps/chosen": -263.3500061035156, "logps/rejected": -482.3500061035156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.791211128234863, "rewards/margins": 13.259374618530273, "rewards/rejected": -18.056249618530273, "step": 3560 }, { "epoch": 3.1343873517786562, "grad_norm": 0.1444566521134434, "learning_rate": 2.1594903339191564e-07, "logits/chosen": -0.534515380859375, "logits/rejected": -0.682080090045929, "logps/chosen": -331.6000061035156, "logps/rejected": -608.5, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.124218940734863, "rewards/margins": 13.040624618530273, "rewards/rejected": -19.15625, "step": 3570 }, { "epoch": 3.143170838823013, "grad_norm": 4.331971375904906, "learning_rate": 2.1375219683655535e-07, "logits/chosen": -0.36004638671875, "logits/rejected": -0.5852603912353516, "logps/chosen": -335.6499938964844, "logps/rejected": -553.5999755859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.526562690734863, "rewards/margins": 12.765625, "rewards/rejected": -18.284374237060547, "step": 3580 }, { "epoch": 3.1519543258673695, "grad_norm": 0.2916502685656478, "learning_rate": 2.1155536028119509e-07, "logits/chosen": -0.39570313692092896, "logits/rejected": -0.49169921875, "logps/chosen": -316.3500061035156, "logps/rejected": -546.2999877929688, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.323828220367432, "rewards/margins": 13.040624618530273, "rewards/rejected": -18.371875762939453, "step": 3590 }, { "epoch": 3.160737812911726, "grad_norm": 0.12229620717068679, "learning_rate": 2.093585237258348e-07, "logits/chosen": -0.3935302793979645, "logits/rejected": -0.551684558391571, "logps/chosen": -310.32501220703125, "logps/rejected": -539.2999877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.795312404632568, "rewards/margins": 12.709375381469727, "rewards/rejected": -17.503124237060547, "step": 3600 }, { "epoch": 3.1695212999560827, "grad_norm": 0.6629752909845313, "learning_rate": 2.071616871704745e-07, "logits/chosen": -0.48927611112594604, "logits/rejected": -0.6009277105331421, "logps/chosen": -293.20001220703125, "logps/rejected": -492.6000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.033593654632568, "rewards/margins": 11.946874618530273, "rewards/rejected": -16.987499237060547, "step": 3610 }, { "epoch": 3.1783047870004393, "grad_norm": 0.1354130597721949, "learning_rate": 2.0496485061511424e-07, "logits/chosen": -0.48091429471969604, "logits/rejected": -0.5238281488418579, "logps/chosen": -313.0, "logps/rejected": -559.2999877929688, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -5.739843845367432, "rewards/margins": 12.790624618530273, "rewards/rejected": -18.528125762939453, "step": 3620 }, { "epoch": 3.187088274044796, "grad_norm": 1.027853131109502, "learning_rate": 2.0276801405975394e-07, "logits/chosen": -0.5071045160293579, "logits/rejected": -0.5523437261581421, "logps/chosen": -342.79998779296875, "logps/rejected": -569.7999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.787499904632568, "rewards/margins": 12.743749618530273, "rewards/rejected": -18.556249618530273, "step": 3630 }, { "epoch": 3.1958717610891525, "grad_norm": 0.04941845984964749, "learning_rate": 2.0057117750439368e-07, "logits/chosen": -0.44682615995407104, "logits/rejected": -0.4822021424770355, "logps/chosen": -305.75, "logps/rejected": -545.7999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.321093559265137, "rewards/margins": 13.221875190734863, "rewards/rejected": -18.524999618530273, "step": 3640 }, { "epoch": 3.204655248133509, "grad_norm": 0.3647431342138875, "learning_rate": 1.9837434094903339e-07, "logits/chosen": -0.4539550840854645, "logits/rejected": -0.6092773675918579, "logps/chosen": -289.8500061035156, "logps/rejected": -583.2999877929688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.383593559265137, "rewards/margins": 14.053125381469727, "rewards/rejected": -19.434375762939453, "step": 3650 }, { "epoch": 3.2134387351778657, "grad_norm": 0.14589697592733497, "learning_rate": 1.9617750439367312e-07, "logits/chosen": -0.5714355707168579, "logits/rejected": -0.5230957269668579, "logps/chosen": -323.75, "logps/rejected": -544.5999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.805468559265137, "rewards/margins": 12.475000381469727, "rewards/rejected": -18.274999618530273, "step": 3660 }, { "epoch": 3.2222222222222223, "grad_norm": 0.09758646843581241, "learning_rate": 1.9398066783831283e-07, "logits/chosen": -0.6319946050643921, "logits/rejected": -0.534405529499054, "logps/chosen": -318.79998779296875, "logps/rejected": -534.5499877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -4.999218940734863, "rewards/margins": 13.853124618530273, "rewards/rejected": -18.865625381469727, "step": 3670 }, { "epoch": 3.231005709266579, "grad_norm": 0.1387897514689098, "learning_rate": 1.9178383128295253e-07, "logits/chosen": -0.558032214641571, "logits/rejected": -0.667065441608429, "logps/chosen": -315.6499938964844, "logps/rejected": -532.2999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.630468845367432, "rewards/margins": 12.168749809265137, "rewards/rejected": -17.787500381469727, "step": 3680 }, { "epoch": 3.2397891963109355, "grad_norm": 0.07001536530977417, "learning_rate": 1.8958699472759227e-07, "logits/chosen": -0.541308581829071, "logits/rejected": -0.5582031011581421, "logps/chosen": -348.07501220703125, "logps/rejected": -538.4000244140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.290625095367432, "rewards/margins": 12.428125381469727, "rewards/rejected": -17.706249237060547, "step": 3690 }, { "epoch": 3.248572683355292, "grad_norm": 0.3693004106041826, "learning_rate": 1.8739015817223198e-07, "logits/chosen": -0.3478637635707855, "logits/rejected": -0.634905993938446, "logps/chosen": -316.70001220703125, "logps/rejected": -536.5999755859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.672656059265137, "rewards/margins": 12.359375, "rewards/rejected": -18.024999618530273, "step": 3700 }, { "epoch": 3.2573561703996488, "grad_norm": 0.06952413183537078, "learning_rate": 1.851933216168717e-07, "logits/chosen": -0.49371337890625, "logits/rejected": -0.614794909954071, "logps/chosen": -330.5, "logps/rejected": -562.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.840234279632568, "rewards/margins": 13.662500381469727, "rewards/rejected": -19.5, "step": 3710 }, { "epoch": 3.2661396574440054, "grad_norm": 0.08095931570793916, "learning_rate": 1.8299648506151142e-07, "logits/chosen": -0.24246826767921448, "logits/rejected": -0.566845715045929, "logps/chosen": -333.3500061035156, "logps/rejected": -571.9000244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.537499904632568, "rewards/margins": 12.715624809265137, "rewards/rejected": -19.25, "step": 3720 }, { "epoch": 3.274923144488362, "grad_norm": 0.06061069428025373, "learning_rate": 1.8079964850615115e-07, "logits/chosen": -0.32911378145217896, "logits/rejected": -0.5960937738418579, "logps/chosen": -313.125, "logps/rejected": -554.0999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.826562404632568, "rewards/margins": 13.068750381469727, "rewards/rejected": -18.899999618530273, "step": 3730 }, { "epoch": 3.2837066315327186, "grad_norm": 0.03768579676101955, "learning_rate": 1.7860281195079086e-07, "logits/chosen": -0.577862560749054, "logits/rejected": -0.616943359375, "logps/chosen": -314.5, "logps/rejected": -562.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.1171875, "rewards/margins": 12.778124809265137, "rewards/rejected": -18.887500762939453, "step": 3740 }, { "epoch": 3.292490118577075, "grad_norm": 0.013928735597646898, "learning_rate": 1.7640597539543057e-07, "logits/chosen": -0.47514647245407104, "logits/rejected": -0.504321277141571, "logps/chosen": -336.8999938964844, "logps/rejected": -530.7000122070312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -5.296875, "rewards/margins": 12.465624809265137, "rewards/rejected": -17.768749237060547, "step": 3750 }, { "epoch": 3.301273605621432, "grad_norm": 0.039305790150571215, "learning_rate": 1.742091388400703e-07, "logits/chosen": -0.590869128704071, "logits/rejected": -0.6766357421875, "logps/chosen": -334.6000061035156, "logps/rejected": -613.0999755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.609375, "rewards/margins": 13.71875, "rewards/rejected": -20.309375762939453, "step": 3760 }, { "epoch": 3.3100570926657884, "grad_norm": 0.06995313734246181, "learning_rate": 1.7201230228471e-07, "logits/chosen": -0.37846678495407104, "logits/rejected": -0.5860840082168579, "logps/chosen": -381.29998779296875, "logps/rejected": -624.0999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.080468654632568, "rewards/margins": 14.675000190734863, "rewards/rejected": -20.762500762939453, "step": 3770 }, { "epoch": 3.318840579710145, "grad_norm": 0.1319701107720784, "learning_rate": 1.6981546572934974e-07, "logits/chosen": -0.50787353515625, "logits/rejected": -0.585693359375, "logps/chosen": -328.1499938964844, "logps/rejected": -568.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.712500095367432, "rewards/margins": 13.481249809265137, "rewards/rejected": -19.203125, "step": 3780 }, { "epoch": 3.3276240667545016, "grad_norm": 0.03744637417696784, "learning_rate": 1.6761862917398945e-07, "logits/chosen": -0.31309813261032104, "logits/rejected": -0.536480724811554, "logps/chosen": -264.3500061035156, "logps/rejected": -497.79998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -4.630468845367432, "rewards/margins": 12.756250381469727, "rewards/rejected": -17.375, "step": 3790 }, { "epoch": 3.3364075537988582, "grad_norm": 0.4739392649390637, "learning_rate": 1.6542179261862919e-07, "logits/chosen": -0.4293456971645355, "logits/rejected": -0.68896484375, "logps/chosen": -280.3999938964844, "logps/rejected": -533.2000122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.193749904632568, "rewards/margins": 12.981249809265137, "rewards/rejected": -18.181249618530273, "step": 3800 }, { "epoch": 3.345191040843215, "grad_norm": 0.096599270565019, "learning_rate": 1.632249560632689e-07, "logits/chosen": -0.4119628965854645, "logits/rejected": -0.66845703125, "logps/chosen": -358.0, "logps/rejected": -573.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.8203125, "rewards/margins": 12.581250190734863, "rewards/rejected": -18.390625, "step": 3810 }, { "epoch": 3.3539745278875714, "grad_norm": 0.18683943134370976, "learning_rate": 1.610281195079086e-07, "logits/chosen": -0.4212280213832855, "logits/rejected": -0.7068115472793579, "logps/chosen": -309.45001220703125, "logps/rejected": -578.2999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.76953125, "rewards/margins": 13.181249618530273, "rewards/rejected": -18.953125, "step": 3820 }, { "epoch": 3.362758014931928, "grad_norm": 3.391182221945087, "learning_rate": 1.5883128295254834e-07, "logits/chosen": -0.4910217225551605, "logits/rejected": -0.6852051019668579, "logps/chosen": -331.8500061035156, "logps/rejected": -586.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -5.95703125, "rewards/margins": 13.387499809265137, "rewards/rejected": -19.346874237060547, "step": 3830 }, { "epoch": 3.3715415019762847, "grad_norm": 0.3391484375765767, "learning_rate": 1.5663444639718804e-07, "logits/chosen": -0.3464721739292145, "logits/rejected": -0.4462890625, "logps/chosen": -314.8500061035156, "logps/rejected": -541.7000122070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.482031345367432, "rewards/margins": 13.853124618530273, "rewards/rejected": -19.331249237060547, "step": 3840 }, { "epoch": 3.3803249890206413, "grad_norm": 0.046261956201503196, "learning_rate": 1.5443760984182778e-07, "logits/chosen": -0.49821776151657104, "logits/rejected": -0.5532165765762329, "logps/chosen": -338.54998779296875, "logps/rejected": -565.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.3828125, "rewards/margins": 13.606249809265137, "rewards/rejected": -20.015625, "step": 3850 }, { "epoch": 3.389108476064998, "grad_norm": 0.2863412310583335, "learning_rate": 1.5224077328646749e-07, "logits/chosen": -0.4059081971645355, "logits/rejected": -0.552075207233429, "logps/chosen": -320.25, "logps/rejected": -514.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.430078029632568, "rewards/margins": 12.412500381469727, "rewards/rejected": -17.850000381469727, "step": 3860 }, { "epoch": 3.3978919631093545, "grad_norm": 2.0521477848087746, "learning_rate": 1.5004393673110722e-07, "logits/chosen": -0.4094100892543793, "logits/rejected": -0.58203125, "logps/chosen": -325.875, "logps/rejected": -551.9000244140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.658593654632568, "rewards/margins": 12.949999809265137, "rewards/rejected": -18.606250762939453, "step": 3870 }, { "epoch": 3.406675450153711, "grad_norm": 1.6102352952681218, "learning_rate": 1.4784710017574693e-07, "logits/chosen": -0.23146972060203552, "logits/rejected": -0.4781250059604645, "logps/chosen": -314.6000061035156, "logps/rejected": -570.7999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.599218845367432, "rewards/margins": 13.546875, "rewards/rejected": -19.162500381469727, "step": 3880 }, { "epoch": 3.4154589371980677, "grad_norm": 0.26325172589636325, "learning_rate": 1.4565026362038664e-07, "logits/chosen": -0.3239990174770355, "logits/rejected": -0.524707019329071, "logps/chosen": -328.29998779296875, "logps/rejected": -551.2000122070312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.85546875, "rewards/margins": 13.290624618530273, "rewards/rejected": -19.140625, "step": 3890 }, { "epoch": 3.4242424242424243, "grad_norm": 0.08570314287126546, "learning_rate": 1.4345342706502637e-07, "logits/chosen": -0.55743408203125, "logits/rejected": -0.744824230670929, "logps/chosen": -351.5, "logps/rejected": -579.0999755859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.90234375, "rewards/margins": 13.018750190734863, "rewards/rejected": -18.924999237060547, "step": 3900 }, { "epoch": 3.433025911286781, "grad_norm": 0.6403333211012159, "learning_rate": 1.4125659050966608e-07, "logits/chosen": -0.58740234375, "logits/rejected": -0.5416015386581421, "logps/chosen": -348.1000061035156, "logps/rejected": -584.7000122070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.010156154632568, "rewards/margins": 13.271875381469727, "rewards/rejected": -19.290624618530273, "step": 3910 }, { "epoch": 3.4418093983311375, "grad_norm": 0.28318003791320656, "learning_rate": 1.390597539543058e-07, "logits/chosen": -0.46213990449905396, "logits/rejected": -0.6983642578125, "logps/chosen": -315.25, "logps/rejected": -529.2000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.891406059265137, "rewards/margins": 13.256250381469727, "rewards/rejected": -19.146875381469727, "step": 3920 }, { "epoch": 3.450592885375494, "grad_norm": 0.42584755351364867, "learning_rate": 1.3686291739894552e-07, "logits/chosen": -0.4601806700229645, "logits/rejected": -0.62811279296875, "logps/chosen": -282.3999938964844, "logps/rejected": -567.4000244140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.463281154632568, "rewards/margins": 13.390625, "rewards/rejected": -18.846874237060547, "step": 3930 }, { "epoch": 3.4593763724198507, "grad_norm": 0.08370260886915823, "learning_rate": 1.3466608084358525e-07, "logits/chosen": -0.550830066204071, "logits/rejected": -0.688769519329071, "logps/chosen": -291.70001220703125, "logps/rejected": -577.2000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.275000095367432, "rewards/margins": 13.340624809265137, "rewards/rejected": -18.606250762939453, "step": 3940 }, { "epoch": 3.4681598594642074, "grad_norm": 4.93202067654964, "learning_rate": 1.3246924428822496e-07, "logits/chosen": -0.48662108182907104, "logits/rejected": -0.597485363483429, "logps/chosen": -315.45001220703125, "logps/rejected": -535.0999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.650000095367432, "rewards/margins": 13.606249809265137, "rewards/rejected": -19.262500762939453, "step": 3950 }, { "epoch": 3.476943346508564, "grad_norm": 0.1720811644865383, "learning_rate": 1.3027240773286467e-07, "logits/chosen": -0.5064452886581421, "logits/rejected": -0.6212402582168579, "logps/chosen": -294.1000061035156, "logps/rejected": -517.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.676562309265137, "rewards/margins": 12.800000190734863, "rewards/rejected": -18.465625762939453, "step": 3960 }, { "epoch": 3.4857268335529206, "grad_norm": 0.056000706529879506, "learning_rate": 1.280755711775044e-07, "logits/chosen": -0.3684326112270355, "logits/rejected": -0.579296886920929, "logps/chosen": -350.95001220703125, "logps/rejected": -586.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -6.460156440734863, "rewards/margins": 13.0, "rewards/rejected": -19.475000381469727, "step": 3970 }, { "epoch": 3.494510320597277, "grad_norm": 0.12498695523254302, "learning_rate": 1.258787346221441e-07, "logits/chosen": -0.49842530488967896, "logits/rejected": -0.5428711175918579, "logps/chosen": -298.54998779296875, "logps/rejected": -573.2999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.893750190734863, "rewards/margins": 14.256250381469727, "rewards/rejected": -20.15625, "step": 3980 }, { "epoch": 3.503293807641634, "grad_norm": 0.1791075095260585, "learning_rate": 1.2368189806678382e-07, "logits/chosen": -0.3945678770542145, "logits/rejected": -0.5943969488143921, "logps/chosen": -327.70001220703125, "logps/rejected": -537.4000244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.712500095367432, "rewards/margins": 12.403124809265137, "rewards/rejected": -18.09375, "step": 3990 }, { "epoch": 3.5120772946859904, "grad_norm": 0.7805988422053907, "learning_rate": 1.2148506151142355e-07, "logits/chosen": -0.4241943359375, "logits/rejected": -0.6270507574081421, "logps/chosen": -330.25, "logps/rejected": -559.7000122070312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.810156345367432, "rewards/margins": 12.659375190734863, "rewards/rejected": -18.46875, "step": 4000 }, { "epoch": 3.520860781730347, "grad_norm": 0.03537933983410999, "learning_rate": 1.1928822495606326e-07, "logits/chosen": -0.29863280057907104, "logits/rejected": -0.521533191204071, "logps/chosen": -337.8500061035156, "logps/rejected": -576.0999755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.736718654632568, "rewards/margins": 13.543749809265137, "rewards/rejected": -19.284374237060547, "step": 4010 }, { "epoch": 3.5296442687747036, "grad_norm": 0.2600667135267026, "learning_rate": 1.1709138840070298e-07, "logits/chosen": -0.573760986328125, "logits/rejected": -0.5362304449081421, "logps/chosen": -336.95001220703125, "logps/rejected": -621.7000122070312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.110937595367432, "rewards/margins": 14.306249618530273, "rewards/rejected": -20.412500381469727, "step": 4020 }, { "epoch": 3.53842775581906, "grad_norm": 0.2376793746460903, "learning_rate": 1.148945518453427e-07, "logits/chosen": -0.4827636778354645, "logits/rejected": -0.597412109375, "logps/chosen": -355.70001220703125, "logps/rejected": -561.2999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.03515625, "rewards/margins": 13.134374618530273, "rewards/rejected": -19.168750762939453, "step": 4030 }, { "epoch": 3.547211242863417, "grad_norm": 0.15414283580929497, "learning_rate": 1.1269771528998242e-07, "logits/chosen": -0.3797851502895355, "logits/rejected": -0.646191418170929, "logps/chosen": -361.5, "logps/rejected": -548.2000122070312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -5.896874904632568, "rewards/margins": 13.212499618530273, "rewards/rejected": -19.087499618530273, "step": 4040 }, { "epoch": 3.5559947299077734, "grad_norm": 0.11552832705517481, "learning_rate": 1.1050087873462213e-07, "logits/chosen": -0.46794432401657104, "logits/rejected": -0.634570300579071, "logps/chosen": -358.45001220703125, "logps/rejected": -600.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.750781059265137, "rewards/margins": 13.059374809265137, "rewards/rejected": -19.806249618530273, "step": 4050 }, { "epoch": 3.56477821695213, "grad_norm": 5.682233814698746, "learning_rate": 1.0830404217926185e-07, "logits/chosen": -0.38398438692092896, "logits/rejected": -0.610888659954071, "logps/chosen": -381.3999938964844, "logps/rejected": -619.5999755859375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -7.025000095367432, "rewards/margins": 13.003125190734863, "rewards/rejected": -20.03125, "step": 4060 }, { "epoch": 3.5735617039964866, "grad_norm": 0.21096572436638011, "learning_rate": 1.0610720562390157e-07, "logits/chosen": -0.47637939453125, "logits/rejected": -0.6363281011581421, "logps/chosen": -338.8999938964844, "logps/rejected": -571.7999877929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.757031440734863, "rewards/margins": 12.865625381469727, "rewards/rejected": -18.618749618530273, "step": 4070 }, { "epoch": 3.5823451910408433, "grad_norm": 0.03749244733711619, "learning_rate": 1.039103690685413e-07, "logits/chosen": -0.4880615174770355, "logits/rejected": -0.7022460699081421, "logps/chosen": -317.04998779296875, "logps/rejected": -553.2000122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.2734375, "rewards/margins": 12.96875, "rewards/rejected": -18.246875762939453, "step": 4080 }, { "epoch": 3.5911286780852, "grad_norm": 0.018500599492528337, "learning_rate": 1.0171353251318102e-07, "logits/chosen": -0.3341308534145355, "logits/rejected": -0.6435546875, "logps/chosen": -277.3500061035156, "logps/rejected": -523.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.607812404632568, "rewards/margins": 13.524999618530273, "rewards/rejected": -18.143749237060547, "step": 4090 }, { "epoch": 3.5999121651295565, "grad_norm": 0.3789466735957833, "learning_rate": 9.951669595782074e-08, "logits/chosen": -0.4244628846645355, "logits/rejected": -0.49036866426467896, "logps/chosen": -304.1499938964844, "logps/rejected": -538.9000244140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.414843559265137, "rewards/margins": 12.640625, "rewards/rejected": -18.053125381469727, "step": 4100 }, { "epoch": 3.608695652173913, "grad_norm": 0.18121754328593084, "learning_rate": 9.731985940246046e-08, "logits/chosen": -0.573193371295929, "logits/rejected": -0.6492675542831421, "logps/chosen": -327.70001220703125, "logps/rejected": -586.9000244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.04296875, "rewards/margins": 13.84375, "rewards/rejected": -19.862499237060547, "step": 4110 }, { "epoch": 3.6174791392182697, "grad_norm": 0.09562207820059641, "learning_rate": 9.512302284710017e-08, "logits/chosen": -0.315774530172348, "logits/rejected": -0.567700207233429, "logps/chosen": -303.8999938964844, "logps/rejected": -536.4500122070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -4.849609375, "rewards/margins": 12.740625381469727, "rewards/rejected": -17.590625762939453, "step": 4120 }, { "epoch": 3.6262626262626263, "grad_norm": 0.49094745770955, "learning_rate": 9.292618629173989e-08, "logits/chosen": -0.47467041015625, "logits/rejected": -0.6610351800918579, "logps/chosen": -287.1000061035156, "logps/rejected": -540.7999877929688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -5.103906154632568, "rewards/margins": 13.415624618530273, "rewards/rejected": -18.537500381469727, "step": 4130 }, { "epoch": 3.635046113306983, "grad_norm": 0.21860736747811485, "learning_rate": 9.072934973637961e-08, "logits/chosen": -0.37431639432907104, "logits/rejected": -0.5453246831893921, "logps/chosen": -293.79998779296875, "logps/rejected": -492.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.92578125, "rewards/margins": 11.949999809265137, "rewards/rejected": -16.868749618530273, "step": 4140 }, { "epoch": 3.6438296003513395, "grad_norm": 0.38956965812142635, "learning_rate": 8.853251318101933e-08, "logits/chosen": -0.26429444551467896, "logits/rejected": -0.5401366949081421, "logps/chosen": -297.79998779296875, "logps/rejected": -529.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.659375190734863, "rewards/margins": 12.268750190734863, "rewards/rejected": -17.931249618530273, "step": 4150 }, { "epoch": 3.652613087395696, "grad_norm": 0.14862670063840186, "learning_rate": 8.633567662565905e-08, "logits/chosen": -0.46580809354782104, "logits/rejected": -0.530712902545929, "logps/chosen": -350.3500061035156, "logps/rejected": -527.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -5.790234565734863, "rewards/margins": 12.912500381469727, "rewards/rejected": -18.696874618530273, "step": 4160 }, { "epoch": 3.6613965744400527, "grad_norm": 0.02375537937897498, "learning_rate": 8.413884007029877e-08, "logits/chosen": -0.48432618379592896, "logits/rejected": -0.647265613079071, "logps/chosen": -333.04998779296875, "logps/rejected": -551.4000244140625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -5.832812309265137, "rewards/margins": 12.678125381469727, "rewards/rejected": -18.515625, "step": 4170 }, { "epoch": 3.6701800614844093, "grad_norm": 0.04250831924451115, "learning_rate": 8.194200351493849e-08, "logits/chosen": -0.492645263671875, "logits/rejected": -0.642041027545929, "logps/chosen": -313.25, "logps/rejected": -545.2000122070312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -5.129296779632568, "rewards/margins": 13.165624618530273, "rewards/rejected": -18.315624237060547, "step": 4180 }, { "epoch": 3.678963548528766, "grad_norm": 1.0820550616559024, "learning_rate": 7.97451669595782e-08, "logits/chosen": -0.307565301656723, "logits/rejected": -0.597338855266571, "logps/chosen": -283.8500061035156, "logps/rejected": -507.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -4.939453125, "rewards/margins": 12.824999809265137, "rewards/rejected": -17.762500762939453, "step": 4190 }, { "epoch": 3.6877470355731226, "grad_norm": 0.10975370986902552, "learning_rate": 7.754833040421792e-08, "logits/chosen": -0.49543458223342896, "logits/rejected": -0.54876708984375, "logps/chosen": -339.95001220703125, "logps/rejected": -550.2999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.571093559265137, "rewards/margins": 13.550000190734863, "rewards/rejected": -19.118749618530273, "step": 4200 }, { "epoch": 3.696530522617479, "grad_norm": 0.11234396557322662, "learning_rate": 7.535149384885764e-08, "logits/chosen": -0.5704101324081421, "logits/rejected": -0.649707019329071, "logps/chosen": -331.1499938964844, "logps/rejected": -581.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.4921875, "rewards/margins": 12.653124809265137, "rewards/rejected": -19.134374618530273, "step": 4210 }, { "epoch": 3.7053140096618358, "grad_norm": 0.125042619338368, "learning_rate": 7.315465729349736e-08, "logits/chosen": -0.36985474824905396, "logits/rejected": -0.45720213651657104, "logps/chosen": -359.25, "logps/rejected": -587.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.571875095367432, "rewards/margins": 13.28125, "rewards/rejected": -19.859375, "step": 4220 }, { "epoch": 3.7140974967061924, "grad_norm": 0.09650117926360155, "learning_rate": 7.095782073813708e-08, "logits/chosen": -0.31544190645217896, "logits/rejected": -0.6631835699081421, "logps/chosen": -319.6000061035156, "logps/rejected": -568.5, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.482812404632568, "rewards/margins": 14.146875381469727, "rewards/rejected": -20.628124237060547, "step": 4230 }, { "epoch": 3.722880983750549, "grad_norm": 0.08831638350424685, "learning_rate": 6.87609841827768e-08, "logits/chosen": -0.38298338651657104, "logits/rejected": -0.49860841035842896, "logps/chosen": -341.0, "logps/rejected": -555.0999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.181640625, "rewards/margins": 13.334375381469727, "rewards/rejected": -19.518749237060547, "step": 4240 }, { "epoch": 3.7316644707949056, "grad_norm": 0.08460293543123892, "learning_rate": 6.656414762741652e-08, "logits/chosen": -0.35844725370407104, "logits/rejected": -0.630566418170929, "logps/chosen": -337.6499938964844, "logps/rejected": -572.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.15234375, "rewards/margins": 13.274999618530273, "rewards/rejected": -19.431249618530273, "step": 4250 }, { "epoch": 3.740447957839262, "grad_norm": 0.143824348023393, "learning_rate": 6.436731107205623e-08, "logits/chosen": -0.39520263671875, "logits/rejected": -0.717089831829071, "logps/chosen": -329.45001220703125, "logps/rejected": -600.2999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.817187309265137, "rewards/margins": 13.856249809265137, "rewards/rejected": -19.684375762939453, "step": 4260 }, { "epoch": 3.749231444883619, "grad_norm": 0.33337202533585036, "learning_rate": 6.217047451669595e-08, "logits/chosen": -0.33159178495407104, "logits/rejected": -0.642871081829071, "logps/chosen": -335.70001220703125, "logps/rejected": -512.5999755859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.862500190734863, "rewards/margins": 12.5625, "rewards/rejected": -18.431249618530273, "step": 4270 }, { "epoch": 3.7580149319279754, "grad_norm": 0.2026855445311043, "learning_rate": 5.997363796133567e-08, "logits/chosen": -0.49125975370407104, "logits/rejected": -0.6572265625, "logps/chosen": -395.95001220703125, "logps/rejected": -584.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.251953125, "rewards/margins": 12.709375381469727, "rewards/rejected": -18.971874237060547, "step": 4280 }, { "epoch": 3.766798418972332, "grad_norm": 0.0361548595942344, "learning_rate": 5.7776801405975395e-08, "logits/chosen": -0.4994873106479645, "logits/rejected": -0.584948718547821, "logps/chosen": -331.95001220703125, "logps/rejected": -605.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.9453125, "rewards/margins": 13.481249809265137, "rewards/rejected": -19.424999237060547, "step": 4290 }, { "epoch": 3.7755819060166886, "grad_norm": 0.02771682335217671, "learning_rate": 5.5579964850615116e-08, "logits/chosen": -0.473388671875, "logits/rejected": -0.7215331792831421, "logps/chosen": -307.25, "logps/rejected": -564.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.453515529632568, "rewards/margins": 13.425000190734863, "rewards/rejected": -18.878124237060547, "step": 4300 }, { "epoch": 3.7843653930610452, "grad_norm": 0.03886903940351864, "learning_rate": 5.338312829525484e-08, "logits/chosen": -0.39808350801467896, "logits/rejected": -0.4810348451137543, "logps/chosen": -307.3999938964844, "logps/rejected": -533.9000244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -5.268164157867432, "rewards/margins": 12.934374809265137, "rewards/rejected": -18.209375381469727, "step": 4310 }, { "epoch": 3.793148880105402, "grad_norm": 0.09878272026722451, "learning_rate": 5.1186291739894545e-08, "logits/chosen": -0.3619384765625, "logits/rejected": -0.626757800579071, "logps/chosen": -331.8999938964844, "logps/rejected": -568.2000122070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.399218559265137, "rewards/margins": 13.774999618530273, "rewards/rejected": -20.168750762939453, "step": 4320 }, { "epoch": 3.8019323671497585, "grad_norm": 0.3584084479160875, "learning_rate": 4.8989455184534266e-08, "logits/chosen": -0.35487061738967896, "logits/rejected": -0.6392822265625, "logps/chosen": -318.45001220703125, "logps/rejected": -559.0999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.779687404632568, "rewards/margins": 13.628125190734863, "rewards/rejected": -19.403125762939453, "step": 4330 }, { "epoch": 3.810715854194115, "grad_norm": 0.19049367118107313, "learning_rate": 4.679261862917399e-08, "logits/chosen": -0.5264526605606079, "logits/rejected": -0.721875011920929, "logps/chosen": -341.8999938964844, "logps/rejected": -567.0999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.092187404632568, "rewards/margins": 13.0, "rewards/rejected": -19.090625762939453, "step": 4340 }, { "epoch": 3.8194993412384717, "grad_norm": 0.21604678437533043, "learning_rate": 4.45957820738137e-08, "logits/chosen": -0.4737792909145355, "logits/rejected": -0.5712646245956421, "logps/chosen": -338.5, "logps/rejected": -588.7999877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.673437595367432, "rewards/margins": 13.199999809265137, "rewards/rejected": -19.862499237060547, "step": 4350 }, { "epoch": 3.8282828282828283, "grad_norm": 0.06144243707155663, "learning_rate": 4.239894551845342e-08, "logits/chosen": -0.3351196348667145, "logits/rejected": -0.4825683534145355, "logps/chosen": -337.70001220703125, "logps/rejected": -553.4000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.926562309265137, "rewards/margins": 13.368749618530273, "rewards/rejected": -19.306249618530273, "step": 4360 }, { "epoch": 3.837066315327185, "grad_norm": 0.7031122614126657, "learning_rate": 4.020210896309314e-08, "logits/chosen": -0.554638683795929, "logits/rejected": -0.5219360589981079, "logps/chosen": -346.6499938964844, "logps/rejected": -547.2999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.016406059265137, "rewards/margins": 12.712499618530273, "rewards/rejected": -18.725000381469727, "step": 4370 }, { "epoch": 3.8458498023715415, "grad_norm": 0.13276373745420403, "learning_rate": 3.8005272407732864e-08, "logits/chosen": -0.460113525390625, "logits/rejected": -0.570849597454071, "logps/chosen": -341.0, "logps/rejected": -565.2999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.982031345367432, "rewards/margins": 13.262499809265137, "rewards/rejected": -19.234375, "step": 4380 }, { "epoch": 3.854633289415898, "grad_norm": 0.20446434475335842, "learning_rate": 3.580843585237258e-08, "logits/chosen": -0.29948729276657104, "logits/rejected": -0.5396484136581421, "logps/chosen": -285.6000061035156, "logps/rejected": -549.9000244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.701562404632568, "rewards/margins": 13.962499618530273, "rewards/rejected": -19.668750762939453, "step": 4390 }, { "epoch": 3.8634167764602547, "grad_norm": 0.056437066906751494, "learning_rate": 3.36115992970123e-08, "logits/chosen": -0.52593994140625, "logits/rejected": -0.6352783441543579, "logps/chosen": -301.95001220703125, "logps/rejected": -532.5999755859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.192578315734863, "rewards/margins": 13.178125381469727, "rewards/rejected": -18.378124237060547, "step": 4400 }, { "epoch": 3.8722002635046113, "grad_norm": 0.05630547553940936, "learning_rate": 3.141476274165202e-08, "logits/chosen": -0.46795654296875, "logits/rejected": -0.581433117389679, "logps/chosen": -370.1499938964844, "logps/rejected": -607.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.305468559265137, "rewards/margins": 13.274999618530273, "rewards/rejected": -20.575000762939453, "step": 4410 }, { "epoch": 3.880983750548968, "grad_norm": 0.05960984621006603, "learning_rate": 2.9217926186291738e-08, "logits/chosen": -0.523327648639679, "logits/rejected": -0.629833996295929, "logps/chosen": -362.8500061035156, "logps/rejected": -560.9000244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.560156345367432, "rewards/margins": 12.931249618530273, "rewards/rejected": -19.493749618530273, "step": 4420 }, { "epoch": 3.8897672375933245, "grad_norm": 0.061988073623917846, "learning_rate": 2.7021089630931456e-08, "logits/chosen": -0.27287596464157104, "logits/rejected": -0.593658447265625, "logps/chosen": -299.95001220703125, "logps/rejected": -568.9000244140625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.517187595367432, "rewards/margins": 13.565625190734863, "rewards/rejected": -19.084375381469727, "step": 4430 }, { "epoch": 3.898550724637681, "grad_norm": 0.10469566463550677, "learning_rate": 2.4824253075571177e-08, "logits/chosen": -0.46533203125, "logits/rejected": -0.624438464641571, "logps/chosen": -321.0, "logps/rejected": -548.9000244140625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.04296875, "rewards/margins": 13.665624618530273, "rewards/rejected": -19.703125, "step": 4440 }, { "epoch": 3.9073342116820378, "grad_norm": 0.19193234821650873, "learning_rate": 2.2627416520210894e-08, "logits/chosen": -0.46855467557907104, "logits/rejected": -0.573376476764679, "logps/chosen": -296.1000061035156, "logps/rejected": -532.4000244140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -5.655468940734863, "rewards/margins": 14.168749809265137, "rewards/rejected": -19.815624237060547, "step": 4450 }, { "epoch": 3.9161176987263944, "grad_norm": 0.7480621765916395, "learning_rate": 2.0430579964850612e-08, "logits/chosen": -0.4483642578125, "logits/rejected": -0.5134521722793579, "logps/chosen": -306.5249938964844, "logps/rejected": -540.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.66015625, "rewards/margins": 12.878125190734863, "rewards/rejected": -18.537500381469727, "step": 4460 }, { "epoch": 3.924901185770751, "grad_norm": 0.09104710553662786, "learning_rate": 1.8233743409490333e-08, "logits/chosen": -0.18061523139476776, "logits/rejected": -0.513598620891571, "logps/chosen": -249.10000610351562, "logps/rejected": -476.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.454297065734863, "rewards/margins": 13.09375, "rewards/rejected": -17.546875, "step": 4470 }, { "epoch": 3.9336846728151076, "grad_norm": 0.25514606494219616, "learning_rate": 1.603690685413005e-08, "logits/chosen": -0.5694335699081421, "logits/rejected": -0.5858398675918579, "logps/chosen": -305.8500061035156, "logps/rejected": -536.2999877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -5.771874904632568, "rewards/margins": 13.256250381469727, "rewards/rejected": -19.024999618530273, "step": 4480 }, { "epoch": 3.942468159859464, "grad_norm": 0.5539992914114478, "learning_rate": 1.3840070298769772e-08, "logits/chosen": -0.28886717557907104, "logits/rejected": -0.5505126714706421, "logps/chosen": -272.7250061035156, "logps/rejected": -498.8999938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.639843940734863, "rewards/margins": 13.131250381469727, "rewards/rejected": -17.762500762939453, "step": 4490 }, { "epoch": 3.951251646903821, "grad_norm": 0.07195228102403328, "learning_rate": 1.164323374340949e-08, "logits/chosen": -0.2945800721645355, "logits/rejected": -0.5526977777481079, "logps/chosen": -335.79998779296875, "logps/rejected": -549.5999755859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -5.424218654632568, "rewards/margins": 13.434374809265137, "rewards/rejected": -18.853124618530273, "step": 4500 }, { "epoch": 3.9600351339481774, "grad_norm": 1.0496414200907558, "learning_rate": 9.446397188049209e-09, "logits/chosen": -0.177490234375, "logits/rejected": -0.43366700410842896, "logps/chosen": -322.75, "logps/rejected": -533.5999755859375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -5.568749904632568, "rewards/margins": 12.440625190734863, "rewards/rejected": -18.012500762939453, "step": 4510 }, { "epoch": 3.968818620992534, "grad_norm": 0.22805718874841105, "learning_rate": 7.249560632688927e-09, "logits/chosen": -0.36833494901657104, "logits/rejected": -0.714306652545929, "logps/chosen": -360.20001220703125, "logps/rejected": -536.2999877929688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -6.275781154632568, "rewards/margins": 12.096875190734863, "rewards/rejected": -18.381250381469727, "step": 4520 }, { "epoch": 3.9776021080368906, "grad_norm": 0.16001405249427075, "learning_rate": 5.0527240773286466e-09, "logits/chosen": -0.3943115174770355, "logits/rejected": -0.525622546672821, "logps/chosen": -314.20001220703125, "logps/rejected": -580.4000244140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.896484375, "rewards/margins": 13.443750381469727, "rewards/rejected": -19.34375, "step": 4530 }, { "epoch": 3.9863855950812472, "grad_norm": 0.0398140954658924, "learning_rate": 2.8558875219683655e-09, "logits/chosen": -0.34931641817092896, "logits/rejected": -0.632128894329071, "logps/chosen": -333.3500061035156, "logps/rejected": -555.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -5.680468559265137, "rewards/margins": 12.834375381469727, "rewards/rejected": -18.509374618530273, "step": 4540 }, { "epoch": 3.995169082125604, "grad_norm": 0.02578746898446639, "learning_rate": 6.590509666080844e-10, "logits/chosen": -0.394775390625, "logits/rejected": -0.557690441608429, "logps/chosen": -321.1000061035156, "logps/rejected": -567.4000244140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.482812404632568, "rewards/margins": 13.378125190734863, "rewards/rejected": -18.868749618530273, "step": 4550 } ], "logging_steps": 10, "max_steps": 4552, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }