{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2546689303904924, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 154.90625, "epoch": 0.0008488964346349745, "grad_norm": 1.373261530904728, "kl": 0.0003566741943359375, "learning_rate": 0.0, "loss": -0.0035, "reward": 0.12956976890563965, "reward_std": 0.10243552178144455, "rewards/preference_model_reward": 0.12956976890563965, "rewards/preference_model_reward/std": 0.10243552923202515, "step": 1 }, { "clip_ratio": 0.0, "epoch": 0.001697792869269949, "grad_norm": 1.373744508768238, "kl": 0.0003566741943359375, "learning_rate": 1e-07, "loss": -0.0035, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 426.03125, "epoch": 0.0025466893039049238, "grad_norm": 0.01976094802569778, "kl": 0.0003337860107421875, "learning_rate": 2e-07, "loss": -0.0, "reward": 0.007162425667047501, "reward_std": 0.002347785048186779, "rewards/preference_model_reward": 0.007162425667047501, "rewards/preference_model_reward/std": 0.0023477852810174227, "step": 3 }, { "clip_ratio": 0.00029364757938310504, "epoch": 0.003395585738539898, "grad_norm": 0.019704841225345854, "kl": 0.000339508056640625, "learning_rate": 3e-07, "loss": -0.0001, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 205.59375, "epoch": 0.004244482173174873, "grad_norm": 0.8550671585380242, "kl": 0.000408172607421875, "learning_rate": 4e-07, "loss": 0.0011, "reward": 0.0704927146434784, "reward_std": 0.06750915944576263, "rewards/preference_model_reward": 0.0704927146434784, "rewards/preference_model_reward/std": 0.06750915199518204, "step": 5 }, { "clip_ratio": 0.0, "epoch": 0.0050933786078098476, "grad_norm": 0.7361877708957172, "kl": 0.0003910064697265625, "learning_rate": 5e-07, "loss": 0.0011, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 227.15625, "epoch": 0.005942275042444821, "grad_norm": 0.38706524237759515, "kl": 0.0003662109375, "learning_rate": 6e-07, "loss": -0.0007, "reward": 0.03637976944446564, "reward_std": 0.037161991000175476, "rewards/preference_model_reward": 0.03637976944446564, "rewards/preference_model_reward/std": 0.037161991000175476, "step": 7 }, { "clip_ratio": 0.0003041362506337464, "epoch": 0.006791171477079796, "grad_norm": 0.3886457621776694, "kl": 0.000339508056640625, "learning_rate": 7e-07, "loss": -0.0007, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 384.0625, "epoch": 0.007640067911714771, "grad_norm": 1.3360350931326528, "kl": 0.0003337860107421875, "learning_rate": 8e-07, "loss": 0.0003, "reward": 0.2957379221916199, "reward_std": 0.1667662262916565, "rewards/preference_model_reward": 0.2957379221916199, "rewards/preference_model_reward/std": 0.1667662262916565, "step": 9 }, { "clip_ratio": 0.0004007347160950303, "epoch": 0.008488964346349746, "grad_norm": 1.3492099330380622, "kl": 0.00035858154296875, "learning_rate": 9e-07, "loss": 0.0003, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 271.34375, "epoch": 0.00933786078098472, "grad_norm": 0.5844743318217549, "kl": 0.0004730224609375, "learning_rate": 1e-06, "loss": -0.0055, "reward": 0.06409081071615219, "reward_std": 0.05993795394897461, "rewards/preference_model_reward": 0.06409081071615219, "rewards/preference_model_reward/std": 0.05993795767426491, "step": 11 }, { "clip_ratio": 0.0003285869024693966, "epoch": 0.010186757215619695, "grad_norm": 0.5853835659686467, "kl": 0.0005340576171875, "learning_rate": 1.1e-06, "loss": -0.0055, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 323.875, "epoch": 0.011035653650254669, "grad_norm": 0.5037341718615376, "kl": 0.00077056884765625, "learning_rate": 1.2e-06, "loss": -0.0005, "reward": 0.05134192109107971, "reward_std": 0.05402546375989914, "rewards/preference_model_reward": 0.05134192109107971, "rewards/preference_model_reward/std": 0.05402546748518944, "step": 13 }, { "clip_ratio": 0.00019549165153875947, "epoch": 0.011884550084889643, "grad_norm": 0.5354006783262033, "kl": 0.00098419189453125, "learning_rate": 1.3e-06, "loss": -0.0006, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 389.53125, "epoch": 0.012733446519524618, "grad_norm": 0.4376233615388839, "kl": 0.00057220458984375, "learning_rate": 1.4e-06, "loss": 0.0, "reward": 0.07725013792514801, "reward_std": 0.0637926235795021, "rewards/preference_model_reward": 0.07725013792514801, "rewards/preference_model_reward/std": 0.0637926235795021, "step": 15 }, { "clip_ratio": 0.0001568605366628617, "epoch": 0.013582342954159592, "grad_norm": 0.43520351533225, "kl": 0.000732421875, "learning_rate": 1.5e-06, "loss": -0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 225.625, "epoch": 0.014431239388794566, "grad_norm": 0.009342079121610454, "kl": 0.00156402587890625, "learning_rate": 1.6e-06, "loss": -0.0, "reward": 0.003957257140427828, "reward_std": 0.0007005692459642887, "rewards/preference_model_reward": 0.003957257140427828, "rewards/preference_model_reward/std": 0.0007005691877566278, "step": 17 }, { "clip_ratio": 0.0, "epoch": 0.015280135823429542, "grad_norm": 0.009135995120751321, "kl": 0.0016937255859375, "learning_rate": 1.6999999999999998e-06, "loss": -0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 618.53125, "epoch": 0.016129032258064516, "grad_norm": 0.8342082066432049, "kl": 0.00165557861328125, "learning_rate": 1.8e-06, "loss": -0.003, "reward": 0.30697351694107056, "reward_std": 0.12625738978385925, "rewards/preference_model_reward": 0.30697351694107056, "rewards/preference_model_reward/std": 0.12625740468502045, "step": 19 }, { "clip_ratio": 0.00025422428734600544, "epoch": 0.01697792869269949, "grad_norm": 0.8045973414054722, "kl": 0.00189208984375, "learning_rate": 1.8999999999999998e-06, "loss": -0.0032, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 341.0, "epoch": 0.017826825127334467, "grad_norm": 0.15747378780960536, "kl": 0.0018157958984375, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.025423400104045868, "reward_std": 0.022510820999741554, "rewards/preference_model_reward": 0.025423400104045868, "rewards/preference_model_reward/std": 0.022510822862386703, "step": 21 }, { "clip_ratio": 0.00045937151298858225, "epoch": 0.01867572156196944, "grad_norm": 0.15703129987750525, "kl": 0.0022125244140625, "learning_rate": 2e-06, "loss": 0.0012, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 232.5625, "epoch": 0.019524617996604415, "grad_norm": 0.4416727840666515, "kl": 0.0036468505859375, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.04581625759601593, "reward_std": 0.0430915392935276, "rewards/preference_model_reward": 0.04581625759601593, "rewards/preference_model_reward/std": 0.0430915392935276, "step": 23 }, { "clip_ratio": 0.0006831242935732007, "epoch": 0.02037351443123939, "grad_norm": 0.44514421266142573, "kl": 0.0042724609375, "learning_rate": 2e-06, "loss": 0.0001, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 212.34375, "epoch": 0.021222410865874362, "grad_norm": 0.965193272956362, "kl": 0.006103515625, "learning_rate": 2e-06, "loss": -0.0014, "reward": 0.11097941547632217, "reward_std": 0.0762963593006134, "rewards/preference_model_reward": 0.11097941547632217, "rewards/preference_model_reward/std": 0.0762963593006134, "step": 25 }, { "clip_ratio": 0.0, "epoch": 0.022071307300509338, "grad_norm": 0.9125624994776861, "kl": 0.00677490234375, "learning_rate": 2e-06, "loss": -0.0016, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 270.6875, "epoch": 0.022920203735144314, "grad_norm": 1.021646949062738, "kl": 0.00738525390625, "learning_rate": 2e-06, "loss": -0.0024, "reward": 0.14990350604057312, "reward_std": 0.10197865962982178, "rewards/preference_model_reward": 0.14990350604057312, "rewards/preference_model_reward/std": 0.10197865217924118, "step": 27 }, { "clip_ratio": 0.0003397603868506849, "epoch": 0.023769100169779286, "grad_norm": 1.0520153952968034, "kl": 0.00872802734375, "learning_rate": 2e-06, "loss": -0.0027, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 758.28125, "epoch": 0.02461799660441426, "grad_norm": 0.7119844877358423, "kl": 0.006744384765625, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.09009624272584915, "reward_std": 0.09022250026464462, "rewards/preference_model_reward": 0.09009624272584915, "rewards/preference_model_reward/std": 0.09022250026464462, "step": 29 }, { "clip_ratio": 0.0006554110441356897, "epoch": 0.025466893039049237, "grad_norm": 0.5478668890905144, "kl": 0.007568359375, "learning_rate": 2e-06, "loss": 0.0008, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 317.96875, "epoch": 0.02631578947368421, "grad_norm": 0.3101174990761895, "kl": 0.00958251953125, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.05284169688820839, "reward_std": 0.028878774493932724, "rewards/preference_model_reward": 0.05284169688820839, "rewards/preference_model_reward/std": 0.028878774493932724, "step": 31 }, { "clip_ratio": 0.0008713441202417016, "epoch": 0.027164685908319185, "grad_norm": 0.2969256230681903, "kl": 0.01007080078125, "learning_rate": 2e-06, "loss": 0.0009, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 346.5, "epoch": 0.02801358234295416, "grad_norm": 0.5964158292526848, "kl": 0.01025390625, "learning_rate": 2e-06, "loss": -0.0002, "reward": 0.10442396998405457, "reward_std": 0.0710761621594429, "rewards/preference_model_reward": 0.10442396998405457, "rewards/preference_model_reward/std": 0.0710761621594429, "step": 33 }, { "clip_ratio": 0.001167232054285705, "epoch": 0.028862478777589132, "grad_norm": 0.6459875868432908, "kl": 0.011474609375, "learning_rate": 2e-06, "loss": -0.0004, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 325.125, "epoch": 0.029711375212224108, "grad_norm": 0.8665540483399039, "kl": 0.012451171875, "learning_rate": 2e-06, "loss": -0.0007, "reward": 0.20713286101818085, "reward_std": 0.08856458961963654, "rewards/preference_model_reward": 0.20713286101818085, "rewards/preference_model_reward/std": 0.08856458961963654, "step": 35 }, { "clip_ratio": 0.0004757290589623153, "epoch": 0.030560271646859084, "grad_norm": 0.8458082294842536, "kl": 0.013916015625, "learning_rate": 2e-06, "loss": -0.001, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 259.5625, "epoch": 0.031409168081494056, "grad_norm": 0.6451884349368371, "kl": 0.018798828125, "learning_rate": 2e-06, "loss": -0.0021, "reward": 0.07655464112758636, "reward_std": 0.06220533698797226, "rewards/preference_model_reward": 0.07655464112758636, "rewards/preference_model_reward/std": 0.06220533698797226, "step": 37 }, { "clip_ratio": 0.00036129303043708205, "epoch": 0.03225806451612903, "grad_norm": 0.6180427670131671, "kl": 0.020751953125, "learning_rate": 2e-06, "loss": -0.0023, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 217.6875, "epoch": 0.03310696095076401, "grad_norm": 0.15623847767793025, "kl": 0.027099609375, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.01216259878128767, "reward_std": 0.015137026086449623, "rewards/preference_model_reward": 0.01216259878128767, "rewards/preference_model_reward/std": 0.015137026086449623, "step": 39 }, { "clip_ratio": 0.0, "epoch": 0.03395585738539898, "grad_norm": 0.16166488666255469, "kl": 0.02880859375, "learning_rate": 2e-06, "loss": 0.0005, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 340.25, "epoch": 0.03480475382003396, "grad_norm": 1.2594757575533009, "kl": 0.022216796875, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.36821770668029785, "reward_std": 0.10101611167192459, "rewards/preference_model_reward": 0.36821770668029785, "rewards/preference_model_reward/std": 0.101016104221344, "step": 41 }, { "clip_ratio": 0.0007307034684345126, "epoch": 0.035653650254668934, "grad_norm": 1.027698571984344, "kl": 0.0245361328125, "learning_rate": 2e-06, "loss": -0.0002, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 303.3125, "epoch": 0.0365025466893039, "grad_norm": 0.4510741866786348, "kl": 0.0252685546875, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.056258413940668106, "reward_std": 0.05226214602589607, "rewards/preference_model_reward": 0.056258413940668106, "rewards/preference_model_reward/std": 0.05226214602589607, "step": 43 }, { "clip_ratio": 0.0008270645630545914, "epoch": 0.03735144312393888, "grad_norm": 0.4538945689960808, "kl": 0.02685546875, "learning_rate": 2e-06, "loss": -0.0002, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 696.4375, "epoch": 0.038200339558573854, "grad_norm": 0.6552940518465648, "kl": 0.02490234375, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.28145015239715576, "reward_std": 0.09661795943975449, "rewards/preference_model_reward": 0.28145015239715576, "rewards/preference_model_reward/std": 0.09661795198917389, "step": 45 }, { "clip_ratio": 0.0003099275636486709, "epoch": 0.03904923599320883, "grad_norm": 0.6063830680426688, "kl": 0.026123046875, "learning_rate": 2e-06, "loss": 0.0007, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 185.5625, "epoch": 0.039898132427843805, "grad_norm": 0.8803723441580334, "kl": 0.036376953125, "learning_rate": 2e-06, "loss": 0.0057, "reward": 0.14047113060951233, "reward_std": 0.07379527390003204, "rewards/preference_model_reward": 0.14047113060951233, "rewards/preference_model_reward/std": 0.07379526644945145, "step": 47 }, { "clip_ratio": 0.00016545334074180573, "epoch": 0.04074702886247878, "grad_norm": 0.8768664086923781, "kl": 0.03857421875, "learning_rate": 2e-06, "loss": 0.0053, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 409.84375, "epoch": 0.04159592529711375, "grad_norm": 0.790693661049164, "kl": 0.0303955078125, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.16792196035385132, "reward_std": 0.08975110948085785, "rewards/preference_model_reward": 0.16792196035385132, "rewards/preference_model_reward/std": 0.08975110203027725, "step": 49 }, { "clip_ratio": 0.0004502690862864256, "epoch": 0.042444821731748725, "grad_norm": 0.7661168129852652, "kl": 0.03173828125, "learning_rate": 2e-06, "loss": 0.001, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 186.875, "epoch": 0.0432937181663837, "grad_norm": 0.5403939950650409, "kl": 0.044189453125, "learning_rate": 2e-06, "loss": -0.0019, "reward": 0.06892818212509155, "reward_std": 0.036003705114126205, "rewards/preference_model_reward": 0.06892818212509155, "rewards/preference_model_reward/std": 0.036003705114126205, "step": 51 }, { "clip_ratio": 0.002485671080648899, "epoch": 0.044142614601018676, "grad_norm": 0.5730308488231836, "kl": 0.047119140625, "learning_rate": 2e-06, "loss": -0.0021, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 335.25, "epoch": 0.04499151103565365, "grad_norm": 0.7389454596845632, "kl": 0.041748046875, "learning_rate": 2e-06, "loss": -0.0063, "reward": 0.28060293197631836, "reward_std": 0.07954739779233932, "rewards/preference_model_reward": 0.28060293197631836, "rewards/preference_model_reward/std": 0.07954739034175873, "step": 53 }, { "clip_ratio": 0.0005645600031130016, "epoch": 0.04584040747028863, "grad_norm": 0.7275148836573597, "kl": 0.04296875, "learning_rate": 2e-06, "loss": -0.0066, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 321.6875, "epoch": 0.0466893039049236, "grad_norm": 0.673383864573446, "kl": 0.031494140625, "learning_rate": 2e-06, "loss": 0.0018, "reward": 0.13006240129470825, "reward_std": 0.07470076531171799, "rewards/preference_model_reward": 0.13006240129470825, "rewards/preference_model_reward/std": 0.07470076531171799, "step": 55 }, { "clip_ratio": 0.00018761330284178257, "epoch": 0.04753820033955857, "grad_norm": 0.5932243482520125, "kl": 0.03125, "learning_rate": 2e-06, "loss": 0.0015, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 629.0625, "epoch": 0.04838709677419355, "grad_norm": 0.5943555323097186, "kl": 0.03759765625, "learning_rate": 2e-06, "loss": 0.0022, "reward": 0.16671660542488098, "reward_std": 0.08239807188510895, "rewards/preference_model_reward": 0.16671660542488098, "rewards/preference_model_reward/std": 0.08239807188510895, "step": 57 }, { "clip_ratio": 0.0007027126266621053, "epoch": 0.04923599320882852, "grad_norm": 0.5944317831243726, "kl": 0.0390625, "learning_rate": 2e-06, "loss": 0.002, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 221.90625, "epoch": 0.0500848896434635, "grad_norm": 0.4673677413132102, "kl": 0.06689453125, "learning_rate": 2e-06, "loss": -0.0017, "reward": 0.031242549419403076, "reward_std": 0.04061814025044441, "rewards/preference_model_reward": 0.031242549419403076, "rewards/preference_model_reward/std": 0.04061814025044441, "step": 59 }, { "clip_ratio": 0.0010640884283930063, "epoch": 0.050933786078098474, "grad_norm": 0.46769299125491254, "kl": 0.0693359375, "learning_rate": 2e-06, "loss": -0.0019, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 235.625, "epoch": 0.05178268251273345, "grad_norm": 0.9433079745488324, "kl": 0.041259765625, "learning_rate": 2e-06, "loss": -0.0046, "reward": 0.3223969340324402, "reward_std": 0.08566058427095413, "rewards/preference_model_reward": 0.3223969340324402, "rewards/preference_model_reward/std": 0.08566058427095413, "step": 61 }, { "clip_ratio": 0.0005407010903581977, "epoch": 0.05263157894736842, "grad_norm": 0.8827749256609521, "kl": 0.043212890625, "learning_rate": 2e-06, "loss": -0.0051, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 321.21875, "epoch": 0.053480475382003394, "grad_norm": 1.376741075127532, "kl": 0.072265625, "learning_rate": 2e-06, "loss": -0.0027, "reward": 0.18953999876976013, "reward_std": 0.0982605516910553, "rewards/preference_model_reward": 0.18953999876976013, "rewards/preference_model_reward/std": 0.0982605367898941, "step": 63 }, { "clip_ratio": 0.0017892650794237852, "epoch": 0.05432937181663837, "grad_norm": 1.0129637166861898, "kl": 0.076171875, "learning_rate": 2e-06, "loss": -0.003, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 225.25, "epoch": 0.055178268251273345, "grad_norm": 1.0280761369580704, "kl": 0.08203125, "learning_rate": 2e-06, "loss": -0.0121, "reward": 0.3603067398071289, "reward_std": 0.09477485716342926, "rewards/preference_model_reward": 0.3603067398071289, "rewards/preference_model_reward/std": 0.09477484971284866, "step": 65 }, { "clip_ratio": 0.00028635968919843435, "epoch": 0.05602716468590832, "grad_norm": 1.012643043603393, "kl": 0.08544921875, "learning_rate": 2e-06, "loss": -0.0126, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 718.90625, "epoch": 0.056876061120543296, "grad_norm": 0.5544711698032301, "kl": 0.049560546875, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.0976465493440628, "reward_std": 0.08025789260864258, "rewards/preference_model_reward": 0.0976465493440628, "rewards/preference_model_reward/std": 0.08025789260864258, "step": 67 }, { "clip_ratio": 0.0004293117090128362, "epoch": 0.057724957555178265, "grad_norm": 0.5758752923955168, "kl": 0.05029296875, "learning_rate": 2e-06, "loss": 0.0012, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 313.8125, "epoch": 0.05857385398981324, "grad_norm": 0.5384984962195909, "kl": 0.08056640625, "learning_rate": 2e-06, "loss": -0.0024, "reward": 0.037503279745578766, "reward_std": 0.050285980105400085, "rewards/preference_model_reward": 0.037503279745578766, "rewards/preference_model_reward/std": 0.05028597638010979, "step": 69 }, { "clip_ratio": 0.000297203310765326, "epoch": 0.059422750424448216, "grad_norm": 0.5193566163583858, "kl": 0.08251953125, "learning_rate": 2e-06, "loss": -0.0026, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 289.84375, "epoch": 0.06027164685908319, "grad_norm": 1.5206690115948938, "kl": 0.0849609375, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.3103345036506653, "reward_std": 0.14627772569656372, "rewards/preference_model_reward": 0.3103345036506653, "rewards/preference_model_reward/std": 0.14627772569656372, "step": 71 }, { "clip_ratio": 0.0004342186148278415, "epoch": 0.06112054329371817, "grad_norm": 1.4731091282595996, "kl": 0.0927734375, "learning_rate": 2e-06, "loss": -0.0001, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 338.90625, "epoch": 0.06196943972835314, "grad_norm": 0.5932547022811241, "kl": 0.0556640625, "learning_rate": 2e-06, "loss": -0.0004, "reward": 0.10221391916275024, "reward_std": 0.07499799132347107, "rewards/preference_model_reward": 0.10221391916275024, "rewards/preference_model_reward/std": 0.07499799132347107, "step": 73 }, { "clip_ratio": 0.00036755931796506047, "epoch": 0.06281833616298811, "grad_norm": 0.5752683509803187, "kl": 0.05810546875, "learning_rate": 2e-06, "loss": -0.0006, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 311.28125, "epoch": 0.0636672325976231, "grad_norm": 0.5598161958043475, "kl": 0.07080078125, "learning_rate": 2e-06, "loss": -0.0015, "reward": 0.07023796439170837, "reward_std": 0.06094999983906746, "rewards/preference_model_reward": 0.07023796439170837, "rewards/preference_model_reward/std": 0.06094999611377716, "step": 75 }, { "clip_ratio": 9.596929157851264e-05, "epoch": 0.06451612903225806, "grad_norm": 0.5569802415068833, "kl": 0.07275390625, "learning_rate": 2e-06, "loss": -0.0018, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 227.625, "epoch": 0.06536502546689305, "grad_norm": 1.2813688126905285, "kl": 0.09521484375, "learning_rate": 2e-06, "loss": -0.0028, "reward": 0.1808125078678131, "reward_std": 0.10328490287065506, "rewards/preference_model_reward": 0.1808125078678131, "rewards/preference_model_reward/std": 0.10328490287065506, "step": 77 }, { "clip_ratio": 0.0006906483322381973, "epoch": 0.06621392190152801, "grad_norm": 1.2343717842035047, "kl": 0.09765625, "learning_rate": 2e-06, "loss": -0.0035, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 239.25, "epoch": 0.06706281833616298, "grad_norm": 1.2476565697443593, "kl": 0.09423828125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.3275872468948364, "reward_std": 0.10609177500009537, "rewards/preference_model_reward": 0.3275872468948364, "rewards/preference_model_reward/std": 0.10609177500009537, "step": 79 }, { "clip_ratio": 0.00013171759201213717, "epoch": 0.06791171477079797, "grad_norm": 1.191757488980422, "kl": 0.09765625, "learning_rate": 2e-06, "loss": -0.0003, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 356.5, "epoch": 0.06876061120543293, "grad_norm": 0.10573846819647138, "kl": 0.07666015625, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.023218905553221703, "reward_std": 0.013258407823741436, "rewards/preference_model_reward": 0.023218905553221703, "rewards/preference_model_reward/std": 0.013258407823741436, "step": 81 }, { "clip_ratio": 0.0003507659712340683, "epoch": 0.06960950764006792, "grad_norm": 0.1103870844147634, "kl": 0.0771484375, "learning_rate": 2e-06, "loss": 0.0001, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 388.15625, "epoch": 0.07045840407470289, "grad_norm": 1.1684465615315396, "kl": 0.11474609375, "learning_rate": 2e-06, "loss": -0.0114, "reward": 0.16451352834701538, "reward_std": 0.12782737612724304, "rewards/preference_model_reward": 0.16451352834701538, "rewards/preference_model_reward/std": 0.12782739102840424, "step": 83 }, { "clip_ratio": 0.00016542727826163173, "epoch": 0.07130730050933787, "grad_norm": 1.1475084655450116, "kl": 0.115234375, "learning_rate": 2e-06, "loss": -0.012, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 717.90625, "epoch": 0.07215619694397284, "grad_norm": 0.1671276234234309, "kl": 0.0439453125, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.004496478941291571, "reward_std": 0.01374930702149868, "rewards/preference_model_reward": 0.004496478941291571, "rewards/preference_model_reward/std": 0.01374930702149868, "step": 85 }, { "clip_ratio": 0.0011028368026018143, "epoch": 0.0730050933786078, "grad_norm": 0.11851554080319895, "kl": 0.04150390625, "learning_rate": 2e-06, "loss": 0.0003, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 597.875, "epoch": 0.07385398981324279, "grad_norm": 0.8930226438210058, "kl": 0.10107421875, "learning_rate": 2e-06, "loss": -0.0002, "reward": 0.2464292198419571, "reward_std": 0.1284564882516861, "rewards/preference_model_reward": 0.2464292198419571, "rewards/preference_model_reward/std": 0.1284564733505249, "step": 87 }, { "clip_ratio": 0.0003217374032828957, "epoch": 0.07470288624787776, "grad_norm": 0.8976677967365754, "kl": 0.10205078125, "learning_rate": 2e-06, "loss": -0.0007, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 309.0, "epoch": 0.07555178268251274, "grad_norm": 0.8544800016185599, "kl": 0.1005859375, "learning_rate": 2e-06, "loss": -0.0008, "reward": 0.11918962001800537, "reward_std": 0.08243891596794128, "rewards/preference_model_reward": 0.11918962001800537, "rewards/preference_model_reward/std": 0.08243890851736069, "step": 89 }, { "clip_ratio": 0.0003866804763674736, "epoch": 0.07640067911714771, "grad_norm": 0.9945706586547373, "kl": 0.10107421875, "learning_rate": 2e-06, "loss": -0.0013, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 603.15625, "epoch": 0.07724957555178268, "grad_norm": 0.6302983763196456, "kl": 0.10009765625, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.13992220163345337, "reward_std": 0.08116798847913742, "rewards/preference_model_reward": 0.13992220163345337, "rewards/preference_model_reward/std": 0.08116798847913742, "step": 91 }, { "clip_ratio": 0.00037479729508049786, "epoch": 0.07809847198641766, "grad_norm": 0.6164704370528037, "kl": 0.099609375, "learning_rate": 2e-06, "loss": -0.0002, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 312.15625, "epoch": 0.07894736842105263, "grad_norm": 1.3844161112543023, "kl": 0.10888671875, "learning_rate": 2e-06, "loss": -0.0053, "reward": 0.36870962381362915, "reward_std": 0.134088933467865, "rewards/preference_model_reward": 0.36870962381362915, "rewards/preference_model_reward/std": 0.1340889185667038, "step": 93 }, { "clip_ratio": 0.001708789262920618, "epoch": 0.07979626485568761, "grad_norm": 1.2611380216933197, "kl": 0.10888671875, "learning_rate": 2e-06, "loss": -0.0061, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 345.84375, "epoch": 0.08064516129032258, "grad_norm": 0.7996421963852738, "kl": 0.1142578125, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.11349868029356003, "reward_std": 0.08194027096033096, "rewards/preference_model_reward": 0.11349868029356003, "rewards/preference_model_reward/std": 0.08194026350975037, "step": 95 }, { "clip_ratio": 0.000727824226487428, "epoch": 0.08149405772495756, "grad_norm": 0.851345288015861, "kl": 0.11474609375, "learning_rate": 2e-06, "loss": -0.0004, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 496.8125, "epoch": 0.08234295415959253, "grad_norm": 0.7233456374057684, "kl": 0.0771484375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.062306515872478485, "reward_std": 0.06466341018676758, "rewards/preference_model_reward": 0.062306515872478485, "rewards/preference_model_reward/std": 0.06466341018676758, "step": 97 }, { "clip_ratio": 0.0024484877940267324, "epoch": 0.0831918505942275, "grad_norm": 0.5356124474945269, "kl": 0.0751953125, "learning_rate": 2e-06, "loss": 0.0003, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 346.46875, "epoch": 0.08404074702886248, "grad_norm": 0.9856700443791508, "kl": 0.08447265625, "learning_rate": 2e-06, "loss": -0.0026, "reward": 0.128938689827919, "reward_std": 0.10464771091938019, "rewards/preference_model_reward": 0.128938689827919, "rewards/preference_model_reward/std": 0.10464771091938019, "step": 99 }, { "clip_ratio": 0.00026480897213332355, "epoch": 0.08488964346349745, "grad_norm": 0.9630686055340228, "kl": 0.08642578125, "learning_rate": 2e-06, "loss": -0.0032, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 265.71875, "epoch": 0.08573853989813243, "grad_norm": 1.4006922971408575, "kl": 0.099609375, "learning_rate": 2e-06, "loss": 0.0017, "reward": 0.20826829969882965, "reward_std": 0.08805741369724274, "rewards/preference_model_reward": 0.20826829969882965, "rewards/preference_model_reward/std": 0.08805741369724274, "step": 101 }, { "clip_ratio": 0.00047059552161954343, "epoch": 0.0865874363327674, "grad_norm": 1.0093723749145036, "kl": 0.099609375, "learning_rate": 2e-06, "loss": 0.0014, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 352.34375, "epoch": 0.08743633276740238, "grad_norm": 0.7907866558806076, "kl": 0.099609375, "learning_rate": 2e-06, "loss": -0.0069, "reward": 0.1357034146785736, "reward_std": 0.08183176815509796, "rewards/preference_model_reward": 0.1357034146785736, "rewards/preference_model_reward/std": 0.08183176815509796, "step": 103 }, { "clip_ratio": 0.00016943408991210163, "epoch": 0.08828522920203735, "grad_norm": 0.8163727108346147, "kl": 0.1005859375, "learning_rate": 2e-06, "loss": -0.0074, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 405.1875, "epoch": 0.08913412563667232, "grad_norm": 1.2455238091620369, "kl": 0.1015625, "learning_rate": 2e-06, "loss": -0.0006, "reward": 0.32809197902679443, "reward_std": 0.13614021241664886, "rewards/preference_model_reward": 0.32809197902679443, "rewards/preference_model_reward/std": 0.13614021241664886, "step": 105 }, { "clip_ratio": 0.00047510667354799807, "epoch": 0.0899830220713073, "grad_norm": 1.175271310166888, "kl": 0.1044921875, "learning_rate": 2e-06, "loss": -0.0012, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 507.28125, "epoch": 0.09083191850594227, "grad_norm": 1.4891793433061915, "kl": 0.09521484375, "learning_rate": 2e-06, "loss": 0.0058, "reward": 0.3271234631538391, "reward_std": 0.11905878782272339, "rewards/preference_model_reward": 0.3271234631538391, "rewards/preference_model_reward/std": 0.11905878782272339, "step": 107 }, { "clip_ratio": 0.0007332629174925387, "epoch": 0.09168081494057725, "grad_norm": 0.9610577492843277, "kl": 0.09814453125, "learning_rate": 2e-06, "loss": 0.0054, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 677.375, "epoch": 0.09252971137521222, "grad_norm": 0.5710959299764182, "kl": 0.1005859375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.3096367120742798, "reward_std": 0.08406942337751389, "rewards/preference_model_reward": 0.3096367120742798, "rewards/preference_model_reward/std": 0.08406941592693329, "step": 109 }, { "clip_ratio": 0.00041483400855213404, "epoch": 0.0933786078098472, "grad_norm": 0.5381736695457521, "kl": 0.10205078125, "learning_rate": 2e-06, "loss": 0.0001, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 408.78125, "epoch": 0.09422750424448217, "grad_norm": 1.04134918622721, "kl": 0.130859375, "learning_rate": 2e-06, "loss": -0.0019, "reward": 0.16213001310825348, "reward_std": 0.09974581748247147, "rewards/preference_model_reward": 0.16213001310825348, "rewards/preference_model_reward/std": 0.09974581748247147, "step": 111 }, { "clip_ratio": 0.0008149376371875405, "epoch": 0.09507640067911714, "grad_norm": 0.9160731616594122, "kl": 0.1328125, "learning_rate": 2e-06, "loss": -0.0024, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 373.96875, "epoch": 0.09592529711375213, "grad_norm": 1.0946577521442298, "kl": 0.12158203125, "learning_rate": 2e-06, "loss": 0.007, "reward": 0.2697640657424927, "reward_std": 0.10352278500795364, "rewards/preference_model_reward": 0.2697640657424927, "rewards/preference_model_reward/std": 0.10352278500795364, "step": 113 }, { "clip_ratio": 0.0013311142101883888, "epoch": 0.0967741935483871, "grad_norm": 0.9667455701612728, "kl": 0.12353515625, "learning_rate": 2e-06, "loss": 0.0065, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 306.0, "epoch": 0.09762308998302208, "grad_norm": 1.3217916635151856, "kl": 0.12890625, "learning_rate": 2e-06, "loss": 0.0095, "reward": 0.26093003153800964, "reward_std": 0.125474750995636, "rewards/preference_model_reward": 0.26093003153800964, "rewards/preference_model_reward/std": 0.125474750995636, "step": 115 }, { "clip_ratio": 0.0, "epoch": 0.09847198641765705, "grad_norm": 1.249179472861132, "kl": 0.1318359375, "learning_rate": 2e-06, "loss": 0.0088, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 418.71875, "epoch": 0.09932088285229201, "grad_norm": 1.0159825028105314, "kl": 0.126953125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.21273019909858704, "reward_std": 0.10706693679094315, "rewards/preference_model_reward": 0.21273019909858704, "rewards/preference_model_reward/std": 0.10706692934036255, "step": 117 }, { "clip_ratio": 0.00037145469104871154, "epoch": 0.100169779286927, "grad_norm": 0.932911697631841, "kl": 0.12890625, "learning_rate": 2e-06, "loss": -0.0002, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 555.0625, "epoch": 0.10101867572156197, "grad_norm": 1.3593075889098412, "kl": 0.1123046875, "learning_rate": 2e-06, "loss": 0.0152, "reward": 0.42055854201316833, "reward_std": 0.1595481038093567, "rewards/preference_model_reward": 0.42055854201316833, "rewards/preference_model_reward/std": 0.1595481038093567, "step": 119 }, { "clip_ratio": 0.0009689436410553753, "epoch": 0.10186757215619695, "grad_norm": 1.5262381686745565, "kl": 0.115234375, "learning_rate": 2e-06, "loss": 0.0144, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 357.5625, "epoch": 0.10271646859083192, "grad_norm": 0.6284715836457387, "kl": 0.1533203125, "learning_rate": 2e-06, "loss": -0.0006, "reward": 0.5042369365692139, "reward_std": 0.05617382749915123, "rewards/preference_model_reward": 0.5042369365692139, "rewards/preference_model_reward/std": 0.05617383494973183, "step": 121 }, { "clip_ratio": 0.0016699727857485414, "epoch": 0.1035653650254669, "grad_norm": 0.5676292275365208, "kl": 0.154296875, "learning_rate": 2e-06, "loss": -0.0009, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 413.6875, "epoch": 0.10441426146010187, "grad_norm": 0.6166063462451017, "kl": 0.1240234375, "learning_rate": 2e-06, "loss": -0.0012, "reward": 0.4225958585739136, "reward_std": 0.06309302896261215, "rewards/preference_model_reward": 0.4225958585739136, "rewards/preference_model_reward/std": 0.06309301406145096, "step": 123 }, { "clip_ratio": 0.001325472490862012, "epoch": 0.10526315789473684, "grad_norm": 0.5030461455471626, "kl": 0.1240234375, "learning_rate": 2e-06, "loss": -0.0014, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 654.625, "epoch": 0.10611205432937182, "grad_norm": 0.9061656748551837, "kl": 0.1416015625, "learning_rate": 2e-06, "loss": -0.0003, "reward": 0.26992088556289673, "reward_std": 0.12210524082183838, "rewards/preference_model_reward": 0.26992088556289673, "rewards/preference_model_reward/std": 0.12210523337125778, "step": 125 }, { "clip_ratio": 0.0003366470627952367, "epoch": 0.10696095076400679, "grad_norm": 0.8859074333947582, "kl": 0.142578125, "learning_rate": 2e-06, "loss": -0.0008, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 318.96875, "epoch": 0.10780984719864177, "grad_norm": 0.7830937452002261, "kl": 0.07763671875, "learning_rate": 2e-06, "loss": -0.0029, "reward": 0.31064295768737793, "reward_std": 0.0781577080488205, "rewards/preference_model_reward": 0.31064295768737793, "rewards/preference_model_reward/std": 0.07815771549940109, "step": 127 }, { "clip_ratio": 0.0004959848592989147, "epoch": 0.10865874363327674, "grad_norm": 0.7465755882530519, "kl": 0.07861328125, "learning_rate": 2e-06, "loss": -0.0034, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 285.4375, "epoch": 0.10950764006791172, "grad_norm": 0.6716474625246404, "kl": 0.150390625, "learning_rate": 2e-06, "loss": -0.0014, "reward": 0.4774589240550995, "reward_std": 0.04319845512509346, "rewards/preference_model_reward": 0.4774589240550995, "rewards/preference_model_reward/std": 0.04319845885038376, "step": 129 }, { "clip_ratio": 0.0006459264550358057, "epoch": 0.11035653650254669, "grad_norm": 0.4277286893768862, "kl": 0.15234375, "learning_rate": 2e-06, "loss": -0.0016, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 230.78125, "epoch": 0.11120543293718166, "grad_norm": 0.9935245836857972, "kl": 0.169921875, "learning_rate": 2e-06, "loss": -0.0023, "reward": 0.3256058692932129, "reward_std": 0.07717268913984299, "rewards/preference_model_reward": 0.3256058692932129, "rewards/preference_model_reward/std": 0.07717268913984299, "step": 131 }, { "clip_ratio": 0.00220286101102829, "epoch": 0.11205432937181664, "grad_norm": 0.9136352224117723, "kl": 0.16796875, "learning_rate": 2e-06, "loss": -0.0028, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 350.46875, "epoch": 0.11290322580645161, "grad_norm": 0.5870925563537904, "kl": 0.1640625, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.08773044496774673, "reward_std": 0.06896770745515823, "rewards/preference_model_reward": 0.08773044496774673, "rewards/preference_model_reward/std": 0.06896770745515823, "step": 133 }, { "clip_ratio": 0.0005350956926122308, "epoch": 0.11375212224108659, "grad_norm": 0.5639384325042808, "kl": 0.16015625, "learning_rate": 2e-06, "loss": -0.0002, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 528.0625, "epoch": 0.11460101867572156, "grad_norm": 0.48480451064325536, "kl": 0.12109375, "learning_rate": 2e-06, "loss": -0.0014, "reward": 0.4244787096977234, "reward_std": 0.05737914890050888, "rewards/preference_model_reward": 0.4244787096977234, "rewards/preference_model_reward/std": 0.05737914890050888, "step": 135 }, { "clip_ratio": 0.000636638724245131, "epoch": 0.11544991511035653, "grad_norm": 0.4122336602818054, "kl": 0.11328125, "learning_rate": 2e-06, "loss": -0.0017, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 257.625, "epoch": 0.11629881154499151, "grad_norm": 1.4922729095300895, "kl": 0.203125, "learning_rate": 2e-06, "loss": -0.0004, "reward": 0.3075970411300659, "reward_std": 0.10743933171033859, "rewards/preference_model_reward": 0.3075970411300659, "rewards/preference_model_reward/std": 0.10743933171033859, "step": 137 }, { "clip_ratio": 0.0006169785629026592, "epoch": 0.11714770797962648, "grad_norm": 1.1808501339893651, "kl": 0.203125, "learning_rate": 2e-06, "loss": -0.001, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 306.9375, "epoch": 0.11799660441426146, "grad_norm": 1.0160072383114551, "kl": 0.16796875, "learning_rate": 2e-06, "loss": -0.0061, "reward": 0.46265456080436707, "reward_std": 0.11661313474178314, "rewards/preference_model_reward": 0.46265456080436707, "rewards/preference_model_reward/std": 0.11661314219236374, "step": 139 }, { "clip_ratio": 0.0003987574018537998, "epoch": 0.11884550084889643, "grad_norm": 0.9531276983222098, "kl": 0.166015625, "learning_rate": 2e-06, "loss": -0.0068, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 503.5, "epoch": 0.11969439728353141, "grad_norm": 1.1419798492524438, "kl": 0.1787109375, "learning_rate": 2e-06, "loss": 0.0048, "reward": 0.20830851793289185, "reward_std": 0.13461197912693024, "rewards/preference_model_reward": 0.20830851793289185, "rewards/preference_model_reward/std": 0.13461197912693024, "step": 141 }, { "clip_ratio": 0.00026064369012601674, "epoch": 0.12054329371816638, "grad_norm": 1.072087725228961, "kl": 0.1806640625, "learning_rate": 2e-06, "loss": 0.004, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 384.1875, "epoch": 0.12139219015280135, "grad_norm": 1.363810772987835, "kl": 0.1884765625, "learning_rate": 2e-06, "loss": -0.0039, "reward": 0.24709536135196686, "reward_std": 0.13785149157047272, "rewards/preference_model_reward": 0.24709536135196686, "rewards/preference_model_reward/std": 0.1378515064716339, "step": 143 }, { "clip_ratio": 0.00023824731761123985, "epoch": 0.12224108658743633, "grad_norm": 1.280172314576326, "kl": 0.19140625, "learning_rate": 2e-06, "loss": -0.0047, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 425.90625, "epoch": 0.1230899830220713, "grad_norm": 0.5570698552144494, "kl": 0.115234375, "learning_rate": 2e-06, "loss": -0.0017, "reward": 0.12276525795459747, "reward_std": 0.06314485520124435, "rewards/preference_model_reward": 0.12276525795459747, "rewards/preference_model_reward/std": 0.06314485520124435, "step": 145 }, { "clip_ratio": 0.0003630488063208759, "epoch": 0.12393887945670629, "grad_norm": 0.5707319334424644, "kl": 0.11474609375, "learning_rate": 2e-06, "loss": -0.002, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 331.625, "epoch": 0.12478777589134125, "grad_norm": 0.8955034856932126, "kl": 0.189453125, "learning_rate": 2e-06, "loss": -0.002, "reward": 0.06658346205949783, "reward_std": 0.05682339146733284, "rewards/preference_model_reward": 0.06658346205949783, "rewards/preference_model_reward/std": 0.05682339146733284, "step": 147 }, { "clip_ratio": 0.0011066581355407834, "epoch": 0.12563667232597622, "grad_norm": 0.6244418580201853, "kl": 0.19140625, "learning_rate": 2e-06, "loss": -0.0023, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 393.15625, "epoch": 0.1264855687606112, "grad_norm": 1.2895611117206778, "kl": 0.1875, "learning_rate": 2e-06, "loss": -0.0053, "reward": 0.25148850679397583, "reward_std": 0.12007515132427216, "rewards/preference_model_reward": 0.25148850679397583, "rewards/preference_model_reward/std": 0.12007514387369156, "step": 149 }, { "clip_ratio": 0.0006402829312719405, "epoch": 0.1273344651952462, "grad_norm": 1.288543979995357, "kl": 0.189453125, "learning_rate": 2e-06, "loss": -0.0061, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 440.78125, "epoch": 0.12818336162988114, "grad_norm": 1.2555636888842705, "kl": 0.193359375, "learning_rate": 2e-06, "loss": -0.0041, "reward": 0.2623947262763977, "reward_std": 0.11035064607858658, "rewards/preference_model_reward": 0.2623947262763977, "rewards/preference_model_reward/std": 0.11035064607858658, "step": 151 }, { "clip_ratio": 0.003197396406903863, "epoch": 0.12903225806451613, "grad_norm": 1.1625036279466894, "kl": 0.197265625, "learning_rate": 2e-06, "loss": -0.0047, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 271.59375, "epoch": 0.1298811544991511, "grad_norm": 1.096093984535528, "kl": 0.251953125, "learning_rate": 2e-06, "loss": -0.0016, "reward": 0.3598458170890808, "reward_std": 0.08515099436044693, "rewards/preference_model_reward": 0.3598458170890808, "rewards/preference_model_reward/std": 0.08515099436044693, "step": 153 }, { "clip_ratio": 0.0012925309129059315, "epoch": 0.1307300509337861, "grad_norm": 1.00178630558753, "kl": 0.255859375, "learning_rate": 2e-06, "loss": -0.0023, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 415.5, "epoch": 0.13157894736842105, "grad_norm": 1.1081030432870604, "kl": 0.177734375, "learning_rate": 2e-06, "loss": 0.0115, "reward": 0.3061649799346924, "reward_std": 0.11071331799030304, "rewards/preference_model_reward": 0.3061649799346924, "rewards/preference_model_reward/std": 0.11071331799030304, "step": 155 }, { "clip_ratio": 0.0003086737706325948, "epoch": 0.13242784380305603, "grad_norm": 1.0874086536996357, "kl": 0.177734375, "learning_rate": 2e-06, "loss": 0.0109, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 417.03125, "epoch": 0.133276740237691, "grad_norm": 0.7599994731662741, "kl": 0.177734375, "learning_rate": 2e-06, "loss": -0.0007, "reward": 0.48063063621520996, "reward_std": 0.06754690408706665, "rewards/preference_model_reward": 0.48063063621520996, "rewards/preference_model_reward/std": 0.06754691153764725, "step": 157 }, { "clip_ratio": 0.0018070859368890524, "epoch": 0.13412563667232597, "grad_norm": 0.597446098810492, "kl": 0.1650390625, "learning_rate": 2e-06, "loss": -0.001, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 383.9375, "epoch": 0.13497453310696095, "grad_norm": 0.7677380389143111, "kl": 0.2041015625, "learning_rate": 2e-06, "loss": -0.0046, "reward": 0.1170925423502922, "reward_std": 0.07494159787893295, "rewards/preference_model_reward": 0.1170925423502922, "rewards/preference_model_reward/std": 0.07494159787893295, "step": 159 }, { "clip_ratio": 0.0012223758967593312, "epoch": 0.13582342954159593, "grad_norm": 0.7399797210592777, "kl": 0.203125, "learning_rate": 2e-06, "loss": -0.0051, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 722.25, "epoch": 0.1366723259762309, "grad_norm": 0.8765706673008182, "kl": 0.189453125, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.3232240676879883, "reward_std": 0.09787525236606598, "rewards/preference_model_reward": 0.3232240676879883, "rewards/preference_model_reward/std": 0.09787525236606598, "step": 161 }, { "clip_ratio": 0.0010285093449056149, "epoch": 0.13752122241086587, "grad_norm": 0.762896466304412, "kl": 0.1875, "learning_rate": 2e-06, "loss": -0.0002, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 714.09375, "epoch": 0.13837011884550085, "grad_norm": 1.1627137317630705, "kl": 0.12353515625, "learning_rate": 2e-06, "loss": 0.0081, "reward": 0.3034539818763733, "reward_std": 0.15702968835830688, "rewards/preference_model_reward": 0.3034539818763733, "rewards/preference_model_reward/std": 0.15702970325946808, "step": 163 }, { "clip_ratio": 0.0006113144336268306, "epoch": 0.13921901528013583, "grad_norm": 1.110258121456432, "kl": 0.12158203125, "learning_rate": 2e-06, "loss": 0.0073, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 391.21875, "epoch": 0.1400679117147708, "grad_norm": 1.089926535448989, "kl": 0.1708984375, "learning_rate": 2e-06, "loss": 0.0037, "reward": 0.22248202562332153, "reward_std": 0.11353754997253418, "rewards/preference_model_reward": 0.22248202562332153, "rewards/preference_model_reward/std": 0.11353754997253418, "step": 165 }, { "clip_ratio": 0.0005485577858053148, "epoch": 0.14091680814940577, "grad_norm": 0.9711244878108193, "kl": 0.1708984375, "learning_rate": 2e-06, "loss": 0.0031, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 426.21875, "epoch": 0.14176570458404075, "grad_norm": 1.2634303413205847, "kl": 0.2392578125, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.23580655455589294, "reward_std": 0.12878787517547607, "rewards/preference_model_reward": 0.23580655455589294, "rewards/preference_model_reward/std": 0.12878787517547607, "step": 167 }, { "clip_ratio": 0.0008147264015860856, "epoch": 0.14261460101867574, "grad_norm": 1.2475202369294187, "kl": 0.2392578125, "learning_rate": 2e-06, "loss": 0.0007, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 420.84375, "epoch": 0.1434634974533107, "grad_norm": 0.9256116658563932, "kl": 0.1806640625, "learning_rate": 2e-06, "loss": 0.0017, "reward": 0.19842864573001862, "reward_std": 0.09470146149396896, "rewards/preference_model_reward": 0.19842864573001862, "rewards/preference_model_reward/std": 0.09470146149396896, "step": 169 }, { "clip_ratio": 0.0016190335154533386, "epoch": 0.14431239388794567, "grad_norm": 0.8308927390156897, "kl": 0.1796875, "learning_rate": 2e-06, "loss": 0.0012, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 511.125, "epoch": 0.14516129032258066, "grad_norm": 0.9408637846666039, "kl": 0.181640625, "learning_rate": 2e-06, "loss": 0.0081, "reward": 0.19552090764045715, "reward_std": 0.1133582592010498, "rewards/preference_model_reward": 0.19552090764045715, "rewards/preference_model_reward/std": 0.11335825175046921, "step": 171 }, { "clip_ratio": 0.000971193250734359, "epoch": 0.1460101867572156, "grad_norm": 0.946800803701003, "kl": 0.181640625, "learning_rate": 2e-06, "loss": 0.0076, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 645.34375, "epoch": 0.1468590831918506, "grad_norm": 0.8208513143064141, "kl": 0.22265625, "learning_rate": 2e-06, "loss": 0.002, "reward": 0.4708970785140991, "reward_std": 0.10244568437337875, "rewards/preference_model_reward": 0.4708970785140991, "rewards/preference_model_reward/std": 0.10244568437337875, "step": 173 }, { "clip_ratio": 0.0008575035026296973, "epoch": 0.14770797962648557, "grad_norm": 0.777907252016811, "kl": 0.220703125, "learning_rate": 2e-06, "loss": 0.0015, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 352.5625, "epoch": 0.14855687606112053, "grad_norm": 0.7724021747968359, "kl": 0.26171875, "learning_rate": 2e-06, "loss": -0.0009, "reward": 0.28050410747528076, "reward_std": 0.07774435728788376, "rewards/preference_model_reward": 0.28050410747528076, "rewards/preference_model_reward/std": 0.07774436473846436, "step": 175 }, { "clip_ratio": 0.0007047850522212684, "epoch": 0.1494057724957555, "grad_norm": 0.8617498914678887, "kl": 0.26171875, "learning_rate": 2e-06, "loss": -0.0014, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 528.8125, "epoch": 0.1502546689303905, "grad_norm": 1.0109383309762656, "kl": 0.19921875, "learning_rate": 2e-06, "loss": 0.0031, "reward": 0.400208055973053, "reward_std": 0.11498203873634338, "rewards/preference_model_reward": 0.400208055973053, "rewards/preference_model_reward/std": 0.11498204618692398, "step": 177 }, { "clip_ratio": 0.0015235163737088442, "epoch": 0.15110356536502548, "grad_norm": 0.9751593675624571, "kl": 0.19921875, "learning_rate": 2e-06, "loss": 0.0024, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 364.84375, "epoch": 0.15195246179966043, "grad_norm": 1.0436702904156618, "kl": 0.1962890625, "learning_rate": 2e-06, "loss": -0.0041, "reward": 0.431307315826416, "reward_std": 0.10992471128702164, "rewards/preference_model_reward": 0.431307315826416, "rewards/preference_model_reward/std": 0.10992471128702164, "step": 179 }, { "clip_ratio": 0.00033796619391068816, "epoch": 0.15280135823429541, "grad_norm": 0.9638853656528253, "kl": 0.1953125, "learning_rate": 2e-06, "loss": -0.0048, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 421.03125, "epoch": 0.1536502546689304, "grad_norm": 1.0786137221446535, "kl": 0.2158203125, "learning_rate": 2e-06, "loss": 0.0106, "reward": 0.26917120814323425, "reward_std": 0.11035769432783127, "rewards/preference_model_reward": 0.26917120814323425, "rewards/preference_model_reward/std": 0.11035769432783127, "step": 181 }, { "clip_ratio": 0.0005866018473170698, "epoch": 0.15449915110356535, "grad_norm": 1.0797645934269513, "kl": 0.21484375, "learning_rate": 2e-06, "loss": 0.01, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 340.40625, "epoch": 0.15534804753820033, "grad_norm": 0.9516406822692116, "kl": 0.21484375, "learning_rate": 2e-06, "loss": -0.0008, "reward": 0.4257683753967285, "reward_std": 0.08008842915296555, "rewards/preference_model_reward": 0.4257683753967285, "rewards/preference_model_reward/std": 0.08008842915296555, "step": 183 }, { "clip_ratio": 0.0007616700022481382, "epoch": 0.15619694397283532, "grad_norm": 1.0243333961812868, "kl": 0.2158203125, "learning_rate": 2e-06, "loss": -0.0014, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 309.3125, "epoch": 0.1570458404074703, "grad_norm": 1.0966611105512216, "kl": 0.19140625, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.44364723563194275, "reward_std": 0.08954507112503052, "rewards/preference_model_reward": 0.44364723563194275, "rewards/preference_model_reward/std": 0.08954507112503052, "step": 185 }, { "clip_ratio": 0.0007273735827766359, "epoch": 0.15789473684210525, "grad_norm": 0.8995850425186181, "kl": 0.19140625, "learning_rate": 2e-06, "loss": -0.0006, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 350.03125, "epoch": 0.15874363327674024, "grad_norm": 1.0365032710666704, "kl": 0.1943359375, "learning_rate": 2e-06, "loss": -0.0025, "reward": 0.503156304359436, "reward_std": 0.06975705921649933, "rewards/preference_model_reward": 0.503156304359436, "rewards/preference_model_reward/std": 0.06975706666707993, "step": 187 }, { "clip_ratio": 0.003309250809252262, "epoch": 0.15959252971137522, "grad_norm": 0.7719048533542018, "kl": 0.1953125, "learning_rate": 2e-06, "loss": -0.0028, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 469.40625, "epoch": 0.16044142614601017, "grad_norm": 1.2152567376815486, "kl": 0.19140625, "learning_rate": 2e-06, "loss": -0.0018, "reward": 0.2653403878211975, "reward_std": 0.13634686172008514, "rewards/preference_model_reward": 0.2653403878211975, "rewards/preference_model_reward/std": 0.13634686172008514, "step": 189 }, { "clip_ratio": 0.00013139564543962479, "epoch": 0.16129032258064516, "grad_norm": 1.1599347877086612, "kl": 0.19140625, "learning_rate": 2e-06, "loss": -0.0027, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 387.0625, "epoch": 0.16213921901528014, "grad_norm": 1.1561729382206092, "kl": 0.21484375, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.3497307002544403, "reward_std": 0.11881572753190994, "rewards/preference_model_reward": 0.3497307002544403, "rewards/preference_model_reward/std": 0.11881572753190994, "step": 191 }, { "clip_ratio": 0.0006593581638298929, "epoch": 0.16298811544991512, "grad_norm": 1.0934785615900324, "kl": 0.216796875, "learning_rate": 2e-06, "loss": -0.0005, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 466.8125, "epoch": 0.16383701188455008, "grad_norm": 0.5519710984838219, "kl": 0.1845703125, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.10650002956390381, "reward_std": 0.059550777077674866, "rewards/preference_model_reward": 0.10650002956390381, "rewards/preference_model_reward/std": 0.059550777077674866, "step": 193 }, { "clip_ratio": 0.0010145865380764008, "epoch": 0.16468590831918506, "grad_norm": 0.8499180844539812, "kl": 0.185546875, "learning_rate": 2e-06, "loss": 0.001, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 556.59375, "epoch": 0.16553480475382004, "grad_norm": 0.9414095054306995, "kl": 0.13671875, "learning_rate": 2e-06, "loss": 0.0036, "reward": 0.35372745990753174, "reward_std": 0.11836274713277817, "rewards/preference_model_reward": 0.35372745990753174, "rewards/preference_model_reward/std": 0.11836273968219757, "step": 195 }, { "clip_ratio": 0.00011072463530581445, "epoch": 0.166383701188455, "grad_norm": 0.8517491157753083, "kl": 0.13671875, "learning_rate": 2e-06, "loss": 0.0031, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 441.21875, "epoch": 0.16723259762308998, "grad_norm": 1.0176591850675871, "kl": 0.203125, "learning_rate": 2e-06, "loss": -0.003, "reward": 0.32411473989486694, "reward_std": 0.10493102669715881, "rewards/preference_model_reward": 0.32411473989486694, "rewards/preference_model_reward/std": 0.10493102669715881, "step": 197 }, { "clip_ratio": 0.0005816287593916059, "epoch": 0.16808149405772496, "grad_norm": 0.9532792399626693, "kl": 0.2041015625, "learning_rate": 2e-06, "loss": -0.0036, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 693.71875, "epoch": 0.16893039049235994, "grad_norm": 0.6157709458820001, "kl": 0.17578125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.5120692849159241, "reward_std": 0.08368350565433502, "rewards/preference_model_reward": 0.5120692849159241, "rewards/preference_model_reward/std": 0.08368349820375443, "step": 199 }, { "clip_ratio": 0.000810971308965236, "epoch": 0.1697792869269949, "grad_norm": 0.5744963108218382, "kl": 0.1748046875, "learning_rate": 2e-06, "loss": 0.0008, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 499.34375, "epoch": 0.17062818336162988, "grad_norm": 1.171086543231942, "kl": 0.181640625, "learning_rate": 2e-06, "loss": -0.0044, "reward": 0.3279721438884735, "reward_std": 0.14420974254608154, "rewards/preference_model_reward": 0.3279721438884735, "rewards/preference_model_reward/std": 0.14420974254608154, "step": 201 }, { "clip_ratio": 0.0011094075161963701, "epoch": 0.17147707979626486, "grad_norm": 1.121059254698811, "kl": 0.1806640625, "learning_rate": 2e-06, "loss": -0.0052, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 292.6875, "epoch": 0.17232597623089982, "grad_norm": 0.8874841846577225, "kl": 0.1083984375, "learning_rate": 2e-06, "loss": -0.0089, "reward": 0.27286988496780396, "reward_std": 0.09546167403459549, "rewards/preference_model_reward": 0.27286988496780396, "rewards/preference_model_reward/std": 0.09546167403459549, "step": 203 }, { "clip_ratio": 0.00020234723342582583, "epoch": 0.1731748726655348, "grad_norm": 0.8726296645362996, "kl": 0.1083984375, "learning_rate": 2e-06, "loss": -0.0095, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 504.875, "epoch": 0.17402376910016978, "grad_norm": 2.7391485484558045, "kl": 0.1904296875, "learning_rate": 2e-06, "loss": -0.0018, "reward": 0.41756588220596313, "reward_std": 0.11001207679510117, "rewards/preference_model_reward": 0.41756588220596313, "rewards/preference_model_reward/std": 0.11001206934452057, "step": 205 }, { "clip_ratio": 0.0005510338814929128, "epoch": 0.17487266553480477, "grad_norm": 0.8607142192205527, "kl": 0.189453125, "learning_rate": 2e-06, "loss": -0.002, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 452.84375, "epoch": 0.17572156196943972, "grad_norm": 0.4685652506244187, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.48838815093040466, "reward_std": 0.039198972284793854, "rewards/preference_model_reward": 0.48838815093040466, "rewards/preference_model_reward/std": 0.03919896483421326, "step": 207 }, { "clip_ratio": 0.001379701541736722, "epoch": 0.1765704584040747, "grad_norm": 0.36885619072170894, "kl": 0.1904296875, "learning_rate": 2e-06, "loss": 0.0002, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 457.8125, "epoch": 0.1774193548387097, "grad_norm": 0.660983587706389, "kl": 0.173828125, "learning_rate": 2e-06, "loss": -0.0025, "reward": 0.08188852667808533, "reward_std": 0.06869849562644958, "rewards/preference_model_reward": 0.08188852667808533, "rewards/preference_model_reward/std": 0.06869849562644958, "step": 209 }, { "clip_ratio": 0.0004064367385581136, "epoch": 0.17826825127334464, "grad_norm": 0.5900136843275542, "kl": 0.1728515625, "learning_rate": 2e-06, "loss": -0.0029, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 536.34375, "epoch": 0.17911714770797962, "grad_norm": 1.192737456986498, "kl": 0.208984375, "learning_rate": 2e-06, "loss": 0.0126, "reward": 0.25954583287239075, "reward_std": 0.09377846866846085, "rewards/preference_model_reward": 0.25954583287239075, "rewards/preference_model_reward/std": 0.09377846121788025, "step": 211 }, { "clip_ratio": 0.0005144176539033651, "epoch": 0.1799660441426146, "grad_norm": 0.9034533154555449, "kl": 0.19921875, "learning_rate": 2e-06, "loss": 0.0122, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 503.5625, "epoch": 0.1808149405772496, "grad_norm": 0.9811032516295555, "kl": 0.208984375, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.22673586010932922, "reward_std": 0.1181153729557991, "rewards/preference_model_reward": 0.22673586010932922, "rewards/preference_model_reward/std": 0.1181153655052185, "step": 213 }, { "clip_ratio": 0.0006308910087682307, "epoch": 0.18166383701188454, "grad_norm": 1.2177987016178247, "kl": 0.2060546875, "learning_rate": 2e-06, "loss": 0.0009, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 380.9375, "epoch": 0.18251273344651953, "grad_norm": 1.1883998105439184, "kl": 0.2041015625, "learning_rate": 2e-06, "loss": 0.0026, "reward": 0.3673512935638428, "reward_std": 0.11825986206531525, "rewards/preference_model_reward": 0.3673512935638428, "rewards/preference_model_reward/std": 0.11825986206531525, "step": 215 }, { "clip_ratio": 0.0004939221544191241, "epoch": 0.1833616298811545, "grad_norm": 1.1187570941679779, "kl": 0.2060546875, "learning_rate": 2e-06, "loss": 0.0018, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 482.78125, "epoch": 0.18421052631578946, "grad_norm": 1.2336234795768202, "kl": 0.1943359375, "learning_rate": 2e-06, "loss": -0.0007, "reward": 0.22679108381271362, "reward_std": 0.13555686175823212, "rewards/preference_model_reward": 0.22679108381271362, "rewards/preference_model_reward/std": 0.13555686175823212, "step": 217 }, { "clip_ratio": 0.0012310510501265526, "epoch": 0.18505942275042445, "grad_norm": 1.1626357202169864, "kl": 0.197265625, "learning_rate": 2e-06, "loss": -0.0014, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 319.25, "epoch": 0.18590831918505943, "grad_norm": 0.90605092796741, "kl": 0.18359375, "learning_rate": 2e-06, "loss": -0.0023, "reward": 0.44145655632019043, "reward_std": 0.09349598735570908, "rewards/preference_model_reward": 0.44145655632019043, "rewards/preference_model_reward/std": 0.09349598735570908, "step": 219 }, { "clip_ratio": 0.0001996077917283401, "epoch": 0.1867572156196944, "grad_norm": 0.8461078416569522, "kl": 0.185546875, "learning_rate": 2e-06, "loss": -0.0028, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 469.25, "epoch": 0.18760611205432937, "grad_norm": 1.2596387776384212, "kl": 0.2177734375, "learning_rate": 2e-06, "loss": 0.0053, "reward": 0.3222573697566986, "reward_std": 0.10504135489463806, "rewards/preference_model_reward": 0.3222573697566986, "rewards/preference_model_reward/std": 0.10504135489463806, "step": 221 }, { "clip_ratio": 0.00020092798513360322, "epoch": 0.18845500848896435, "grad_norm": 0.9722158375203807, "kl": 0.21875, "learning_rate": 2e-06, "loss": 0.0048, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 360.34375, "epoch": 0.18930390492359933, "grad_norm": 1.365145072141222, "kl": 0.2392578125, "learning_rate": 2e-06, "loss": 0.0032, "reward": 0.4276666045188904, "reward_std": 0.12780673801898956, "rewards/preference_model_reward": 0.4276666045188904, "rewards/preference_model_reward/std": 0.12780673801898956, "step": 223 }, { "clip_ratio": 0.0016522787045687437, "epoch": 0.19015280135823429, "grad_norm": 1.286088004349321, "kl": 0.2421875, "learning_rate": 2e-06, "loss": 0.0023, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 517.375, "epoch": 0.19100169779286927, "grad_norm": 1.0225752401513326, "kl": 0.2138671875, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.3944551348686218, "reward_std": 0.11224386841058731, "rewards/preference_model_reward": 0.3944551348686218, "rewards/preference_model_reward/std": 0.11224386096000671, "step": 225 }, { "clip_ratio": 0.0007841808255761862, "epoch": 0.19185059422750425, "grad_norm": 0.9149056778209514, "kl": 0.197265625, "learning_rate": 2e-06, "loss": 0.0008, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 291.59375, "epoch": 0.1926994906621392, "grad_norm": 1.288986938092742, "kl": 0.2734375, "learning_rate": 2e-06, "loss": -0.0003, "reward": 0.4484859108924866, "reward_std": 0.04202309623360634, "rewards/preference_model_reward": 0.4484859108924866, "rewards/preference_model_reward/std": 0.04202309623360634, "step": 227 }, { "clip_ratio": 0.00215684762224555, "epoch": 0.1935483870967742, "grad_norm": 0.4977519171941583, "kl": 0.275390625, "learning_rate": 2e-06, "loss": -0.0004, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 518.6875, "epoch": 0.19439728353140917, "grad_norm": 1.1492317219584003, "kl": 0.23046875, "learning_rate": 2e-06, "loss": 0.007, "reward": 0.39957520365715027, "reward_std": 0.12616394460201263, "rewards/preference_model_reward": 0.39957520365715027, "rewards/preference_model_reward/std": 0.12616392970085144, "step": 229 }, { "clip_ratio": 0.0006631789728999138, "epoch": 0.19524617996604415, "grad_norm": 1.1237887647036364, "kl": 0.2314453125, "learning_rate": 2e-06, "loss": 0.0063, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 264.15625, "epoch": 0.1960950764006791, "grad_norm": 1.5446214982606454, "kl": 0.3828125, "learning_rate": 2e-06, "loss": -0.003, "reward": 0.051013268530368805, "reward_std": 0.04769134148955345, "rewards/preference_model_reward": 0.051013268530368805, "rewards/preference_model_reward/std": 0.04769134148955345, "step": 231 }, { "clip_ratio": 0.0015723377000540495, "epoch": 0.1969439728353141, "grad_norm": 0.7937791917154097, "kl": 0.37890625, "learning_rate": 2e-06, "loss": -0.0032, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 653.4375, "epoch": 0.19779286926994907, "grad_norm": 1.285817858002923, "kl": 0.1953125, "learning_rate": 2e-06, "loss": -0.0031, "reward": 0.4576322138309479, "reward_std": 0.12770842015743256, "rewards/preference_model_reward": 0.4576322138309479, "rewards/preference_model_reward/std": 0.12770840525627136, "step": 233 }, { "clip_ratio": 0.0005655796267092228, "epoch": 0.19864176570458403, "grad_norm": 0.9988328328952946, "kl": 0.1943359375, "learning_rate": 2e-06, "loss": -0.0037, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 625.875, "epoch": 0.199490662139219, "grad_norm": 0.5736428283590878, "kl": 0.3359375, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.5476886034011841, "reward_std": 0.019688162952661514, "rewards/preference_model_reward": 0.5476886034011841, "rewards/preference_model_reward/std": 0.01968817040324211, "step": 235 }, { "clip_ratio": 0.004047113005071878, "epoch": 0.200339558573854, "grad_norm": 0.3004588551896628, "kl": 0.267578125, "learning_rate": 2e-06, "loss": 0.0006, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 397.1875, "epoch": 0.20118845500848898, "grad_norm": 0.7407875649985818, "kl": 0.255859375, "learning_rate": 2e-06, "loss": -0.0036, "reward": 0.5196166038513184, "reward_std": 0.0838971957564354, "rewards/preference_model_reward": 0.5196166038513184, "rewards/preference_model_reward/std": 0.0838971957564354, "step": 237 }, { "clip_ratio": 0.000780011061578989, "epoch": 0.20203735144312393, "grad_norm": 0.7989807862404035, "kl": 0.251953125, "learning_rate": 2e-06, "loss": -0.004, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 663.1875, "epoch": 0.2028862478777589, "grad_norm": 1.1362959567666375, "kl": 0.201171875, "learning_rate": 2e-06, "loss": 0.0049, "reward": 0.3731197118759155, "reward_std": 0.13519592583179474, "rewards/preference_model_reward": 0.3731197118759155, "rewards/preference_model_reward/std": 0.13519594073295593, "step": 239 }, { "clip_ratio": 0.0008930441690608859, "epoch": 0.2037351443123939, "grad_norm": 0.9826027457053308, "kl": 0.197265625, "learning_rate": 2e-06, "loss": 0.0043, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 638.65625, "epoch": 0.20458404074702885, "grad_norm": 1.2194567876973144, "kl": 0.12890625, "learning_rate": 2e-06, "loss": 0.0058, "reward": 0.4657590687274933, "reward_std": 0.16892072558403015, "rewards/preference_model_reward": 0.4657590687274933, "rewards/preference_model_reward/std": 0.16892069578170776, "step": 241 }, { "clip_ratio": 0.0005752947181463242, "epoch": 0.20543293718166383, "grad_norm": 2.051110848359452, "kl": 0.12451171875, "learning_rate": 2e-06, "loss": 0.0051, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 337.1875, "epoch": 0.20628183361629882, "grad_norm": 0.4124833822582155, "kl": 0.23046875, "learning_rate": 2e-06, "loss": -0.0006, "reward": 0.5514668226242065, "reward_std": 0.03596644848585129, "rewards/preference_model_reward": 0.5514668226242065, "rewards/preference_model_reward/std": 0.03596644848585129, "step": 243 }, { "clip_ratio": 0.0035508163273334503, "epoch": 0.2071307300509338, "grad_norm": 0.4171934728891184, "kl": 0.224609375, "learning_rate": 2e-06, "loss": -0.0008, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 408.9375, "epoch": 0.20797962648556875, "grad_norm": 2.9889867378208055, "kl": 0.2333984375, "learning_rate": 2e-06, "loss": 0.0025, "reward": 0.2543810307979584, "reward_std": 0.11507824808359146, "rewards/preference_model_reward": 0.2543810307979584, "rewards/preference_model_reward/std": 0.11507824808359146, "step": 245 }, { "clip_ratio": 0.002395933959633112, "epoch": 0.20882852292020374, "grad_norm": 1.1132478545817386, "kl": 0.2314453125, "learning_rate": 2e-06, "loss": 0.0024, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 451.6875, "epoch": 0.20967741935483872, "grad_norm": 0.16838590128648132, "kl": 0.1953125, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.5358486175537109, "reward_std": 0.017411619424819946, "rewards/preference_model_reward": 0.5358486175537109, "rewards/preference_model_reward/std": 0.0174116063863039, "step": 247 }, { "clip_ratio": 0.0004304340109229088, "epoch": 0.21052631578947367, "grad_norm": 0.16083907124687224, "kl": 0.19140625, "learning_rate": 2e-06, "loss": 0.0008, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 373.0, "epoch": 0.21137521222410866, "grad_norm": 1.336981028115652, "kl": 0.1953125, "learning_rate": 2e-06, "loss": -0.0051, "reward": 0.15268400311470032, "reward_std": 0.11865763366222382, "rewards/preference_model_reward": 0.15268400311470032, "rewards/preference_model_reward/std": 0.11865763366222382, "step": 249 }, { "clip_ratio": 0.0005081939161755145, "epoch": 0.21222410865874364, "grad_norm": 1.2864787994059055, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": -0.0058, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 593.5625, "epoch": 0.21307300509337862, "grad_norm": 0.692959080153562, "kl": 0.126953125, "learning_rate": 2e-06, "loss": -0.0002, "reward": 0.09329767525196075, "reward_std": 0.08787816017866135, "rewards/preference_model_reward": 0.09329767525196075, "rewards/preference_model_reward/std": 0.08787816762924194, "step": 251 }, { "clip_ratio": 0.0005807211855426431, "epoch": 0.21392190152801357, "grad_norm": 0.724372938551975, "kl": 0.12451171875, "learning_rate": 2e-06, "loss": -0.0006, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 262.46875, "epoch": 0.21477079796264856, "grad_norm": 1.1375236466013237, "kl": 0.220703125, "learning_rate": 2e-06, "loss": 0.0023, "reward": 0.20732049643993378, "reward_std": 0.09915804862976074, "rewards/preference_model_reward": 0.20732049643993378, "rewards/preference_model_reward/std": 0.09915804862976074, "step": 253 }, { "clip_ratio": 0.0012860854621976614, "epoch": 0.21561969439728354, "grad_norm": 1.1117646406622659, "kl": 0.21875, "learning_rate": 2e-06, "loss": 0.0015, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 488.15625, "epoch": 0.2164685908319185, "grad_norm": 0.83190430608096, "kl": 0.19921875, "learning_rate": 2e-06, "loss": -0.0001, "reward": 0.16658729314804077, "reward_std": 0.0986584797501564, "rewards/preference_model_reward": 0.16658729314804077, "rewards/preference_model_reward/std": 0.0986584797501564, "step": 255 }, { "clip_ratio": 0.00012755101488437504, "epoch": 0.21731748726655348, "grad_norm": 0.6949799847951833, "kl": 0.1962890625, "learning_rate": 2e-06, "loss": -0.0005, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 306.8125, "epoch": 0.21816638370118846, "grad_norm": 0.8460209137088387, "kl": 0.1962890625, "learning_rate": 2e-06, "loss": -0.0024, "reward": 0.13335028290748596, "reward_std": 0.08512399345636368, "rewards/preference_model_reward": 0.13335028290748596, "rewards/preference_model_reward/std": 0.08512399345636368, "step": 257 }, { "clip_ratio": 0.0, "epoch": 0.21901528013582344, "grad_norm": 0.8304477251507847, "kl": 0.1943359375, "learning_rate": 2e-06, "loss": -0.003, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 381.78125, "epoch": 0.2198641765704584, "grad_norm": 1.097994057597742, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": -0.004, "reward": 0.20581325888633728, "reward_std": 0.11145073920488358, "rewards/preference_model_reward": 0.20581325888633728, "rewards/preference_model_reward/std": 0.11145073920488358, "step": 259 }, { "clip_ratio": 0.000915912794880569, "epoch": 0.22071307300509338, "grad_norm": 1.0577251056985664, "kl": 0.1904296875, "learning_rate": 2e-06, "loss": -0.0049, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 410.8125, "epoch": 0.22156196943972836, "grad_norm": 0.9488227851729216, "kl": 0.150390625, "learning_rate": 2e-06, "loss": 0.01, "reward": 0.14306291937828064, "reward_std": 0.08440835028886795, "rewards/preference_model_reward": 0.14306291937828064, "rewards/preference_model_reward/std": 0.08440835028886795, "step": 261 }, { "clip_ratio": 0.0006555670406669378, "epoch": 0.22241086587436332, "grad_norm": 0.9625461445287203, "kl": 0.1494140625, "learning_rate": 2e-06, "loss": 0.0097, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 441.15625, "epoch": 0.2232597623089983, "grad_norm": 1.3378528546974586, "kl": 0.201171875, "learning_rate": 2e-06, "loss": 0.0111, "reward": 0.3541616201400757, "reward_std": 0.14822125434875488, "rewards/preference_model_reward": 0.3541616201400757, "rewards/preference_model_reward/std": 0.14822125434875488, "step": 263 }, { "clip_ratio": 0.0002928848844021559, "epoch": 0.22410865874363328, "grad_norm": 1.44097954206129, "kl": 0.2021484375, "learning_rate": 2e-06, "loss": 0.0103, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 328.5625, "epoch": 0.22495755517826826, "grad_norm": 1.0594223141668664, "kl": 0.228515625, "learning_rate": 2e-06, "loss": -0.0016, "reward": 0.4411153793334961, "reward_std": 0.11199039965867996, "rewards/preference_model_reward": 0.4411153793334961, "rewards/preference_model_reward/std": 0.11199039220809937, "step": 265 }, { "clip_ratio": 0.0003811471979133785, "epoch": 0.22580645161290322, "grad_norm": 1.048487426692034, "kl": 0.2314453125, "learning_rate": 2e-06, "loss": -0.0024, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 501.90625, "epoch": 0.2266553480475382, "grad_norm": 0.764207880163219, "kl": 0.1845703125, "learning_rate": 2e-06, "loss": -0.0022, "reward": 0.4925358295440674, "reward_std": 0.09214819222688675, "rewards/preference_model_reward": 0.4925358295440674, "rewards/preference_model_reward/std": 0.09214819967746735, "step": 267 }, { "clip_ratio": 0.000623343454208225, "epoch": 0.22750424448217318, "grad_norm": 0.7021748192613388, "kl": 0.1845703125, "learning_rate": 2e-06, "loss": -0.0027, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 406.78125, "epoch": 0.22835314091680814, "grad_norm": 1.2534216873477357, "kl": 0.2109375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.346091091632843, "reward_std": 0.11527692526578903, "rewards/preference_model_reward": 0.346091091632843, "rewards/preference_model_reward/std": 0.11527692526578903, "step": 269 }, { "clip_ratio": 0.0006988497916609049, "epoch": 0.22920203735144312, "grad_norm": 1.028430046135459, "kl": 0.2119140625, "learning_rate": 2e-06, "loss": -0.0001, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 474.875, "epoch": 0.2300509337860781, "grad_norm": 13.308680917152811, "kl": 0.7265625, "learning_rate": 2e-06, "loss": -0.0046, "reward": 0.39890241622924805, "reward_std": 0.10348767042160034, "rewards/preference_model_reward": 0.39890241622924805, "rewards/preference_model_reward/std": 0.10348766297101974, "step": 271 }, { "clip_ratio": 0.0009435814572498202, "epoch": 0.23089983022071306, "grad_norm": 0.9508081839275468, "kl": 0.1708984375, "learning_rate": 2e-06, "loss": -0.0057, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 335.78125, "epoch": 0.23174872665534804, "grad_norm": 0.9495076380585504, "kl": 0.1650390625, "learning_rate": 2e-06, "loss": -0.0052, "reward": 0.19895681738853455, "reward_std": 0.10379483550786972, "rewards/preference_model_reward": 0.19895681738853455, "rewards/preference_model_reward/std": 0.10379482805728912, "step": 273 }, { "clip_ratio": 0.0, "epoch": 0.23259762308998302, "grad_norm": 0.9358387855912544, "kl": 0.1650390625, "learning_rate": 2e-06, "loss": -0.0059, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 430.8125, "epoch": 0.233446519524618, "grad_norm": 1.172070244878363, "kl": 0.1748046875, "learning_rate": 2e-06, "loss": 0.0189, "reward": 0.4197525382041931, "reward_std": 0.11745458096265793, "rewards/preference_model_reward": 0.4197525382041931, "rewards/preference_model_reward/std": 0.11745458096265793, "step": 275 }, { "clip_ratio": 0.00059707515174523, "epoch": 0.23429541595925296, "grad_norm": 1.1205887604531841, "kl": 0.173828125, "learning_rate": 2e-06, "loss": 0.0182, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 380.84375, "epoch": 0.23514431239388794, "grad_norm": 1.0667324137631118, "kl": 0.189453125, "learning_rate": 2e-06, "loss": -0.0014, "reward": 0.4866299033164978, "reward_std": 0.10881662368774414, "rewards/preference_model_reward": 0.4866299033164978, "rewards/preference_model_reward/std": 0.10881662368774414, "step": 277 }, { "clip_ratio": 0.0003319675161037594, "epoch": 0.23599320882852293, "grad_norm": 1.4205333564695013, "kl": 0.2265625, "learning_rate": 2e-06, "loss": -0.0021, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 409.96875, "epoch": 0.23684210526315788, "grad_norm": 1.2256158038709593, "kl": 0.1904296875, "learning_rate": 2e-06, "loss": -0.0022, "reward": 0.3310420513153076, "reward_std": 0.13203255832195282, "rewards/preference_model_reward": 0.3310420513153076, "rewards/preference_model_reward/std": 0.13203254342079163, "step": 279 }, { "clip_ratio": 0.00029370313859544694, "epoch": 0.23769100169779286, "grad_norm": 1.140780771833912, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": -0.0031, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 299.8125, "epoch": 0.23853989813242785, "grad_norm": 0.5773592648416564, "kl": 0.18359375, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.06664206832647324, "reward_std": 0.046818241477012634, "rewards/preference_model_reward": 0.06664206832647324, "rewards/preference_model_reward/std": 0.046818237751722336, "step": 281 }, { "clip_ratio": 0.002476999070495367, "epoch": 0.23938879456706283, "grad_norm": 0.5215923415469397, "kl": 0.1796875, "learning_rate": 2e-06, "loss": -0.0002, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 641.5, "epoch": 0.24023769100169778, "grad_norm": 0.3036782782176285, "kl": 0.1865234375, "learning_rate": 2e-06, "loss": -0.0012, "reward": 0.4948778748512268, "reward_std": 0.042491715401411057, "rewards/preference_model_reward": 0.4948778748512268, "rewards/preference_model_reward/std": 0.042491719126701355, "step": 283 }, { "clip_ratio": 0.0014662991743534803, "epoch": 0.24108658743633277, "grad_norm": 1.2693793919900047, "kl": 0.1865234375, "learning_rate": 2e-06, "loss": -0.0014, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 326.25, "epoch": 0.24193548387096775, "grad_norm": 0.9299842399613616, "kl": 0.2353515625, "learning_rate": 2e-06, "loss": -0.0024, "reward": 0.46875959634780884, "reward_std": 0.09210902452468872, "rewards/preference_model_reward": 0.46875959634780884, "rewards/preference_model_reward/std": 0.09210902452468872, "step": 285 }, { "clip_ratio": 0.0002825378905981779, "epoch": 0.2427843803056027, "grad_norm": 1.269996090531689, "kl": 0.236328125, "learning_rate": 2e-06, "loss": -0.0031, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 358.0, "epoch": 0.2436332767402377, "grad_norm": 0.845178527409992, "kl": 0.1904296875, "learning_rate": 2e-06, "loss": -0.0032, "reward": 0.14985330402851105, "reward_std": 0.07015400379896164, "rewards/preference_model_reward": 0.14985330402851105, "rewards/preference_model_reward/std": 0.07015399634838104, "step": 287 }, { "clip_ratio": 0.002010664436966181, "epoch": 0.24448217317487267, "grad_norm": 0.7374485292445523, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": -0.0037, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 494.5625, "epoch": 0.24533106960950765, "grad_norm": 0.9545115708280292, "kl": 0.2119140625, "learning_rate": 2e-06, "loss": 0.0043, "reward": 0.44061005115509033, "reward_std": 0.1013847216963768, "rewards/preference_model_reward": 0.44061005115509033, "rewards/preference_model_reward/std": 0.1013847216963768, "step": 289 }, { "clip_ratio": 0.0008882409892976284, "epoch": 0.2461799660441426, "grad_norm": 1.0760062283966654, "kl": 0.212890625, "learning_rate": 2e-06, "loss": 0.0037, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 622.78125, "epoch": 0.2470288624787776, "grad_norm": 0.2204401442573945, "kl": 0.216796875, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.5982410907745361, "reward_std": 0.01771736703813076, "rewards/preference_model_reward": 0.5982410907745361, "rewards/preference_model_reward/std": 0.01771736331284046, "step": 291 }, { "clip_ratio": 0.000654590898193419, "epoch": 0.24787775891341257, "grad_norm": 0.14373274603213665, "kl": 0.203125, "learning_rate": 2e-06, "loss": 0.0005, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 474.4375, "epoch": 0.24872665534804753, "grad_norm": 1.0199416518092594, "kl": 0.197265625, "learning_rate": 2e-06, "loss": -0.0064, "reward": 0.3177827000617981, "reward_std": 0.13039816915988922, "rewards/preference_model_reward": 0.3177827000617981, "rewards/preference_model_reward/std": 0.13039815425872803, "step": 293 }, { "clip_ratio": 0.0005917281378060579, "epoch": 0.2495755517826825, "grad_norm": 0.9840967197672208, "kl": 0.1953125, "learning_rate": 2e-06, "loss": -0.0071, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 397.0625, "epoch": 0.25042444821731746, "grad_norm": 1.8823223194664953, "kl": 0.2099609375, "learning_rate": 2e-06, "loss": -0.0006, "reward": 0.31412482261657715, "reward_std": 0.1166299358010292, "rewards/preference_model_reward": 0.31412482261657715, "rewards/preference_model_reward/std": 0.11662992835044861, "step": 295 }, { "clip_ratio": 0.0003579020267352462, "epoch": 0.25127334465195245, "grad_norm": 1.1537409422856328, "kl": 0.208984375, "learning_rate": 2e-06, "loss": -0.0011, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 628.5625, "epoch": 0.25212224108658743, "grad_norm": 1.1927096956384737, "kl": 0.1708984375, "learning_rate": 2e-06, "loss": 0.0164, "reward": 0.3190678358078003, "reward_std": 0.14439481496810913, "rewards/preference_model_reward": 0.3190678358078003, "rewards/preference_model_reward/std": 0.14439481496810913, "step": 297 }, { "clip_ratio": 0.000986331608146429, "epoch": 0.2529711375212224, "grad_norm": 1.1803756478909, "kl": 0.1708984375, "learning_rate": 2e-06, "loss": 0.0156, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 357.5, "epoch": 0.2538200339558574, "grad_norm": 2.207926479632091, "kl": 0.2109375, "learning_rate": 2e-06, "loss": -0.0057, "reward": 0.3464009761810303, "reward_std": 0.1354563981294632, "rewards/preference_model_reward": 0.3464009761810303, "rewards/preference_model_reward/std": 0.135456383228302, "step": 299 }, { "clip_ratio": 0.0006329367170110345, "epoch": 0.2546689303904924, "grad_norm": 1.4143240393325465, "kl": 0.2099609375, "learning_rate": 2e-06, "loss": -0.0064, "step": 300 } ], "logging_steps": 1, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }