{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 364, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 147.33334350585938, "completions/mean_terminated_length": 69.71428680419922, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0013736263736263737, "grad_norm": 9.55423641204834, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 6200.0, "reward": 3568.513671875, "reward_std": 2485.587158203125, "rewards/reward_long_completions/mean": 428.75, "rewards/reward_long_completions/std": 372.8414001464844, "rewards/reward_long_sentences/mean": 16.507143020629883, "rewards/reward_long_sentences/std": 23.103343963623047, "rewards/reward_low_threat_score/mean": 0.9998356699943542, "rewards/reward_low_threat_score/std": 0.00024321906676050276, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 256.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 138.2916717529297, "completions/mean_terminated_length": 38.69231033325195, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0027472527472527475, "grad_norm": 5.068399429321289, "kl": 0.0, "learning_rate": 2.197802197802198e-06, "loss": 0.0, "num_tokens": 14375.0, "reward": 3688.550537109375, "reward_std": 3243.63818359375, "rewards/reward_long_completions/mean": 444.375, "rewards/reward_long_completions/std": 409.96221923828125, "rewards/reward_long_sentences/mean": 14.025299072265625, "rewards/reward_long_sentences/std": 9.690044403076172, "rewards/reward_low_threat_score/mean": 0.9998847842216492, "rewards/reward_low_threat_score/std": 6.694205239909934e-06, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 256.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 148.95834350585938, "completions/mean_terminated_length": 58.38461685180664, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.004120879120879121, "grad_norm": 4.424472808837891, "kl": 0.000433737674029544, "learning_rate": 4.395604395604396e-06, "loss": 0.0, "num_tokens": 20442.0, "reward": 3652.406982421875, "reward_std": 2988.04443359375, "rewards/reward_long_completions/mean": 440.2083435058594, "rewards/reward_long_completions/std": 362.528564453125, "rewards/reward_long_sentences/mean": 13.41264820098877, "rewards/reward_long_sentences/std": 9.499146461486816, "rewards/reward_low_threat_score/mean": 0.9998778700828552, "rewards/reward_low_threat_score/std": 1.6613879779470153e-05, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 256.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 150.45834350585938, "completions/mean_terminated_length": 61.153846740722656, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.005494505494505495, "grad_norm": 14.504254341125488, "kl": 0.0007149417651817203, "learning_rate": 6.5934065934065935e-06, "loss": 0.0001, "num_tokens": 27217.0, "reward": 3661.546875, "reward_std": 3005.169677734375, "rewards/reward_long_completions/mean": 437.4166564941406, "rewards/reward_long_completions/std": 374.5256652832031, "rewards/reward_long_sentences/mean": 23.28639793395996, "rewards/reward_long_sentences/std": 35.287818908691406, "rewards/reward_low_threat_score/mean": 0.9998757839202881, "rewards/reward_low_threat_score/std": 4.090140646439977e-05, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5416666666666667, "completions/max_length": 256.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 158.95834350585938, "completions/mean_terminated_length": 44.272727966308594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.006868131868131868, "grad_norm": 6.098145008087158, "kl": 0.0005340282805263996, "learning_rate": 8.791208791208792e-06, "loss": 0.0002, "num_tokens": 33908.0, "reward": 3195.9306640625, "reward_std": 2304.882080078125, "rewards/reward_long_completions/mean": 379.0416564941406, "rewards/reward_long_completions/std": 325.0670166015625, "rewards/reward_long_sentences/mean": 27.268938064575195, "rewards/reward_long_sentences/std": 36.58531188964844, "rewards/reward_low_threat_score/mean": 0.9998847842216492, "rewards/reward_low_threat_score/std": 7.722564987489022e-06, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7083333333333333, "completions/max_length": 256.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 199.1666717529297, "completions/mean_terminated_length": 61.142860412597656, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.008241758241758242, "grad_norm": 5.340779781341553, "kl": 0.0008734992006793618, "learning_rate": 1.0989010989010989e-05, "loss": 0.0003, "num_tokens": 42508.0, "reward": 4396.15673828125, "reward_std": 2179.102294921875, "rewards/reward_long_completions/mean": 529.1666870117188, "rewards/reward_long_completions/std": 295.7225036621094, "rewards/reward_long_sentences/mean": 17.88572883605957, "rewards/reward_long_sentences/std": 11.36023235321045, "rewards/reward_low_threat_score/mean": 0.9998812675476074, "rewards/reward_low_threat_score/std": 1.2293223335291259e-05, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7083333333333333, "completions/max_length": 256.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 61.42857360839844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.009615384615384616, "grad_norm": 11.266074180603027, "kl": 0.0022053613793104887, "learning_rate": 1.3186813186813187e-05, "loss": 0.0054, "num_tokens": 52602.0, "reward": 4036.3828125, "reward_std": 2202.05615234375, "rewards/reward_long_completions/mean": 485.2916564941406, "rewards/reward_long_completions/std": 267.1375732421875, "rewards/reward_long_sentences/mean": 17.853097915649414, "rewards/reward_long_sentences/std": 15.891824722290039, "rewards/reward_low_threat_score/mean": 0.9998841881752014, "rewards/reward_low_threat_score/std": 9.566709195496514e-06, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7916666666666666, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 236.83334350585938, "completions/mean_terminated_length": 164.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.01098901098901099, "grad_norm": 1.1406697034835815, "kl": 0.0028431271202862263, "learning_rate": 1.5384615384615387e-05, "loss": 0.0001, "num_tokens": 60054.0, "reward": 4962.908203125, "reward_std": 1260.50732421875, "rewards/reward_long_completions/mean": 595.4583129882812, "rewards/reward_long_completions/std": 224.28805541992188, "rewards/reward_long_sentences/mean": 25.076269149780273, "rewards/reward_long_sentences/std": 21.472362518310547, "rewards/reward_low_threat_score/mean": 0.9998769760131836, "rewards/reward_low_threat_score/std": 2.3236994820763357e-05, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 238.95834350585938, "completions/mean_terminated_length": 119.66667175292969, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.012362637362637362, "grad_norm": 1.6051141023635864, "kl": 0.0048497626557946205, "learning_rate": 1.7582417582417584e-05, "loss": 0.0002, "num_tokens": 69165.0, "reward": 5187.63232421875, "reward_std": 1470.990478515625, "rewards/reward_long_completions/mean": 619.7916870117188, "rewards/reward_long_completions/std": 200.0610809326172, "rewards/reward_long_sentences/mean": 32.86149215698242, "rewards/reward_long_sentences/std": 29.214521408081055, "rewards/reward_low_threat_score/mean": 0.9998783469200134, "rewards/reward_low_threat_score/std": 1.8587061276775785e-05, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.013736263736263736, "grad_norm": 0.9508539438247681, "kl": 0.006186956539750099, "learning_rate": 1.978021978021978e-05, "loss": 0.0002, "num_tokens": 77573.0, "reward": 5730.0625, "reward_std": 1434.03857421875, "rewards/reward_long_completions/mean": 689.0, "rewards/reward_long_completions/std": 205.66055297851562, "rewards/reward_long_sentences/mean": 25.1815128326416, "rewards/reward_long_sentences/std": 21.682249069213867, "rewards/reward_low_threat_score/mean": 0.9998700022697449, "rewards/reward_low_threat_score/std": 4.2984298488590866e-05, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9166666666666666, "completions/max_length": 256.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 246.58334350585938, "completions/mean_terminated_length": 143.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.01510989010989011, "grad_norm": 0.937313973903656, "kl": 0.008032940328121185, "learning_rate": 2.1978021978021977e-05, "loss": 0.0003, "num_tokens": 84719.0, "reward": 5612.9404296875, "reward_std": 1143.014892578125, "rewards/reward_long_completions/mean": 677.3333129882812, "rewards/reward_long_completions/std": 165.62100219726562, "rewards/reward_long_sentences/mean": 18.557411193847656, "rewards/reward_long_sentences/std": 11.858016014099121, "rewards/reward_low_threat_score/mean": 0.9998767971992493, "rewards/reward_low_threat_score/std": 2.4614004360046238e-05, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.016483516483516484, "grad_norm": 1.0877865552902222, "kl": 0.015011820942163467, "learning_rate": 2.4175824175824177e-05, "loss": 0.0006, "num_tokens": 95463.0, "reward": 5302.15673828125, "reward_std": 799.2890625, "rewards/reward_long_completions/mean": 637.4583129882812, "rewards/reward_long_completions/std": 146.63558959960938, "rewards/reward_long_sentences/mean": 23.519601821899414, "rewards/reward_long_sentences/std": 19.27816390991211, "rewards/reward_low_threat_score/mean": 0.999880313873291, "rewards/reward_low_threat_score/std": 1.2732376490021124e-05, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 250.83334350585938, "completions/mean_terminated_length": 132.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.017857142857142856, "grad_norm": 1.140388011932373, "kl": 0.014447374269366264, "learning_rate": 2.6373626373626374e-05, "loss": 0.0006, "num_tokens": 104771.0, "reward": 6174.15380859375, "reward_std": 1238.9588623046875, "rewards/reward_long_completions/mean": 745.75, "rewards/reward_long_completions/std": 218.30418395996094, "rewards/reward_long_sentences/mean": 18.6695613861084, "rewards/reward_long_sentences/std": 7.7078680992126465, "rewards/reward_low_threat_score/mean": 0.9998776912689209, "rewards/reward_low_threat_score/std": 1.4053748600417748e-05, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9166666666666666, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 157.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.019230769230769232, "grad_norm": 1.1372143030166626, "kl": 0.021010398864746094, "learning_rate": 2.857142857142857e-05, "loss": 0.0009, "num_tokens": 113733.0, "reward": 5195.478515625, "reward_std": 1159.0579833984375, "rewards/reward_long_completions/mean": 626.375, "rewards/reward_long_completions/std": 164.3579864501953, "rewards/reward_long_sentences/mean": 18.641340255737305, "rewards/reward_long_sentences/std": 12.443629264831543, "rewards/reward_low_threat_score/mean": 0.9998853206634521, "rewards/reward_low_threat_score/std": 7.113198535080301e-06, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 255.95834350585938, "completions/mean_terminated_length": 255.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.020604395604395604, "grad_norm": 0.8927009701728821, "kl": 0.0163731686770916, "learning_rate": 3.0769230769230774e-05, "loss": 0.0007, "num_tokens": 123444.0, "reward": 5642.7548828125, "reward_std": 992.2362060546875, "rewards/reward_long_completions/mean": 681.5, "rewards/reward_long_completions/std": 182.76522827148438, "rewards/reward_long_sentences/mean": 17.218650817871094, "rewards/reward_long_sentences/std": 7.560858726501465, "rewards/reward_low_threat_score/mean": 0.9998770356178284, "rewards/reward_low_threat_score/std": 1.9538354536052793e-05, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.02197802197802198, "grad_norm": 0.8958719372749329, "kl": 0.026359189301729202, "learning_rate": 3.296703296703297e-05, "loss": 0.0011, "num_tokens": 132828.0, "reward": 7071.66796875, "reward_std": 1391.1435546875, "rewards/reward_long_completions/mean": 852.0, "rewards/reward_long_completions/std": 199.34719848632812, "rewards/reward_long_sentences/mean": 26.847442626953125, "rewards/reward_long_sentences/std": 28.552104949951172, "rewards/reward_low_threat_score/mean": 0.9998016357421875, "rewards/reward_low_threat_score/std": 0.00040717990486882627, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 244.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.023351648351648352, "grad_norm": 0.8223163485527039, "kl": 0.030283691361546516, "learning_rate": 3.516483516483517e-05, "loss": 0.0012, "num_tokens": 141300.0, "reward": 6378.10107421875, "reward_std": 1012.95751953125, "rewards/reward_long_completions/mean": 768.75, "rewards/reward_long_completions/std": 217.05905151367188, "rewards/reward_long_sentences/mean": 23.418798446655273, "rewards/reward_long_sentences/std": 19.512178421020508, "rewards/reward_low_threat_score/mean": 0.999880850315094, "rewards/reward_low_threat_score/std": 5.891323780815583e-06, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.024725274725274724, "grad_norm": 0.8994293212890625, "kl": 0.028242330998182297, "learning_rate": 3.7362637362637365e-05, "loss": 0.0011, "num_tokens": 155208.0, "reward": 5278.0791015625, "reward_std": 1080.840087890625, "rewards/reward_long_completions/mean": 632.875, "rewards/reward_long_completions/std": 163.38600158691406, "rewards/reward_long_sentences/mean": 27.68024253845215, "rewards/reward_long_sentences/std": 25.471553802490234, "rewards/reward_low_threat_score/mean": 0.9998841285705566, "rewards/reward_low_threat_score/std": 5.579048774961848e-06, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 252.2916717529297, "completions/mean_terminated_length": 167.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.0260989010989011, "grad_norm": 1.0123169422149658, "kl": 0.028721116483211517, "learning_rate": 3.956043956043956e-05, "loss": 0.0012, "num_tokens": 165235.0, "reward": 6126.1513671875, "reward_std": 1111.8857421875, "rewards/reward_long_completions/mean": 739.2083129882812, "rewards/reward_long_completions/std": 191.5445098876953, "rewards/reward_long_sentences/mean": 20.403406143188477, "rewards/reward_long_sentences/std": 15.211712837219238, "rewards/reward_low_threat_score/mean": 0.9998844265937805, "rewards/reward_low_threat_score/std": 7.190121777966851e-06, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.027472527472527472, "grad_norm": 0.9043582677841187, "kl": 0.03920399397611618, "learning_rate": 4.1758241758241765e-05, "loss": 0.0016, "num_tokens": 175187.0, "reward": 6234.5576171875, "reward_std": 1430.1063232421875, "rewards/reward_long_completions/mean": 754.5, "rewards/reward_long_completions/std": 187.91441345214844, "rewards/reward_long_sentences/mean": 15.177899360656738, "rewards/reward_long_sentences/std": 8.936412811279297, "rewards/reward_low_threat_score/mean": 0.9998851418495178, "rewards/reward_low_threat_score/std": 6.574356575583806e-06, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.028846153846153848, "grad_norm": 0.8395311236381531, "kl": 0.05012500286102295, "learning_rate": 4.3956043956043955e-05, "loss": 0.002, "num_tokens": 186311.0, "reward": 6728.2734375, "reward_std": 1219.9100341796875, "rewards/reward_long_completions/mean": 815.625, "rewards/reward_long_completions/std": 186.58786010742188, "rewards/reward_long_sentences/mean": 12.908676147460938, "rewards/reward_long_sentences/std": 7.161063194274902, "rewards/reward_low_threat_score/mean": 0.9994990229606628, "rewards/reward_low_threat_score/std": 0.0017419634386897087, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.03021978021978022, "grad_norm": 0.7509822249412537, "kl": 0.033319465816020966, "learning_rate": 4.615384615384616e-05, "loss": 0.0013, "num_tokens": 195563.0, "reward": 7268.1298828125, "reward_std": 1617.4627685546875, "rewards/reward_long_completions/mean": 876.125, "rewards/reward_long_completions/std": 226.96299743652344, "rewards/reward_long_sentences/mean": 26.445180892944336, "rewards/reward_long_sentences/std": 43.756378173828125, "rewards/reward_low_threat_score/mean": 0.9998738765716553, "rewards/reward_low_threat_score/std": 2.530500387365464e-05, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.03159340659340659, "grad_norm": 0.8786510825157166, "kl": 0.04795674979686737, "learning_rate": 4.8351648351648355e-05, "loss": 0.0019, "num_tokens": 205779.0, "reward": 7041.06396484375, "reward_std": 1205.6651611328125, "rewards/reward_long_completions/mean": 852.3333129882812, "rewards/reward_long_completions/std": 161.3539581298828, "rewards/reward_long_sentences/mean": 16.569040298461914, "rewards/reward_long_sentences/std": 14.149473190307617, "rewards/reward_low_threat_score/mean": 0.9998874068260193, "rewards/reward_low_threat_score/std": 6.278262844716664e-06, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 252.33334350585938, "completions/mean_terminated_length": 168.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03296703296703297, "grad_norm": 0.7992393970489502, "kl": 0.05039969086647034, "learning_rate": 5.054945054945055e-05, "loss": 0.002, "num_tokens": 215471.0, "reward": 6605.67138671875, "reward_std": 1132.4881591796875, "rewards/reward_long_completions/mean": 798.4583129882812, "rewards/reward_long_completions/std": 176.78184509277344, "rewards/reward_long_sentences/mean": 18.496292114257812, "rewards/reward_long_sentences/std": 20.724517822265625, "rewards/reward_low_threat_score/mean": 0.9998507499694824, "rewards/reward_low_threat_score/std": 0.00010103530075866729, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.034340659340659344, "grad_norm": 0.9030084609985352, "kl": 0.05149734765291214, "learning_rate": 5.274725274725275e-05, "loss": 0.0021, "num_tokens": 224763.0, "reward": 7555.9326171875, "reward_std": 1232.7958984375, "rewards/reward_long_completions/mean": 915.7083129882812, "rewards/reward_long_completions/std": 258.4144592285156, "rewards/reward_long_sentences/mean": 15.1347017288208, "rewards/reward_long_sentences/std": 8.416716575622559, "rewards/reward_low_threat_score/mean": 0.9998717308044434, "rewards/reward_low_threat_score/std": 3.16317418764811e-05, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.03571428571428571, "grad_norm": 0.7929250001907349, "kl": 0.07963591814041138, "learning_rate": 5.494505494505495e-05, "loss": 0.0032, "num_tokens": 233795.0, "reward": 8173.826171875, "reward_std": 700.1514892578125, "rewards/reward_long_completions/mean": 992.75, "rewards/reward_long_completions/std": 93.83068084716797, "rewards/reward_long_sentences/mean": 10.92309284210205, "rewards/reward_long_sentences/std": 3.985023260116577, "rewards/reward_low_threat_score/mean": 0.9998623728752136, "rewards/reward_low_threat_score/std": 9.916435374179855e-05, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.03708791208791209, "grad_norm": 0.841202974319458, "kl": 0.10049758851528168, "learning_rate": 5.714285714285714e-05, "loss": 0.004, "num_tokens": 243523.0, "reward": 8269.04296875, "reward_std": 759.47119140625, "rewards/reward_long_completions/mean": 1003.4583129882812, "rewards/reward_long_completions/std": 90.5192642211914, "rewards/reward_long_sentences/mean": 13.215075492858887, "rewards/reward_long_sentences/std": 6.251865863800049, "rewards/reward_low_threat_score/mean": 0.9998839497566223, "rewards/reward_low_threat_score/std": 5.777440946985735e-06, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.038461538461538464, "grad_norm": 0.9120985269546509, "kl": 0.12286963313817978, "learning_rate": 5.9340659340659345e-05, "loss": 0.0049, "num_tokens": 253983.0, "reward": 8546.8544921875, "reward_std": 773.6561279296875, "rewards/reward_long_completions/mean": 1038.0833740234375, "rewards/reward_long_completions/std": 102.9296646118164, "rewards/reward_long_sentences/mean": 11.356366157531738, "rewards/reward_long_sentences/std": 3.4283668994903564, "rewards/reward_low_threat_score/mean": 0.999876081943512, "rewards/reward_low_threat_score/std": 2.30454152188031e-05, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.03983516483516483, "grad_norm": 0.8597483038902283, "kl": 0.13207492232322693, "learning_rate": 6.153846153846155e-05, "loss": 0.0053, "num_tokens": 266295.0, "reward": 8614.9033203125, "reward_std": 598.1918334960938, "rewards/reward_long_completions/mean": 1046.25, "rewards/reward_long_completions/std": 80.77033996582031, "rewards/reward_long_sentences/mean": 11.69629192352295, "rewards/reward_long_sentences/std": 4.650681018829346, "rewards/reward_low_threat_score/mean": 0.9998829960823059, "rewards/reward_low_threat_score/std": 9.057535862666555e-06, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.04120879120879121, "grad_norm": 0.8166387677192688, "kl": 0.15667298436164856, "learning_rate": 6.373626373626373e-05, "loss": 0.0063, "num_tokens": 276815.0, "reward": 9155.04296875, "reward_std": 654.998046875, "rewards/reward_long_completions/mean": 1112.0833740234375, "rewards/reward_long_completions/std": 95.15658569335938, "rewards/reward_long_sentences/mean": 11.84032154083252, "rewards/reward_long_sentences/std": 4.758576393127441, "rewards/reward_low_threat_score/mean": 0.9998804926872253, "rewards/reward_low_threat_score/std": 6.726719220750965e-06, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.042582417582417584, "grad_norm": 0.8241127729415894, "kl": 0.25302720069885254, "learning_rate": 6.593406593406594e-05, "loss": 0.0101, "num_tokens": 284419.0, "reward": 10300.69921875, "reward_std": 693.1300659179688, "rewards/reward_long_completions/mean": 1253.125, "rewards/reward_long_completions/std": 101.23382568359375, "rewards/reward_long_sentences/mean": 8.589949607849121, "rewards/reward_long_sentences/std": 1.0609889030456543, "rewards/reward_low_threat_score/mean": 0.9998804926872253, "rewards/reward_low_threat_score/std": 4.849046035815263e-06, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.04395604395604396, "grad_norm": 0.8461864590644836, "kl": 0.25924262404441833, "learning_rate": 6.813186813186814e-05, "loss": 0.0104, "num_tokens": 294447.0, "reward": 11193.322265625, "reward_std": 473.551025390625, "rewards/reward_long_completions/mean": 1361.7916259765625, "rewards/reward_long_completions/std": 61.83145523071289, "rewards/reward_long_sentences/mean": 9.15124797821045, "rewards/reward_long_sentences/std": 1.2760999202728271, "rewards/reward_low_threat_score/mean": 0.9998789429664612, "rewards/reward_low_threat_score/std": 4.0835284380591474e-06, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.04532967032967033, "grad_norm": 0.6956198811531067, "kl": 0.22335819900035858, "learning_rate": 7.032967032967034e-05, "loss": 0.0089, "num_tokens": 306751.0, "reward": 11227.89453125, "reward_std": 597.6326904296875, "rewards/reward_long_completions/mean": 1365.4166259765625, "rewards/reward_long_completions/std": 278.5418395996094, "rewards/reward_long_sentences/mean": 10.648924827575684, "rewards/reward_long_sentences/std": 3.7118663787841797, "rewards/reward_low_threat_score/mean": 0.999877393245697, "rewards/reward_low_threat_score/std": 5.80041523789987e-06, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.046703296703296704, "grad_norm": 0.8699533343315125, "kl": 0.2814876139163971, "learning_rate": 7.252747252747253e-05, "loss": 0.0113, "num_tokens": 314991.0, "reward": 12857.712890625, "reward_std": 1262.0302734375, "rewards/reward_long_completions/mean": 1561.625, "rewards/reward_long_completions/std": 318.7488708496094, "rewards/reward_long_sentences/mean": 17.243385314941406, "rewards/reward_long_sentences/std": 7.880738258361816, "rewards/reward_low_threat_score/mean": 0.9998712539672852, "rewards/reward_low_threat_score/std": 8.144679668475874e-06, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.04807692307692308, "grad_norm": 1.0916948318481445, "kl": 0.3633003234863281, "learning_rate": 7.472527472527473e-05, "loss": 0.0145, "num_tokens": 325623.0, "reward": 15610.4365234375, "reward_std": 808.4728393554688, "rewards/reward_long_completions/mean": 1850.375, "rewards/reward_long_completions/std": 323.7195129394531, "rewards/reward_long_sentences/mean": 136.15602111816406, "rewards/reward_long_sentences/std": 97.08489227294922, "rewards/reward_low_threat_score/mean": 0.9998602271080017, "rewards/reward_low_threat_score/std": 8.191022061510012e-06, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.04945054945054945, "grad_norm": 0.8700809478759766, "kl": 0.22090792655944824, "learning_rate": 7.692307692307693e-05, "loss": 0.0088, "num_tokens": 333859.0, "reward": 17788.892578125, "reward_std": 1516.815185546875, "rewards/reward_long_completions/mean": 2102.5, "rewards/reward_long_completions/std": 494.153564453125, "rewards/reward_long_sentences/mean": 170.57899475097656, "rewards/reward_long_sentences/std": 98.35679626464844, "rewards/reward_low_threat_score/mean": 0.9998334050178528, "rewards/reward_low_threat_score/std": 3.0829283787170425e-05, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.050824175824175824, "grad_norm": 0.8723951578140259, "kl": 0.13462691009044647, "learning_rate": 7.912087912087912e-05, "loss": 0.0054, "num_tokens": 341627.0, "reward": 17291.0390625, "reward_std": 2585.4501953125, "rewards/reward_long_completions/mean": 2045.25, "rewards/reward_long_completions/std": 656.9696044921875, "rewards/reward_long_sentences/mean": 161.77828979492188, "rewards/reward_long_sentences/std": 108.53572082519531, "rewards/reward_low_threat_score/mean": 0.9998288154602051, "rewards/reward_low_threat_score/std": 4.050599454785697e-05, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.0521978021978022, "grad_norm": 0.34895259141921997, "kl": 0.1503923535346985, "learning_rate": 8.131868131868132e-05, "loss": 0.006, "num_tokens": 352419.0, "reward": 20649.30859375, "reward_std": 1089.694580078125, "rewards/reward_long_completions/mean": 2425.833251953125, "rewards/reward_long_completions/std": 296.8428955078125, "rewards/reward_long_sentences/mean": 235.286865234375, "rewards/reward_long_sentences/std": 56.643035888671875, "rewards/reward_low_threat_score/mean": 0.9997933506965637, "rewards/reward_low_threat_score/std": 2.586980008345563e-05, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.05357142857142857, "grad_norm": 0.3808337152004242, "kl": 0.13797467947006226, "learning_rate": 8.351648351648353e-05, "loss": 0.0055, "num_tokens": 365299.0, "reward": 20871.7109375, "reward_std": 1100.9354248046875, "rewards/reward_long_completions/mean": 2453.833251953125, "rewards/reward_long_completions/std": 299.0020446777344, "rewards/reward_long_sentences/mean": 233.088134765625, "rewards/reward_long_sentences/std": 60.38532257080078, "rewards/reward_low_threat_score/mean": 0.9997913241386414, "rewards/reward_low_threat_score/std": 3.7562032957794145e-05, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.054945054945054944, "grad_norm": 0.4941976070404053, "kl": 0.12012534588575363, "learning_rate": 8.571428571428571e-05, "loss": 0.0048, "num_tokens": 375403.0, "reward": 19191.2734375, "reward_std": 338.137451171875, "rewards/reward_long_completions/mean": 2258.75, "rewards/reward_long_completions/std": 618.3214721679688, "rewards/reward_long_sentences/mean": 208.0441131591797, "rewards/reward_long_sentences/std": 95.69821166992188, "rewards/reward_low_threat_score/mean": 0.9998027682304382, "rewards/reward_low_threat_score/std": 3.610231215134263e-05, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.05631868131868132, "grad_norm": 1.27899169921875, "kl": 0.3085135817527771, "learning_rate": 8.791208791208791e-05, "loss": 0.0123, "num_tokens": 384007.0, "reward": 19351.35546875, "reward_std": 709.7264404296875, "rewards/reward_long_completions/mean": 2280.958251953125, "rewards/reward_long_completions/std": 523.7840576171875, "rewards/reward_long_sentences/mean": 201.2694549560547, "rewards/reward_long_sentences/std": 94.26702880859375, "rewards/reward_low_threat_score/mean": 0.9998120665550232, "rewards/reward_low_threat_score/std": 3.6868321330985054e-05, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.057692307692307696, "grad_norm": 0.4398014545440674, "kl": 0.15548112988471985, "learning_rate": 9.010989010989012e-05, "loss": 0.0062, "num_tokens": 392447.0, "reward": 18963.693359375, "reward_std": 2708.23388671875, "rewards/reward_long_completions/mean": 2240.416748046875, "rewards/reward_long_completions/std": 555.9338989257812, "rewards/reward_long_sentences/mean": 184.2130889892578, "rewards/reward_long_sentences/std": 105.39789581298828, "rewards/reward_low_threat_score/mean": 0.999798059463501, "rewards/reward_low_threat_score/std": 4.455574890016578e-05, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.059065934065934064, "grad_norm": 0.47885459661483765, "kl": 0.27550753951072693, "learning_rate": 9.230769230769232e-05, "loss": 0.011, "num_tokens": 403539.0, "reward": 16350.8359375, "reward_std": 1393.378173828125, "rewards/reward_long_completions/mean": 1968.625, "rewards/reward_long_completions/std": 254.01809692382812, "rewards/reward_long_sentences/mean": 65.56205749511719, "rewards/reward_long_sentences/std": 80.78085327148438, "rewards/reward_low_threat_score/mean": 0.999833881855011, "rewards/reward_low_threat_score/std": 5.9725258324760944e-05, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.06043956043956044, "grad_norm": 0.773730993270874, "kl": 0.2250758409500122, "learning_rate": 9.450549450549451e-05, "loss": 0.009, "num_tokens": 413835.0, "reward": 14526.990234375, "reward_std": 1937.2313232421875, "rewards/reward_long_completions/mean": 1756.75, "rewards/reward_long_completions/std": 509.076416015625, "rewards/reward_long_sentences/mean": 38.74198913574219, "rewards/reward_long_sentences/std": 68.1504135131836, "rewards/reward_low_threat_score/mean": 0.9998491406440735, "rewards/reward_low_threat_score/std": 4.993580296286382e-05, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.061813186813186816, "grad_norm": 0.6252362728118896, "kl": 0.28614798188209534, "learning_rate": 9.670329670329671e-05, "loss": 0.0114, "num_tokens": 424567.0, "reward": 15535.021484375, "reward_std": 1589.260498046875, "rewards/reward_long_completions/mean": 1882.4166259765625, "rewards/reward_long_completions/std": 358.2585754394531, "rewards/reward_long_sentences/mean": 31.9193115234375, "rewards/reward_long_sentences/std": 44.83122253417969, "rewards/reward_low_threat_score/mean": 0.999854326248169, "rewards/reward_low_threat_score/std": 1.7640164514887147e-05, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.06318681318681318, "grad_norm": 0.6698728203773499, "kl": 0.33742478489875793, "learning_rate": 9.89010989010989e-05, "loss": 0.0135, "num_tokens": 435403.0, "reward": 15854.0234375, "reward_std": 1866.2747802734375, "rewards/reward_long_completions/mean": 1917.1666259765625, "rewards/reward_long_completions/std": 445.2794494628906, "rewards/reward_long_sentences/mean": 42.44483947753906, "rewards/reward_long_sentences/std": 55.12824249267578, "rewards/reward_low_threat_score/mean": 0.9998542666435242, "rewards/reward_low_threat_score/std": 2.866951035684906e-05, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.06456043956043957, "grad_norm": 0.7649576663970947, "kl": 0.4234192371368408, "learning_rate": 0.0001010989010989011, "loss": 0.0169, "num_tokens": 445939.0, "reward": 19167.140625, "reward_std": 2883.21240234375, "rewards/reward_long_completions/mean": 2304.041748046875, "rewards/reward_long_completions/std": 439.25, "rewards/reward_long_sentences/mean": 86.12908935546875, "rewards/reward_long_sentences/std": 75.87317657470703, "rewards/reward_low_threat_score/mean": 0.9998188018798828, "rewards/reward_low_threat_score/std": 4.996474308427423e-05, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.06593406593406594, "grad_norm": 0.8650764226913452, "kl": 0.40963709354400635, "learning_rate": 0.00010329670329670331, "loss": 0.0164, "num_tokens": 457827.0, "reward": 18710.0078125, "reward_std": 2807.151123046875, "rewards/reward_long_completions/mean": 2253.291748046875, "rewards/reward_long_completions/std": 362.4237976074219, "rewards/reward_long_sentences/mean": 73.45549774169922, "rewards/reward_long_sentences/std": 71.34930419921875, "rewards/reward_low_threat_score/mean": 0.9997755885124207, "rewards/reward_low_threat_score/std": 0.00016920518828555942, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.0673076923076923, "grad_norm": 0.831486701965332, "kl": 0.3928694725036621, "learning_rate": 0.0001054945054945055, "loss": 0.0157, "num_tokens": 467363.0, "reward": 16381.046875, "reward_std": 1449.155517578125, "rewards/reward_long_completions/mean": 1977.625, "rewards/reward_long_completions/std": 406.36798095703125, "rewards/reward_long_sentences/mean": 52.130035400390625, "rewards/reward_long_sentences/std": 52.68746566772461, "rewards/reward_low_threat_score/mean": 0.999847412109375, "rewards/reward_low_threat_score/std": 4.002095738542266e-05, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9583333333333334, "completions/max_length": 256.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 253.58334350585938, "completions/mean_terminated_length": 198.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06868131868131869, "grad_norm": 0.8107019066810608, "kl": 0.23879021406173706, "learning_rate": 0.0001076923076923077, "loss": 0.0095, "num_tokens": 478377.0, "reward": 14349.9013671875, "reward_std": 1111.90234375, "rewards/reward_long_completions/mean": 1717.8333740234375, "rewards/reward_long_completions/std": 669.89306640625, "rewards/reward_long_sentences/mean": 82.50303649902344, "rewards/reward_long_sentences/std": 89.53267669677734, "rewards/reward_low_threat_score/mean": 0.999833881855011, "rewards/reward_low_threat_score/std": 5.463038178277202e-05, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07005494505494506, "grad_norm": 0.6449283361434937, "kl": 0.3063722252845764, "learning_rate": 0.0001098901098901099, "loss": 0.0123, "num_tokens": 486701.0, "reward": 18832.75390625, "reward_std": 2169.609130859375, "rewards/reward_long_completions/mean": 2223.5, "rewards/reward_long_completions/std": 264.740234375, "rewards/reward_long_sentences/mean": 186.59791564941406, "rewards/reward_long_sentences/std": 69.86194610595703, "rewards/reward_low_threat_score/mean": 0.9997989535331726, "rewards/reward_low_threat_score/std": 4.358592559583485e-05, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07142857142857142, "grad_norm": 0.7455602884292603, "kl": 0.23345538973808289, "learning_rate": 0.0001120879120879121, "loss": 0.0093, "num_tokens": 499189.0, "reward": 16127.5888671875, "reward_std": 1167.63330078125, "rewards/reward_long_completions/mean": 1905.4166259765625, "rewards/reward_long_completions/std": 604.5938110351562, "rewards/reward_long_sentences/mean": 156.488037109375, "rewards/reward_long_sentences/std": 107.40445709228516, "rewards/reward_low_threat_score/mean": 0.999833345413208, "rewards/reward_low_threat_score/std": 4.063014785060659e-05, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07280219780219781, "grad_norm": 0.6644715666770935, "kl": 0.19592507183551788, "learning_rate": 0.00011428571428571428, "loss": 0.0078, "num_tokens": 508077.0, "reward": 19132.115234375, "reward_std": 797.6666870117188, "rewards/reward_long_completions/mean": 2249.208251953125, "rewards/reward_long_completions/std": 460.3047790527344, "rewards/reward_long_sentences/mean": 213.919921875, "rewards/reward_long_sentences/std": 90.62826538085938, "rewards/reward_low_threat_score/mean": 0.9997726082801819, "rewards/reward_low_threat_score/std": 8.210101077565923e-05, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07417582417582418, "grad_norm": 0.44350892305374146, "kl": 0.19577279686927795, "learning_rate": 0.0001164835164835165, "loss": 0.0078, "num_tokens": 516421.0, "reward": 21666.435546875, "reward_std": 327.875244140625, "rewards/reward_long_completions/mean": 2544.583251953125, "rewards/reward_long_completions/std": 42.50209045410156, "rewards/reward_long_sentences/mean": 248.75, "rewards/reward_long_sentences/std": 26.135517120361328, "rewards/reward_low_threat_score/mean": 0.9997321963310242, "rewards/reward_low_threat_score/std": 4.408807944855653e-05, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07554945054945054, "grad_norm": 0.5608872175216675, "kl": 0.22780603170394897, "learning_rate": 0.00011868131868131869, "loss": 0.0091, "num_tokens": 526205.0, "reward": 20366.6171875, "reward_std": 548.2073974609375, "rewards/reward_long_completions/mean": 2398.708251953125, "rewards/reward_long_completions/std": 601.463134765625, "rewards/reward_long_sentences/mean": 216.6853485107422, "rewards/reward_long_sentences/std": 87.8457260131836, "rewards/reward_low_threat_score/mean": 0.9996650815010071, "rewards/reward_low_threat_score/std": 0.00012151811097282916, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07692307692307693, "grad_norm": 0.5219376683235168, "kl": 0.21699753403663635, "learning_rate": 0.00012087912087912087, "loss": 0.0087, "num_tokens": 536393.0, "reward": 19031.8125, "reward_std": 318.31365966796875, "rewards/reward_long_completions/mean": 2251.958251953125, "rewards/reward_long_completions/std": 780.4905395507812, "rewards/reward_long_sentences/mean": 176.0446319580078, "rewards/reward_long_sentences/std": 114.97904968261719, "rewards/reward_low_threat_score/mean": 0.9996920228004456, "rewards/reward_low_threat_score/std": 0.00014512118650600314, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.0782967032967033, "grad_norm": 0.3388201594352722, "kl": 0.1780703067779541, "learning_rate": 0.0001230769230769231, "loss": 0.0071, "num_tokens": 546169.0, "reward": 21702.044921875, "reward_std": 170.92724609375, "rewards/reward_long_completions/mean": 2561.791748046875, "rewards/reward_long_completions/std": 564.7734375, "rewards/reward_long_sentences/mean": 216.23541259765625, "rewards/reward_long_sentences/std": 90.71696472167969, "rewards/reward_low_threat_score/mean": 0.9996034502983093, "rewards/reward_low_threat_score/std": 0.00013678042159881443, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.07967032967032966, "grad_norm": 0.5109489560127258, "kl": 0.13640940189361572, "learning_rate": 0.00012527472527472527, "loss": 0.0054, "num_tokens": 557357.0, "reward": 19050.24609375, "reward_std": 283.5286560058594, "rewards/reward_long_completions/mean": 2253.041748046875, "rewards/reward_long_completions/std": 831.3817749023438, "rewards/reward_long_sentences/mean": 178.98948669433594, "rewards/reward_long_sentences/std": 112.23505401611328, "rewards/reward_low_threat_score/mean": 0.9996156096458435, "rewards/reward_low_threat_score/std": 0.00019591661111917347, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08104395604395605, "grad_norm": 0.22309787571430206, "kl": 0.1447746902704239, "learning_rate": 0.00012747252747252746, "loss": 0.0057, "num_tokens": 568201.0, "reward": 23890.99609375, "reward_std": 9.975363731384277, "rewards/reward_long_completions/mean": 2813.083251953125, "rewards/reward_long_completions/std": 1.3805060386657715, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9994553923606873, "rewards/reward_low_threat_score/std": 6.358301470754668e-05, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08241758241758242, "grad_norm": 0.2901628911495209, "kl": 0.1423759013414383, "learning_rate": 0.0001296703296703297, "loss": 0.0057, "num_tokens": 578877.0, "reward": 22117.654296875, "reward_std": 380.1587829589844, "rewards/reward_long_completions/mean": 2612.125, "rewards/reward_long_completions/std": 473.9473571777344, "rewards/reward_long_sentences/mean": 217.15972900390625, "rewards/reward_long_sentences/std": 88.7616195678711, "rewards/reward_low_threat_score/mean": 0.9994549751281738, "rewards/reward_low_threat_score/std": 0.00018892224761657417, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08379120879120878, "grad_norm": 0.2484268844127655, "kl": 0.14895033836364746, "learning_rate": 0.00013186813186813188, "loss": 0.006, "num_tokens": 587937.0, "reward": 23063.18359375, "reward_std": 685.9218139648438, "rewards/reward_long_completions/mean": 2727.0, "rewards/reward_long_completions/std": 168.0251007080078, "rewards/reward_long_sentences/mean": 218.3423614501953, "rewards/reward_long_sentences/std": 75.83684539794922, "rewards/reward_low_threat_score/mean": 0.9995088577270508, "rewards/reward_low_threat_score/std": 0.00020899114315398037, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08516483516483517, "grad_norm": 0.2974075376987457, "kl": 0.14511168003082275, "learning_rate": 0.00013406593406593405, "loss": 0.0058, "num_tokens": 598857.0, "reward": 23239.34375, "reward_std": 502.13665771484375, "rewards/reward_long_completions/mean": 2747.833251953125, "rewards/reward_long_completions/std": 174.1527557373047, "rewards/reward_long_sentences/mean": 220.0, "rewards/reward_long_sentences/std": 72.82540130615234, "rewards/reward_low_threat_score/mean": 0.9994818568229675, "rewards/reward_low_threat_score/std": 0.0001962967071449384, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08653846153846154, "grad_norm": 0.11439555883407593, "kl": 0.10759229958057404, "learning_rate": 0.00013626373626373628, "loss": 0.0043, "num_tokens": 609033.0, "reward": 23856.623046875, "reward_std": 58.43872833251953, "rewards/reward_long_completions/mean": 2811.041748046875, "rewards/reward_long_completions/std": 14.369288444519043, "rewards/reward_long_sentences/mean": 250.5625, "rewards/reward_long_sentences/std": 26.215375900268555, "rewards/reward_low_threat_score/mean": 0.9993965029716492, "rewards/reward_low_threat_score/std": 9.20126840355806e-05, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08791208791208792, "grad_norm": 1.595268031451269e-06, "kl": 0.10779287666082382, "learning_rate": 0.00013846153846153847, "loss": 0.0043, "num_tokens": 617333.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.08928571428571429, "grad_norm": 2.496514753147494e-05, "kl": 0.10413557291030884, "learning_rate": 0.00014065934065934067, "loss": 0.0042, "num_tokens": 628833.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09065934065934066, "grad_norm": 5.7596174883656204e-05, "kl": 0.09574110805988312, "learning_rate": 0.00014285714285714287, "loss": 0.0038, "num_tokens": 639337.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09203296703296704, "grad_norm": 5.264363949208928e-07, "kl": 0.09972178936004639, "learning_rate": 0.00014505494505494506, "loss": 0.004, "num_tokens": 648913.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09340659340659341, "grad_norm": 6.671574510619394e-07, "kl": 0.10196229815483093, "learning_rate": 0.00014725274725274726, "loss": 0.0041, "num_tokens": 659385.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09478021978021978, "grad_norm": 7.42224344207898e-08, "kl": 0.10761024802923203, "learning_rate": 0.00014945054945054946, "loss": 0.0043, "num_tokens": 668877.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09615384615384616, "grad_norm": 5.740647956997691e-10, "kl": 0.1049078106880188, "learning_rate": 0.00015164835164835165, "loss": 0.0042, "num_tokens": 677481.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.09752747252747253, "grad_norm": 8.155037534152143e-08, "kl": 0.10476384311914444, "learning_rate": 0.00015384615384615385, "loss": 0.0042, "num_tokens": 687513.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.0989010989010989, "grad_norm": 4.595435143528448e-09, "kl": 0.09646934270858765, "learning_rate": 0.00015604395604395605, "loss": 0.0039, "num_tokens": 699625.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10027472527472528, "grad_norm": 5.3945310024561266e-11, "kl": 0.09836632013320923, "learning_rate": 0.00015824175824175824, "loss": 0.0039, "num_tokens": 707485.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10164835164835165, "grad_norm": 0.0008082823478616774, "kl": 0.1048198938369751, "learning_rate": 0.00016043956043956044, "loss": 0.0042, "num_tokens": 715801.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10302197802197802, "grad_norm": 2.4316031499438395e-07, "kl": 0.1103041023015976, "learning_rate": 0.00016263736263736264, "loss": 0.0044, "num_tokens": 724897.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1043956043956044, "grad_norm": 1.856940823330433e-09, "kl": 0.10049188882112503, "learning_rate": 0.00016483516483516484, "loss": 0.004, "num_tokens": 733905.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10576923076923077, "grad_norm": 4.225860106288337e-09, "kl": 0.11534600704908371, "learning_rate": 0.00016703296703296706, "loss": 0.0046, "num_tokens": 743677.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10714285714285714, "grad_norm": 0.0001399174507241696, "kl": 0.09628599882125854, "learning_rate": 0.00016923076923076923, "loss": 0.0039, "num_tokens": 752877.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10851648351648352, "grad_norm": 2.0390640564915685e-11, "kl": 0.10977591574192047, "learning_rate": 0.00017142857142857143, "loss": 0.0044, "num_tokens": 762957.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.10989010989010989, "grad_norm": 8.782960470021806e-12, "kl": 0.11187249422073364, "learning_rate": 0.00017362637362637365, "loss": 0.0045, "num_tokens": 770965.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11126373626373626, "grad_norm": 1.590226145209428e-10, "kl": 0.0960511863231659, "learning_rate": 0.00017582417582417582, "loss": 0.0038, "num_tokens": 781297.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11263736263736264, "grad_norm": 1.4153247150650028e-11, "kl": 0.10151113569736481, "learning_rate": 0.00017802197802197802, "loss": 0.0041, "num_tokens": 791609.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11401098901098901, "grad_norm": 7.078324471693209e-12, "kl": 0.09849902987480164, "learning_rate": 0.00018021978021978024, "loss": 0.0039, "num_tokens": 803421.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11538461538461539, "grad_norm": 1.133346194670537e-10, "kl": 0.09681393206119537, "learning_rate": 0.0001824175824175824, "loss": 0.0039, "num_tokens": 811969.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11675824175824176, "grad_norm": 6.051986134458787e-10, "kl": 0.10433407127857208, "learning_rate": 0.00018461538461538463, "loss": 0.0042, "num_tokens": 822717.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11813186813186813, "grad_norm": 2.2543650857187458e-07, "kl": 0.10451744496822357, "learning_rate": 0.00018681318681318683, "loss": 0.0042, "num_tokens": 833457.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.11950549450549451, "grad_norm": 5.0021293530877475e-12, "kl": 0.10208427906036377, "learning_rate": 0.00018901098901098903, "loss": 0.0041, "num_tokens": 843425.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12087912087912088, "grad_norm": 3.4788530563212294e-12, "kl": 0.10412843525409698, "learning_rate": 0.00019120879120879122, "loss": 0.0042, "num_tokens": 850717.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12225274725274725, "grad_norm": 7.881081204308416e-11, "kl": 0.09822498261928558, "learning_rate": 0.00019340659340659342, "loss": 0.0039, "num_tokens": 860901.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12362637362637363, "grad_norm": 4.362278333697178e-12, "kl": 0.10267078876495361, "learning_rate": 0.00019560439560439562, "loss": 0.0041, "num_tokens": 871309.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.125, "grad_norm": 4.334324565924419e-12, "kl": 0.1009175255894661, "learning_rate": 0.0001978021978021978, "loss": 0.004, "num_tokens": 883297.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12637362637362637, "grad_norm": 1.7429892251730195e-10, "kl": 0.09698716551065445, "learning_rate": 0.0002, "loss": 0.0039, "num_tokens": 892785.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12774725274725274, "grad_norm": 2.110382832029578e-12, "kl": 0.0992315486073494, "learning_rate": 0.0001999933787549241, "loss": 0.004, "num_tokens": 902409.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.12912087912087913, "grad_norm": 5.5665363811463475e-12, "kl": 0.100763700902462, "learning_rate": 0.00019997351589651408, "loss": 0.004, "num_tokens": 913749.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1304945054945055, "grad_norm": 2.6420497734047643e-12, "kl": 0.10744968056678772, "learning_rate": 0.00019994041405510705, "loss": 0.0043, "num_tokens": 924009.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13186813186813187, "grad_norm": 9.27724008725761e-11, "kl": 0.0951426774263382, "learning_rate": 0.00019989407761421109, "loss": 0.0038, "num_tokens": 937209.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13324175824175824, "grad_norm": 5.427550631154077e-10, "kl": 0.1034160777926445, "learning_rate": 0.00019983451270992478, "loss": 0.0041, "num_tokens": 947309.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1346153846153846, "grad_norm": 1.72243144226647e-12, "kl": 0.10008010268211365, "learning_rate": 0.0001997617272301248, "loss": 0.004, "num_tokens": 956829.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13598901098901098, "grad_norm": 6.00792887661683e-11, "kl": 0.09552207589149475, "learning_rate": 0.00019967573081342103, "loss": 0.0038, "num_tokens": 966141.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13736263736263737, "grad_norm": 1.6059244862740019e-12, "kl": 0.10627005994319916, "learning_rate": 0.00019957653484788053, "loss": 0.0043, "num_tokens": 977833.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13873626373626374, "grad_norm": 8.566219955596921e-11, "kl": 0.0998491495847702, "learning_rate": 0.0001994641524695193, "loss": 0.004, "num_tokens": 986577.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1401098901098901, "grad_norm": 7.384452771486139e-11, "kl": 0.1055595800280571, "learning_rate": 0.00019933859856056265, "loss": 0.0042, "num_tokens": 995141.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14148351648351648, "grad_norm": 4.835925410118236e-11, "kl": 0.10763506591320038, "learning_rate": 0.00019919988974747473, "loss": 0.0043, "num_tokens": 1003805.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14285714285714285, "grad_norm": 3.129229375620213e-10, "kl": 0.10003698617219925, "learning_rate": 0.00019904804439875633, "loss": 0.004, "num_tokens": 1014753.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14423076923076922, "grad_norm": 1.1115260795113713e-10, "kl": 0.10658489167690277, "learning_rate": 0.00019888308262251285, "loss": 0.0043, "num_tokens": 1022949.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14560439560439561, "grad_norm": 1.2503302559974117e-10, "kl": 0.09754768759012222, "learning_rate": 0.00019870502626379127, "loss": 0.0039, "num_tokens": 1032169.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14697802197802198, "grad_norm": 3.395952616336295e-11, "kl": 0.09804193675518036, "learning_rate": 0.0001985138989016874, "loss": 0.0039, "num_tokens": 1042961.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14835164835164835, "grad_norm": 1.6939252734868293e-12, "kl": 0.09172150492668152, "learning_rate": 0.00019830972584622324, "loss": 0.0037, "num_tokens": 1054637.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.14972527472527472, "grad_norm": 1.244372660469395e-12, "kl": 0.10438120365142822, "learning_rate": 0.00019809253413499565, "loss": 0.0042, "num_tokens": 1062693.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1510989010989011, "grad_norm": 8.285517765216355e-07, "kl": 0.09450401365756989, "learning_rate": 0.00019786235252959553, "loss": 0.0038, "num_tokens": 1071993.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.15247252747252749, "grad_norm": 1.9150474608875534e-11, "kl": 0.09982839226722717, "learning_rate": 0.00019761921151179937, "loss": 0.004, "num_tokens": 1080241.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.15384615384615385, "grad_norm": 4.0674307671961785e-11, "kl": 0.10261037945747375, "learning_rate": 0.00019736314327953243, "loss": 0.0041, "num_tokens": 1090677.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.15521978021978022, "grad_norm": 1.0439300118791905e-10, "kl": 0.10477376729249954, "learning_rate": 0.0001970941817426052, "loss": 0.0042, "num_tokens": 1100505.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1565934065934066, "grad_norm": 1.3879349058942836e-11, "kl": 0.1169905737042427, "learning_rate": 0.00019681236251822273, "loss": 0.0047, "num_tokens": 1109401.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.15796703296703296, "grad_norm": 7.332617678912356e-05, "kl": 0.11021736264228821, "learning_rate": 0.00019651772292626803, "loss": 0.0044, "num_tokens": 1117093.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.15934065934065933, "grad_norm": 8.01576334197307e-06, "kl": 0.10156182944774628, "learning_rate": 0.00019621030198436006, "loss": 0.0041, "num_tokens": 1125901.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.16071428571428573, "grad_norm": 1.877682315187812e-10, "kl": 0.09663717448711395, "learning_rate": 0.00019589014040268676, "loss": 0.0039, "num_tokens": 1134949.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1620879120879121, "grad_norm": 4.791067889642342e-11, "kl": 0.09934892505407333, "learning_rate": 0.0001955572805786141, "loss": 0.004, "num_tokens": 1145785.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.16346153846153846, "grad_norm": 9.415924984157442e-11, "kl": 0.09586071223020554, "learning_rate": 0.00019521176659107142, "loss": 0.0038, "num_tokens": 1155929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.16483516483516483, "grad_norm": 1.3628459472059262e-10, "kl": 0.09471525251865387, "learning_rate": 0.00019485364419471454, "loss": 0.0038, "num_tokens": 1165641.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1662087912087912, "grad_norm": 7.0211218783367e-11, "kl": 0.09812217950820923, "learning_rate": 0.00019448296081386656, "loss": 0.0039, "num_tokens": 1175369.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.16758241758241757, "grad_norm": 1.5089866205533764e-12, "kl": 0.10879385471343994, "learning_rate": 0.00019409976553623766, "loss": 0.0044, "num_tokens": 1188041.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.16895604395604397, "grad_norm": 9.072555532362025e-13, "kl": 0.10184387862682343, "learning_rate": 0.00019370410910642471, "loss": 0.0041, "num_tokens": 1196985.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17032967032967034, "grad_norm": 1.5136632182041754e-12, "kl": 0.10410472750663757, "learning_rate": 0.0001932960439191915, "loss": 0.0042, "num_tokens": 1207173.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1717032967032967, "grad_norm": 4.7814877057739125e-11, "kl": 0.09943380951881409, "learning_rate": 0.00019287562401253022, "loss": 0.004, "num_tokens": 1215913.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17307692307692307, "grad_norm": 1.4522151927168214e-12, "kl": 0.09782374650239944, "learning_rate": 0.00019244290506050568, "loss": 0.0039, "num_tokens": 1227929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17445054945054944, "grad_norm": 1.167991376359312e-12, "kl": 0.09713561832904816, "learning_rate": 0.00019199794436588243, "loss": 0.0039, "num_tokens": 1238165.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17582417582417584, "grad_norm": 1.6131876390268474e-10, "kl": 0.10569487512111664, "learning_rate": 0.00019154080085253666, "loss": 0.0042, "num_tokens": 1249837.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1771978021978022, "grad_norm": 1.0080569191883715e-12, "kl": 0.09871838241815567, "learning_rate": 0.00019107153505765306, "loss": 0.0039, "num_tokens": 1258753.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17857142857142858, "grad_norm": 9.894976548199819e-13, "kl": 0.09806834161281586, "learning_rate": 0.00019059020912370834, "loss": 0.0039, "num_tokens": 1269917.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.17994505494505494, "grad_norm": 8.075190999079496e-05, "kl": 0.09455280005931854, "learning_rate": 0.0001900968867902419, "loss": 0.0038, "num_tokens": 1279953.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1813186813186813, "grad_norm": 9.762452651829179e-11, "kl": 0.10221787542104721, "learning_rate": 0.00018959163338541518, "loss": 0.0041, "num_tokens": 1290457.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18269230769230768, "grad_norm": 1.461808105118756e-12, "kl": 0.10973717272281647, "learning_rate": 0.00018907451581736054, "loss": 0.0044, "num_tokens": 1301045.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18406593406593408, "grad_norm": 6.480055162683129e-06, "kl": 0.10601633787155151, "learning_rate": 0.000188545602565321, "loss": 0.0042, "num_tokens": 1310641.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18543956043956045, "grad_norm": 4.994483333575772e-06, "kl": 0.10223455727100372, "learning_rate": 0.00018800496367058186, "loss": 0.0041, "num_tokens": 1319737.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18681318681318682, "grad_norm": 1.7573209597568362e-12, "kl": 0.10154764354228973, "learning_rate": 0.00018745267072719555, "loss": 0.0041, "num_tokens": 1332761.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18818681318681318, "grad_norm": 6.921498096890133e-11, "kl": 0.10028126835823059, "learning_rate": 0.00018688879687250067, "loss": 0.004, "num_tokens": 1341549.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.18956043956043955, "grad_norm": 1.2960707507225777e-10, "kl": 0.09794390201568604, "learning_rate": 0.0001863134167774369, "loss": 0.0039, "num_tokens": 1350757.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.19093406593406592, "grad_norm": 5.230375160758527e-11, "kl": 0.10160030424594879, "learning_rate": 0.0001857266066366567, "loss": 0.0041, "num_tokens": 1360501.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.19230769230769232, "grad_norm": 1.0560229990810854e-12, "kl": 0.103685662150383, "learning_rate": 0.00018512844415843514, "loss": 0.0041, "num_tokens": 1370313.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1936813186813187, "grad_norm": 2.674767074495321e-10, "kl": 0.09330381453037262, "learning_rate": 0.0001845190085543795, "loss": 0.0037, "num_tokens": 1380677.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.19505494505494506, "grad_norm": 2.2778855190974667e-11, "kl": 0.09999950975179672, "learning_rate": 0.0001838983805289396, "loss": 0.004, "num_tokens": 1390657.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.19642857142857142, "grad_norm": 1.9878954844898544e-05, "kl": 0.09865181148052216, "learning_rate": 0.00018326664226872065, "loss": 0.0039, "num_tokens": 1399485.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1978021978021978, "grad_norm": 2.795741693706333e-10, "kl": 0.09503644704818726, "learning_rate": 0.0001826238774315995, "loss": 0.0038, "num_tokens": 1408801.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.19917582417582416, "grad_norm": 8.927055600814471e-13, "kl": 0.0966976135969162, "learning_rate": 0.0001819701711356464, "loss": 0.0039, "num_tokens": 1417849.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.20054945054945056, "grad_norm": 2.8431548781959748e-11, "kl": 0.09874029457569122, "learning_rate": 0.00018130560994785325, "loss": 0.0039, "num_tokens": 1427401.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.20192307692307693, "grad_norm": 1.6977610723528658e-12, "kl": 0.09741991758346558, "learning_rate": 0.00018063028187266986, "loss": 0.0039, "num_tokens": 1440597.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2032967032967033, "grad_norm": 1.99341563221489e-12, "kl": 0.1147979199886322, "learning_rate": 0.00017994427634035015, "loss": 0.0046, "num_tokens": 1448809.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.20467032967032966, "grad_norm": 3.51227449901792e-11, "kl": 0.099201500415802, "learning_rate": 0.00017924768419510904, "loss": 0.004, "num_tokens": 1456757.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.20604395604395603, "grad_norm": 1.0256424615856519e-12, "kl": 0.09562988579273224, "learning_rate": 0.0001785405976830929, "loss": 0.0038, "num_tokens": 1465425.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.20741758241758243, "grad_norm": 1.108717393068226e-12, "kl": 0.09575396776199341, "learning_rate": 0.00017782311044016338, "loss": 0.0038, "num_tokens": 1474741.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2087912087912088, "grad_norm": 5.0341545829724055e-06, "kl": 0.10148782283067703, "learning_rate": 0.00017709531747949796, "loss": 0.0041, "num_tokens": 1482581.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21016483516483517, "grad_norm": 2.99870933639923e-10, "kl": 0.1021440252661705, "learning_rate": 0.00017635731517900782, "loss": 0.0041, "num_tokens": 1492853.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21153846153846154, "grad_norm": 1.091152992613309e-12, "kl": 0.09182263910770416, "learning_rate": 0.0001756092012685749, "loss": 0.0037, "num_tokens": 1503373.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2129120879120879, "grad_norm": 1.6996105044186915e-12, "kl": 0.10179007798433304, "learning_rate": 0.00017485107481711012, "loss": 0.0041, "num_tokens": 1514689.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21428571428571427, "grad_norm": 5.036477485065305e-11, "kl": 0.10354246199131012, "learning_rate": 0.00017408303621943417, "loss": 0.0041, "num_tokens": 1524061.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21565934065934067, "grad_norm": 2.611482141645638e-10, "kl": 0.09654616564512253, "learning_rate": 0.00017330518718298264, "loss": 0.0039, "num_tokens": 1533365.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21703296703296704, "grad_norm": 2.519448018267756e-11, "kl": 0.09937654435634613, "learning_rate": 0.00017251763071433765, "loss": 0.004, "num_tokens": 1542957.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2184065934065934, "grad_norm": 1.7613983882558415e-10, "kl": 0.09588401019573212, "learning_rate": 0.000171720471105587, "loss": 0.0038, "num_tokens": 1553277.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.21978021978021978, "grad_norm": 7.541706148472826e-11, "kl": 0.09644173085689545, "learning_rate": 0.0001709138139205133, "loss": 0.0039, "num_tokens": 1563205.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22115384615384615, "grad_norm": 7.581285599300713e-11, "kl": 0.1057298481464386, "learning_rate": 0.00017009776598061495, "loss": 0.0042, "num_tokens": 1573037.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22252747252747251, "grad_norm": 1.159514359527769e-10, "kl": 0.0927734524011612, "learning_rate": 0.00016927243535095997, "loss": 0.0037, "num_tokens": 1581629.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2239010989010989, "grad_norm": 1.9156437547351857e-10, "kl": 0.10507844388484955, "learning_rate": 0.00016843793132587567, "loss": 0.0042, "num_tokens": 1594721.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22527472527472528, "grad_norm": 1.0183750418946147e-05, "kl": 0.0972292572259903, "learning_rate": 0.00016759436441447545, "loss": 0.0039, "num_tokens": 1603965.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22664835164835165, "grad_norm": 1.5577465532767842e-12, "kl": 0.09718875586986542, "learning_rate": 0.00016674184632602446, "loss": 0.0039, "num_tokens": 1613061.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22802197802197802, "grad_norm": 1.7360203986155343e-12, "kl": 0.11227698624134064, "learning_rate": 0.00016588048995514658, "loss": 0.0045, "num_tokens": 1622673.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.22939560439560439, "grad_norm": 1.896784595709078e-12, "kl": 0.10721049457788467, "learning_rate": 0.00016501040936687443, "loss": 0.0043, "num_tokens": 1632625.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23076923076923078, "grad_norm": 5.4286829254124314e-08, "kl": 0.10550742596387863, "learning_rate": 0.0001641317197815442, "loss": 0.0042, "num_tokens": 1640109.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23214285714285715, "grad_norm": 1.3051394687213502e-10, "kl": 0.09590387344360352, "learning_rate": 0.00016324453755953773, "loss": 0.0038, "num_tokens": 1650469.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23351648351648352, "grad_norm": 7.107788663196501e-11, "kl": 0.10096496343612671, "learning_rate": 0.00016234898018587337, "loss": 0.004, "num_tokens": 1658477.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2348901098901099, "grad_norm": 1.040791259600271e-12, "kl": 0.0943106859922409, "learning_rate": 0.00016144516625464812, "loss": 0.0038, "num_tokens": 1672037.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23626373626373626, "grad_norm": 2.61657570610474e-11, "kl": 0.09802871197462082, "learning_rate": 0.00016053321545333283, "loss": 0.0039, "num_tokens": 1682865.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23763736263736263, "grad_norm": 3.613826946025078e-11, "kl": 0.10666793584823608, "learning_rate": 0.00015961324854692254, "loss": 0.0043, "num_tokens": 1693105.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.23901098901098902, "grad_norm": 2.485877476088305e-11, "kl": 0.09390038251876831, "learning_rate": 0.00015868538736194427, "loss": 0.0038, "num_tokens": 1704197.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2403846153846154, "grad_norm": 3.605274065399122e-11, "kl": 0.10316925495862961, "learning_rate": 0.000157749754770324, "loss": 0.0041, "num_tokens": 1714561.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24175824175824176, "grad_norm": 8.445318749883413e-11, "kl": 0.10414046049118042, "learning_rate": 0.00015680647467311557, "loss": 0.0042, "num_tokens": 1725569.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24313186813186813, "grad_norm": 1.7004415453839017e-12, "kl": 0.10776882618665695, "learning_rate": 0.00015585567198409298, "loss": 0.0043, "num_tokens": 1736413.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2445054945054945, "grad_norm": 1.2177051034178987e-10, "kl": 0.10657480359077454, "learning_rate": 0.00015489747261320866, "loss": 0.0043, "num_tokens": 1745773.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24587912087912087, "grad_norm": 9.938124628794398e-11, "kl": 0.09978938102722168, "learning_rate": 0.00015393200344991995, "loss": 0.004, "num_tokens": 1756281.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24725274725274726, "grad_norm": 4.697896932581713e-11, "kl": 0.0992407351732254, "learning_rate": 0.00015295939234638564, "loss": 0.004, "num_tokens": 1765641.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24862637362637363, "grad_norm": 1.7098846210455987e-12, "kl": 0.09283805638551712, "learning_rate": 0.00015197976810053544, "loss": 0.0037, "num_tokens": 1778021.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25, "grad_norm": 1.6262817481571545e-10, "kl": 0.10042310506105423, "learning_rate": 0.0001509932604390136, "loss": 0.004, "num_tokens": 1787493.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25137362637362637, "grad_norm": 5.83793777209074e-11, "kl": 0.09976917505264282, "learning_rate": 0.00015000000000000001, "loss": 0.004, "num_tokens": 1795685.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25274725274725274, "grad_norm": 0.00017252341785933822, "kl": 0.09585652500391006, "learning_rate": 0.0001490001183159105, "loss": 0.0038, "num_tokens": 1805761.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2541208791208791, "grad_norm": 1.81119528264162e-08, "kl": 0.10210035741329193, "learning_rate": 0.00014799374779597867, "loss": 0.0041, "num_tokens": 1814417.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2554945054945055, "grad_norm": 2.972102078335759e-11, "kl": 0.10080772638320923, "learning_rate": 0.0001469810217087214, "loss": 0.004, "num_tokens": 1823125.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25686813186813184, "grad_norm": 2.005105283198194e-11, "kl": 0.10225978493690491, "learning_rate": 0.0001459620741642912, "loss": 0.0041, "num_tokens": 1832653.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25824175824175827, "grad_norm": 2.347315918027615e-11, "kl": 0.09697722643613815, "learning_rate": 0.00014493704009671613, "loss": 0.0039, "num_tokens": 1844421.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.25961538461538464, "grad_norm": 2.595573712557697e-12, "kl": 0.10577282309532166, "learning_rate": 0.0001439060552460318, "loss": 0.0042, "num_tokens": 1853273.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.260989010989011, "grad_norm": 1.0048063726550427e-12, "kl": 0.09902996569871902, "learning_rate": 0.00014286925614030542, "loss": 0.004, "num_tokens": 1863113.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2623626373626374, "grad_norm": 1.477954841844209e-10, "kl": 0.1023586243391037, "learning_rate": 0.0001418267800775565, "loss": 0.0041, "num_tokens": 1874073.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.26373626373626374, "grad_norm": 1.87038687489427e-12, "kl": 0.09767340123653412, "learning_rate": 0.00014077876510757502, "loss": 0.0039, "num_tokens": 1885169.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2651098901098901, "grad_norm": 2.1191756682803486e-10, "kl": 0.09933942556381226, "learning_rate": 0.00013972535001364014, "loss": 0.004, "num_tokens": 1894905.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2664835164835165, "grad_norm": 1.0312095144807132e-12, "kl": 0.10606560111045837, "learning_rate": 0.0001386666742941419, "loss": 0.0042, "num_tokens": 1903493.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.26785714285714285, "grad_norm": 1.0315787937406617e-12, "kl": 0.09857428073883057, "learning_rate": 0.00013760287814410823, "loss": 0.0039, "num_tokens": 1913553.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2692307692307692, "grad_norm": 1.1809293820608247e-10, "kl": 0.10582117736339569, "learning_rate": 0.00013653410243663952, "loss": 0.0042, "num_tokens": 1923949.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2706043956043956, "grad_norm": 8.466682910324153e-11, "kl": 0.09563669562339783, "learning_rate": 0.00013546048870425356, "loss": 0.0038, "num_tokens": 1935317.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.27197802197802196, "grad_norm": 1.7039015597769547e-12, "kl": 0.09831060469150543, "learning_rate": 0.00013438217912014317, "loss": 0.0039, "num_tokens": 1945021.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2733516483516483, "grad_norm": 9.848157012104508e-11, "kl": 0.10532185435295105, "learning_rate": 0.00013329931647934883, "loss": 0.0042, "num_tokens": 1955029.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.27472527472527475, "grad_norm": 9.041581506608054e-06, "kl": 0.0995059534907341, "learning_rate": 0.00013221204417984908, "loss": 0.004, "num_tokens": 1964497.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2760989010989011, "grad_norm": 3.1604455164035983e-10, "kl": 0.10081399977207184, "learning_rate": 0.0001311205062035711, "loss": 0.004, "num_tokens": 1976293.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2774725274725275, "grad_norm": 3.3440097912951217e-11, "kl": 0.10248749703168869, "learning_rate": 0.0001300248470973239, "loss": 0.0041, "num_tokens": 1985425.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.27884615384615385, "grad_norm": 1.6364778455957296e-12, "kl": 0.09849707782268524, "learning_rate": 0.00012892521195365678, "loss": 0.0039, "num_tokens": 1996225.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2802197802197802, "grad_norm": 6.492853477091387e-11, "kl": 0.10074122250080109, "learning_rate": 0.0001278217463916453, "loss": 0.004, "num_tokens": 2006277.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2815934065934066, "grad_norm": 7.825871762179304e-06, "kl": 0.10076166689395905, "learning_rate": 0.0001267145965376078, "loss": 0.004, "num_tokens": 2015309.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.28296703296703296, "grad_norm": 3.4164594764352074e-11, "kl": 0.10823695361614227, "learning_rate": 0.0001256039090057547, "loss": 0.0043, "num_tokens": 2024293.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.28434065934065933, "grad_norm": 2.0349579660239314e-11, "kl": 0.09978428483009338, "learning_rate": 0.00012448983087877307, "loss": 0.004, "num_tokens": 2033685.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2857142857142857, "grad_norm": 8.391226949400199e-12, "kl": 0.09664393961429596, "learning_rate": 0.00012337250968834913, "loss": 0.0039, "num_tokens": 2044057.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.28708791208791207, "grad_norm": 3.1038085296364315e-11, "kl": 0.09822292625904083, "learning_rate": 0.00012225209339563145, "loss": 0.0039, "num_tokens": 2052421.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.28846153846153844, "grad_norm": 2.1641757830259678e-11, "kl": 0.10072015225887299, "learning_rate": 0.00012112873037163728, "loss": 0.004, "num_tokens": 2063337.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.28983516483516486, "grad_norm": 1.142271607162959e-12, "kl": 0.10045331716537476, "learning_rate": 0.00012000256937760445, "loss": 0.004, "num_tokens": 2071489.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.29120879120879123, "grad_norm": 1.6328326191139553e-10, "kl": 0.09756055474281311, "learning_rate": 0.00011887375954529168, "loss": 0.0039, "num_tokens": 2081465.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2925824175824176, "grad_norm": 5.7384222373890736e-11, "kl": 0.0973486378788948, "learning_rate": 0.00011774245035722983, "loss": 0.0039, "num_tokens": 2091169.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.29395604395604397, "grad_norm": 8.592917501680508e-13, "kl": 0.09655678272247314, "learning_rate": 0.00011660879162692675, "loss": 0.0039, "num_tokens": 2099149.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.29532967032967034, "grad_norm": 8.378822267084141e-13, "kl": 0.09749593585729599, "learning_rate": 0.00011547293347902812, "loss": 0.0039, "num_tokens": 2108153.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2967032967032967, "grad_norm": 9.868863504181036e-10, "kl": 0.10469315946102142, "learning_rate": 0.00011433502632943735, "loss": 0.0042, "num_tokens": 2118289.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.2980769230769231, "grad_norm": 6.370412530820602e-11, "kl": 0.09354037046432495, "learning_rate": 0.00011319522086539667, "loss": 0.0037, "num_tokens": 2126937.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.29945054945054944, "grad_norm": 3.3110042485517965e-11, "kl": 0.0943954735994339, "learning_rate": 0.0001120536680255323, "loss": 0.0038, "num_tokens": 2136533.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3008241758241758, "grad_norm": 2.3006430094341113e-11, "kl": 0.09597738832235336, "learning_rate": 0.00011091051897986678, "loss": 0.0038, "num_tokens": 2145665.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3021978021978022, "grad_norm": 1.4305159704366965e-12, "kl": 0.10086312890052795, "learning_rate": 0.00010976592510979982, "loss": 0.004, "num_tokens": 2155693.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.30357142857142855, "grad_norm": 6.00549471263534e-11, "kl": 0.10609370470046997, "learning_rate": 0.00010862003798806196, "loss": 0.0042, "num_tokens": 2164929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.30494505494505497, "grad_norm": 3.265708073740825e-05, "kl": 0.10385602712631226, "learning_rate": 0.00010747300935864243, "loss": 0.0042, "num_tokens": 2175785.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.30631868131868134, "grad_norm": 1.5007060264610184e-12, "kl": 0.10799738764762878, "learning_rate": 0.00010632499111669454, "loss": 0.0043, "num_tokens": 2186877.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3076923076923077, "grad_norm": 1.424686341122694e-10, "kl": 0.10518504679203033, "learning_rate": 0.00010517613528842097, "loss": 0.0042, "num_tokens": 2194805.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3090659340659341, "grad_norm": 5.296512187169533e-11, "kl": 0.10310830175876617, "learning_rate": 0.00010402659401094152, "loss": 0.0041, "num_tokens": 2204861.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.31043956043956045, "grad_norm": 2.2079555198062373e-11, "kl": 0.10355495661497116, "learning_rate": 0.00010287651951214674, "loss": 0.0041, "num_tokens": 2214013.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3118131868131868, "grad_norm": 2.394778264580566e-10, "kl": 0.09841965138912201, "learning_rate": 0.00010172606409053886, "loss": 0.0039, "num_tokens": 2225469.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3131868131868132, "grad_norm": 9.006347100395939e-13, "kl": 0.10139922797679901, "learning_rate": 0.00010057538009506377, "loss": 0.0041, "num_tokens": 2235629.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.31456043956043955, "grad_norm": 3.525576011687015e-11, "kl": 0.1048964187502861, "learning_rate": 9.942461990493625e-05, "loss": 0.0042, "num_tokens": 2244481.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3159340659340659, "grad_norm": 1.5577804671207396e-11, "kl": 0.09502234309911728, "learning_rate": 9.827393590946116e-05, "loss": 0.0038, "num_tokens": 2254329.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3173076923076923, "grad_norm": 2.5303288353484277e-09, "kl": 0.10155314207077026, "learning_rate": 9.712348048785329e-05, "loss": 0.0041, "num_tokens": 2264837.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.31868131868131866, "grad_norm": 4.4064044080194265e-10, "kl": 0.10295082628726959, "learning_rate": 9.597340598905852e-05, "loss": 0.0041, "num_tokens": 2274401.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.32005494505494503, "grad_norm": 1.1155374584140287e-12, "kl": 0.09711534529924393, "learning_rate": 9.482386471157904e-05, "loss": 0.0039, "num_tokens": 2283069.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.32142857142857145, "grad_norm": 9.831378520175349e-06, "kl": 0.10743525624275208, "learning_rate": 9.367500888330545e-05, "loss": 0.0043, "num_tokens": 2291769.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3228021978021978, "grad_norm": 3.384180088938926e-11, "kl": 0.10411744564771652, "learning_rate": 9.252699064135758e-05, "loss": 0.0042, "num_tokens": 2300757.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3241758241758242, "grad_norm": 1.3304262438396108e-06, "kl": 0.10176746547222137, "learning_rate": 9.137996201193805e-05, "loss": 0.0041, "num_tokens": 2312577.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.32554945054945056, "grad_norm": 2.321955435036216e-06, "kl": 0.10588907450437546, "learning_rate": 9.023407489020019e-05, "loss": 0.0042, "num_tokens": 2323009.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3269230769230769, "grad_norm": 1.7605322755187558e-10, "kl": 0.10773593187332153, "learning_rate": 8.908948102013326e-05, "loss": 0.0043, "num_tokens": 2332313.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3282967032967033, "grad_norm": 5.940694464134921e-11, "kl": 0.09714624285697937, "learning_rate": 8.79463319744677e-05, "loss": 0.0039, "num_tokens": 2343697.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.32967032967032966, "grad_norm": 3.659261435529082e-11, "kl": 0.10449790954589844, "learning_rate": 8.680477913460338e-05, "loss": 0.0042, "num_tokens": 2352321.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.33104395604395603, "grad_norm": 9.933285747762421e-13, "kl": 0.09594735503196716, "learning_rate": 8.566497367056267e-05, "loss": 0.0038, "num_tokens": 2362385.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3324175824175824, "grad_norm": 6.661509538430366e-11, "kl": 0.09952082484960556, "learning_rate": 8.452706652097186e-05, "loss": 0.004, "num_tokens": 2375529.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.33379120879120877, "grad_norm": 3.02602144730546e-11, "kl": 0.10189050436019897, "learning_rate": 8.339120837307325e-05, "loss": 0.0041, "num_tokens": 2384333.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.33516483516483514, "grad_norm": 1.5691574558815424e-12, "kl": 0.0989951491355896, "learning_rate": 8.225754964277018e-05, "loss": 0.004, "num_tokens": 2395353.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.33653846153846156, "grad_norm": 2.396232084284078e-12, "kl": 0.10867413133382797, "learning_rate": 8.112624045470835e-05, "loss": 0.0043, "num_tokens": 2404429.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.33791208791208793, "grad_norm": 2.970406212665644e-11, "kl": 0.10207439959049225, "learning_rate": 7.999743062239557e-05, "loss": 0.0041, "num_tokens": 2417437.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3392857142857143, "grad_norm": 5.221999526838772e-05, "kl": 0.09248314797878265, "learning_rate": 7.887126962836273e-05, "loss": 0.0037, "num_tokens": 2426121.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.34065934065934067, "grad_norm": 1.113661897075846e-12, "kl": 0.09790235757827759, "learning_rate": 7.774790660436858e-05, "loss": 0.0039, "num_tokens": 2434929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.34203296703296704, "grad_norm": 1.0877071812687156e-12, "kl": 0.10433720052242279, "learning_rate": 7.662749031165092e-05, "loss": 0.0042, "num_tokens": 2445301.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3434065934065934, "grad_norm": 1.0031064173743331e-10, "kl": 0.09679612517356873, "learning_rate": 7.551016912122691e-05, "loss": 0.0039, "num_tokens": 2455925.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3447802197802198, "grad_norm": 2.615710426034923e-11, "kl": 0.10794855654239655, "learning_rate": 7.43960909942453e-05, "loss": 0.0043, "num_tokens": 2466229.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.34615384615384615, "grad_norm": 2.0180241158684886e-11, "kl": 0.09860313683748245, "learning_rate": 7.328540346239223e-05, "loss": 0.0039, "num_tokens": 2475881.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3475274725274725, "grad_norm": 3.183017391328313e-11, "kl": 0.09505751729011536, "learning_rate": 7.217825360835473e-05, "loss": 0.0038, "num_tokens": 2484377.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3489010989010989, "grad_norm": 1.2316015178190365e-12, "kl": 0.10996980965137482, "learning_rate": 7.107478804634325e-05, "loss": 0.0044, "num_tokens": 2492709.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.35027472527472525, "grad_norm": 1.3284381927824995e-10, "kl": 0.09889383614063263, "learning_rate": 6.997515290267611e-05, "loss": 0.004, "num_tokens": 2502353.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3516483516483517, "grad_norm": 4.187723431914314e-11, "kl": 0.09898985177278519, "learning_rate": 6.887949379642893e-05, "loss": 0.004, "num_tokens": 2510969.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.35302197802197804, "grad_norm": 2.207370050633095e-11, "kl": 0.0981399267911911, "learning_rate": 6.778795582015097e-05, "loss": 0.0039, "num_tokens": 2519873.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3543956043956044, "grad_norm": 2.1747707801278438e-11, "kl": 0.09707866609096527, "learning_rate": 6.67006835206512e-05, "loss": 0.0039, "num_tokens": 2529929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3557692307692308, "grad_norm": 5.637412758829896e-11, "kl": 0.10217994451522827, "learning_rate": 6.561782087985681e-05, "loss": 0.0041, "num_tokens": 2541345.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.35714285714285715, "grad_norm": 1.4500476557335884e-12, "kl": 0.09792613983154297, "learning_rate": 6.453951129574644e-05, "loss": 0.0039, "num_tokens": 2552849.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3585164835164835, "grad_norm": 1.2997244339812974e-12, "kl": 0.10093636810779572, "learning_rate": 6.34658975633605e-05, "loss": 0.004, "num_tokens": 2562057.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3598901098901099, "grad_norm": 1.1950127302637337e-12, "kl": 0.09681189060211182, "learning_rate": 6.239712185589181e-05, "loss": 0.0039, "num_tokens": 2571145.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.36126373626373626, "grad_norm": 3.766434386598405e-10, "kl": 0.10406795144081116, "learning_rate": 6.133332570585812e-05, "loss": 0.0042, "num_tokens": 2581113.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3626373626373626, "grad_norm": 9.761036380215304e-13, "kl": 0.09790156036615372, "learning_rate": 6.02746499863599e-05, "loss": 0.0039, "num_tokens": 2593145.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.364010989010989, "grad_norm": 8.13757806798765e-13, "kl": 0.09999528527259827, "learning_rate": 5.922123489242499e-05, "loss": 0.004, "num_tokens": 2601413.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.36538461538461536, "grad_norm": 9.211899906075294e-13, "kl": 0.10531137883663177, "learning_rate": 5.817321992244351e-05, "loss": 0.0042, "num_tokens": 2609797.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.36675824175824173, "grad_norm": 1.4020107774390111e-12, "kl": 0.0950576514005661, "learning_rate": 5.713074385969457e-05, "loss": 0.0038, "num_tokens": 2619893.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.36813186813186816, "grad_norm": 3.034852924521658e-11, "kl": 0.1098005622625351, "learning_rate": 5.6093944753968206e-05, "loss": 0.0044, "num_tokens": 2628801.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3695054945054945, "grad_norm": 1.9201255169187803e-11, "kl": 0.10106509923934937, "learning_rate": 5.506295990328385e-05, "loss": 0.004, "num_tokens": 2637441.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3708791208791209, "grad_norm": 4.421710220192665e-11, "kl": 0.09477835893630981, "learning_rate": 5.4037925835708837e-05, "loss": 0.0038, "num_tokens": 2646469.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.37225274725274726, "grad_norm": 4.080715370946564e-06, "kl": 0.09966424107551575, "learning_rate": 5.3018978291278633e-05, "loss": 0.004, "num_tokens": 2655105.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.37362637362637363, "grad_norm": 1.0148173534146876e-12, "kl": 0.10145386308431625, "learning_rate": 5.200625220402139e-05, "loss": 0.0041, "num_tokens": 2664101.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.375, "grad_norm": 6.974342631194119e-11, "kl": 0.10559321939945221, "learning_rate": 5.0999881684089525e-05, "loss": 0.0042, "num_tokens": 2673001.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.37637362637362637, "grad_norm": 1.4655575364397322e-11, "kl": 0.09224438667297363, "learning_rate": 5.000000000000002e-05, "loss": 0.0037, "num_tokens": 2684313.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.37774725274725274, "grad_norm": 1.3707387004971427e-12, "kl": 0.09745369106531143, "learning_rate": 4.900673956098644e-05, "loss": 0.0039, "num_tokens": 2695113.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3791208791208791, "grad_norm": 4.913976517362251e-11, "kl": 0.10092083364725113, "learning_rate": 4.802023189946454e-05, "loss": 0.004, "num_tokens": 2706029.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3804945054945055, "grad_norm": 6.81619899411956e-11, "kl": 0.10475972294807434, "learning_rate": 4.704060765361433e-05, "loss": 0.0042, "num_tokens": 2715653.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.38186813186813184, "grad_norm": 3.179009139264721e-11, "kl": 0.10568700730800629, "learning_rate": 4.606799655008009e-05, "loss": 0.0042, "num_tokens": 2726785.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.38324175824175827, "grad_norm": 5.2127913097166356e-11, "kl": 0.09345732629299164, "learning_rate": 4.510252738679136e-05, "loss": 0.0037, "num_tokens": 2737013.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.38461538461538464, "grad_norm": 1.0868561909835317e-12, "kl": 0.10062143206596375, "learning_rate": 4.4144328015907035e-05, "loss": 0.004, "num_tokens": 2745973.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.385989010989011, "grad_norm": 1.2533992940766403e-12, "kl": 0.10508638620376587, "learning_rate": 4.3193525326884435e-05, "loss": 0.0042, "num_tokens": 2756053.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3873626373626374, "grad_norm": 1.765243767132274e-12, "kl": 0.10787059366703033, "learning_rate": 4.225024522967602e-05, "loss": 0.0043, "num_tokens": 2767649.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.38873626373626374, "grad_norm": 8.049406974297568e-11, "kl": 0.10331256687641144, "learning_rate": 4.131461263805576e-05, "loss": 0.0041, "num_tokens": 2777197.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3901098901098901, "grad_norm": 2.5957644367302635e-10, "kl": 0.09450673311948776, "learning_rate": 4.038675145307747e-05, "loss": 0.0038, "num_tokens": 2789601.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3914835164835165, "grad_norm": 1.0235039812206415e-12, "kl": 0.10084620118141174, "learning_rate": 3.946678454666719e-05, "loss": 0.004, "num_tokens": 2797669.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.39285714285714285, "grad_norm": 1.0243484662927904e-12, "kl": 0.10402539372444153, "learning_rate": 3.85548337453519e-05, "loss": 0.0042, "num_tokens": 2805701.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3942307692307692, "grad_norm": 3.214341987023417e-11, "kl": 0.09496676921844482, "learning_rate": 3.7651019814126654e-05, "loss": 0.0038, "num_tokens": 2817093.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3956043956043956, "grad_norm": 5.4907243196566924e-08, "kl": 0.10488665103912354, "learning_rate": 3.675546244046228e-05, "loss": 0.0042, "num_tokens": 2826285.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.39697802197802196, "grad_norm": 1.0598282234458578e-12, "kl": 0.11106427013874054, "learning_rate": 3.5868280218455796e-05, "loss": 0.0044, "num_tokens": 2835765.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3983516483516483, "grad_norm": 1.7353105064010776e-11, "kl": 0.1012025773525238, "learning_rate": 3.498959063312558e-05, "loss": 0.004, "num_tokens": 2844633.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.39972527472527475, "grad_norm": 8.372238319287462e-11, "kl": 0.10018837451934814, "learning_rate": 3.411951004485343e-05, "loss": 0.004, "num_tokens": 2853105.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4010989010989011, "grad_norm": 3.6175333661958575e-06, "kl": 0.10144409537315369, "learning_rate": 3.325815367397557e-05, "loss": 0.0041, "num_tokens": 2861009.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4024725274725275, "grad_norm": 3.5911225382267986e-11, "kl": 0.10155778378248215, "learning_rate": 3.2405635585524565e-05, "loss": 0.0041, "num_tokens": 2871245.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.40384615384615385, "grad_norm": 2.9936240986128126e-11, "kl": 0.09421390295028687, "learning_rate": 3.1562068674124344e-05, "loss": 0.0038, "num_tokens": 2880705.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4052197802197802, "grad_norm": 6.449795558749472e-11, "kl": 0.1042446494102478, "learning_rate": 3.072756464904006e-05, "loss": 0.0042, "num_tokens": 2889937.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4065934065934066, "grad_norm": 7.366781490381058e-11, "kl": 0.09985928237438202, "learning_rate": 2.9902234019385057e-05, "loss": 0.004, "num_tokens": 2898205.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.40796703296703296, "grad_norm": 1.3522000359700304e-12, "kl": 0.09752359986305237, "learning_rate": 2.9086186079486688e-05, "loss": 0.0039, "num_tokens": 2908261.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.40934065934065933, "grad_norm": 2.858543610151365e-11, "kl": 0.09792864322662354, "learning_rate": 2.8279528894413022e-05, "loss": 0.0039, "num_tokens": 2917089.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4107142857142857, "grad_norm": 9.04626512587825e-11, "kl": 0.09729765355587006, "learning_rate": 2.7482369285662378e-05, "loss": 0.0039, "num_tokens": 2925861.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.41208791208791207, "grad_norm": 4.835294930671807e-06, "kl": 0.0979449450969696, "learning_rate": 2.669481281701739e-05, "loss": 0.0039, "num_tokens": 2934985.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.41346153846153844, "grad_norm": 9.755229263275211e-11, "kl": 0.0972272977232933, "learning_rate": 2.5916963780565894e-05, "loss": 0.0039, "num_tokens": 2943421.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.41483516483516486, "grad_norm": 8.854209227138199e-06, "kl": 0.0970732569694519, "learning_rate": 2.514892518288988e-05, "loss": 0.0039, "num_tokens": 2951469.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.41620879120879123, "grad_norm": 9.167576420221479e-11, "kl": 0.10965196788311005, "learning_rate": 2.43907987314251e-05, "loss": 0.0044, "num_tokens": 2962373.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4175824175824176, "grad_norm": 1.4960105636924181e-12, "kl": 0.11205114424228668, "learning_rate": 2.364268482099218e-05, "loss": 0.0045, "num_tokens": 2972621.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.41895604395604397, "grad_norm": 1.3127583872382664e-12, "kl": 0.09711016714572906, "learning_rate": 2.290468252050204e-05, "loss": 0.0039, "num_tokens": 2984985.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.42032967032967034, "grad_norm": 1.2451610922892264e-12, "kl": 0.0965256541967392, "learning_rate": 2.2176889559836656e-05, "loss": 0.0039, "num_tokens": 2994257.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4217032967032967, "grad_norm": 9.882599599775332e-11, "kl": 0.09205780178308487, "learning_rate": 2.145940231690713e-05, "loss": 0.0037, "num_tokens": 3004353.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4230769230769231, "grad_norm": 4.157865024501106e-11, "kl": 0.10926282405853271, "learning_rate": 2.0752315804890977e-05, "loss": 0.0044, "num_tokens": 3013489.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.42445054945054944, "grad_norm": 1.508394971427851e-12, "kl": 0.10076142847537994, "learning_rate": 2.0055723659649904e-05, "loss": 0.004, "num_tokens": 3024781.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4258241758241758, "grad_norm": 4.805372766369942e-11, "kl": 0.0967416986823082, "learning_rate": 1.9369718127330117e-05, "loss": 0.0039, "num_tokens": 3035437.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4271978021978022, "grad_norm": 7.416853242681043e-11, "kl": 0.09801910072565079, "learning_rate": 1.8694390052146737e-05, "loss": 0.0039, "num_tokens": 3045349.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.42857142857142855, "grad_norm": 1.3410829521540157e-12, "kl": 0.10218630731105804, "learning_rate": 1.8029828864353583e-05, "loss": 0.0041, "num_tokens": 3054929.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.42994505494505497, "grad_norm": 7.076898095315087e-11, "kl": 0.09475124627351761, "learning_rate": 1.7376122568400532e-05, "loss": 0.0038, "num_tokens": 3063393.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.43131868131868134, "grad_norm": 6.046815825833107e-11, "kl": 0.10225345194339752, "learning_rate": 1.6733357731279377e-05, "loss": 0.0041, "num_tokens": 3072501.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4326923076923077, "grad_norm": 1.0916311604658446e-10, "kl": 0.09987315535545349, "learning_rate": 1.6101619471060413e-05, "loss": 0.004, "num_tokens": 3082573.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4340659340659341, "grad_norm": 9.738304997466973e-13, "kl": 0.09536496549844742, "learning_rate": 1.5480991445620542e-05, "loss": 0.0038, "num_tokens": 3092509.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.43543956043956045, "grad_norm": 8.070032142537542e-11, "kl": 0.10127323865890503, "learning_rate": 1.4871555841564887e-05, "loss": 0.0041, "num_tokens": 3100817.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4368131868131868, "grad_norm": 5.653191456622686e-11, "kl": 0.10101597011089325, "learning_rate": 1.4273393363343323e-05, "loss": 0.004, "num_tokens": 3111821.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4381868131868132, "grad_norm": 3.85667747737628e-11, "kl": 0.09691362082958221, "learning_rate": 1.368658322256311e-05, "loss": 0.0039, "num_tokens": 3121425.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.43956043956043955, "grad_norm": 1.3714702984390925e-10, "kl": 0.10133294761180878, "learning_rate": 1.311120312749935e-05, "loss": 0.0041, "num_tokens": 3130609.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4409340659340659, "grad_norm": 1.827989148939224e-12, "kl": 0.0983535423874855, "learning_rate": 1.2547329272804476e-05, "loss": 0.0039, "num_tokens": 3140557.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4423076923076923, "grad_norm": 1.921972025975549e-10, "kl": 0.09434045106172562, "learning_rate": 1.1995036329418152e-05, "loss": 0.0038, "num_tokens": 3150069.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.44368131868131866, "grad_norm": 2.8063482093720893e-10, "kl": 0.10108077526092529, "learning_rate": 1.1454397434679021e-05, "loss": 0.004, "num_tokens": 3161221.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.44505494505494503, "grad_norm": 8.468883233581082e-11, "kl": 0.09994029998779297, "learning_rate": 1.0925484182639467e-05, "loss": 0.004, "num_tokens": 3172933.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.44642857142857145, "grad_norm": 2.95151698992413e-06, "kl": 0.09841667115688324, "learning_rate": 1.040836661458482e-05, "loss": 0.0039, "num_tokens": 3181385.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4478021978021978, "grad_norm": 1.15118374902079e-12, "kl": 0.09957189857959747, "learning_rate": 9.903113209758096e-06, "loss": 0.004, "num_tokens": 3191717.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4491758241758242, "grad_norm": 1.0604441769146433e-08, "kl": 0.10443182289600372, "learning_rate": 9.409790876291658e-06, "loss": 0.0042, "num_tokens": 3201081.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.45054945054945056, "grad_norm": 8.62764720777065e-11, "kl": 0.10838870704174042, "learning_rate": 8.928464942346948e-06, "loss": 0.0043, "num_tokens": 3210933.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4519230769230769, "grad_norm": 1.2316087349972804e-07, "kl": 0.09911349415779114, "learning_rate": 8.45919914746337e-06, "loss": 0.004, "num_tokens": 3221153.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4532967032967033, "grad_norm": 1.0647577958211585e-10, "kl": 0.10504838824272156, "learning_rate": 8.002055634117578e-06, "loss": 0.0042, "num_tokens": 3231405.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.45467032967032966, "grad_norm": 1.244575081014998e-12, "kl": 0.10420121252536774, "learning_rate": 7.557094939494325e-06, "loss": 0.0042, "num_tokens": 3241173.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.45604395604395603, "grad_norm": 1.4938249205329046e-12, "kl": 0.09474171698093414, "learning_rate": 7.124375987469767e-06, "loss": 0.0038, "num_tokens": 3252501.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4574175824175824, "grad_norm": 1.7934731477708965e-10, "kl": 0.1060134544968605, "learning_rate": 6.703956080808515e-06, "loss": 0.0042, "num_tokens": 3264573.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.45879120879120877, "grad_norm": 1.457722180811527e-12, "kl": 0.10438117384910583, "learning_rate": 6.2958908935752955e-06, "loss": 0.0042, "num_tokens": 3274709.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46016483516483514, "grad_norm": 2.8901670987013794e-11, "kl": 0.10029658675193787, "learning_rate": 5.900234463762366e-06, "loss": 0.004, "num_tokens": 3283949.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46153846153846156, "grad_norm": 6.476191805049325e-11, "kl": 0.09902748465538025, "learning_rate": 5.517039186133433e-06, "loss": 0.004, "num_tokens": 3293989.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46291208791208793, "grad_norm": 9.27395313321533e-13, "kl": 0.10053876042366028, "learning_rate": 5.146355805285452e-06, "loss": 0.004, "num_tokens": 3302997.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4642857142857143, "grad_norm": 3.29013448430171e-11, "kl": 0.10124952346086502, "learning_rate": 4.788233408928589e-06, "loss": 0.004, "num_tokens": 3314037.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46565934065934067, "grad_norm": 3.248756229368155e-06, "kl": 0.10238954424858093, "learning_rate": 4.442719421385922e-06, "loss": 0.0041, "num_tokens": 3324697.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46703296703296704, "grad_norm": 1.3699092858351913e-12, "kl": 0.10652117431163788, "learning_rate": 4.109859597313237e-06, "loss": 0.0043, "num_tokens": 3336585.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4684065934065934, "grad_norm": 5.157354751594845e-11, "kl": 0.09955325722694397, "learning_rate": 3.789698015639953e-06, "loss": 0.004, "num_tokens": 3346197.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4697802197802198, "grad_norm": 1.3222115459801675e-12, "kl": 0.10493311285972595, "learning_rate": 3.4822770737319875e-06, "loss": 0.0042, "num_tokens": 3355181.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.47115384615384615, "grad_norm": 1.5043312732651581e-12, "kl": 0.10462287813425064, "learning_rate": 3.1876374817772837e-06, "loss": 0.0042, "num_tokens": 3366737.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4725274725274725, "grad_norm": 4.902834735420747e-11, "kl": 0.09703518450260162, "learning_rate": 2.905818257394799e-06, "loss": 0.0039, "num_tokens": 3376117.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4739010989010989, "grad_norm": 6.632150384433544e-11, "kl": 0.09672616422176361, "learning_rate": 2.636856720467573e-06, "loss": 0.0039, "num_tokens": 3388001.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.47527472527472525, "grad_norm": 1.3104578186484694e-12, "kl": 0.09617304056882858, "learning_rate": 2.380788488200658e-06, "loss": 0.0038, "num_tokens": 3396709.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4766483516483517, "grad_norm": 5.8661682282579175e-12, "kl": 0.10723964869976044, "learning_rate": 2.137647470404469e-06, "loss": 0.0043, "num_tokens": 3407981.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.47802197802197804, "grad_norm": 1.3990342087946694e-12, "kl": 0.09630632400512695, "learning_rate": 1.9074658650043763e-06, "loss": 0.0039, "num_tokens": 3418057.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4793956043956044, "grad_norm": 2.079944896871133e-11, "kl": 0.10698801279067993, "learning_rate": 1.6902741537767609e-06, "loss": 0.0043, "num_tokens": 3427249.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4807692307692308, "grad_norm": 1.9194815875644977e-11, "kl": 0.09451061487197876, "learning_rate": 1.48610109831262e-06, "loss": 0.0038, "num_tokens": 3439697.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.48214285714285715, "grad_norm": 9.080054763632717e-11, "kl": 0.10139994323253632, "learning_rate": 1.2949737362087156e-06, "loss": 0.0041, "num_tokens": 3449993.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4835164835164835, "grad_norm": 4.820320531617739e-11, "kl": 0.10317197442054749, "learning_rate": 1.1169173774871478e-06, "loss": 0.0041, "num_tokens": 3459849.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4848901098901099, "grad_norm": 2.5441667869663398e-11, "kl": 0.09690375626087189, "learning_rate": 9.519556012436815e-07, "loss": 0.0039, "num_tokens": 3469389.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.48626373626373626, "grad_norm": 9.724451799364431e-11, "kl": 0.10639894753694534, "learning_rate": 8.00110252525299e-07, "loss": 0.0043, "num_tokens": 3480089.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4876373626373626, "grad_norm": 5.4797388848726314e-11, "kl": 0.10000244528055191, "learning_rate": 6.61401439437348e-07, "loss": 0.004, "num_tokens": 3490337.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.489010989010989, "grad_norm": 1.0073086809114784e-10, "kl": 0.0931711196899414, "learning_rate": 5.358475304807375e-07, "loss": 0.0037, "num_tokens": 3500121.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.49038461538461536, "grad_norm": 2.323979984147906e-12, "kl": 0.10835463553667068, "learning_rate": 4.2346515211948433e-07, "loss": 0.0043, "num_tokens": 3510689.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.49175824175824173, "grad_norm": 3.630342207405768e-11, "kl": 0.10091705620288849, "learning_rate": 3.2426918657900704e-07, "loss": 0.004, "num_tokens": 3519673.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.49313186813186816, "grad_norm": 1.6740421982658349e-12, "kl": 0.1126289963722229, "learning_rate": 2.382727698752474e-07, "loss": 0.0045, "num_tokens": 3530029.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4945054945054945, "grad_norm": 5.1068045625601854e-11, "kl": 0.1063089519739151, "learning_rate": 1.654872900752169e-07, "loss": 0.0043, "num_tokens": 3538601.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4958791208791209, "grad_norm": 1.203455304160661e-12, "kl": 0.10159197449684143, "learning_rate": 1.0592238578892577e-07, "loss": 0.0041, "num_tokens": 3547161.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.49725274725274726, "grad_norm": 1.3296638096127467e-12, "kl": 0.10309585183858871, "learning_rate": 5.958594489295921e-08, "loss": 0.0041, "num_tokens": 3557141.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.49862637362637363, "grad_norm": 6.471718300149476e-11, "kl": 0.09392774105072021, "learning_rate": 2.6484103485924227e-08, "loss": 0.0038, "num_tokens": 3567945.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.5, "grad_norm": 1.8021133630502506e-12, "kl": 0.10265575349330902, "learning_rate": 6.621245075910665e-09, "loss": 0.0041, "num_tokens": 3578065.0, "reward": 23906.708984375, "reward_std": 0.0, "rewards/reward_long_completions/mean": 2815.0, "rewards/reward_long_completions/std": 0.0, "rewards/reward_long_sentences/mean": 256.0, "rewards/reward_long_sentences/std": 0.0, "rewards/reward_low_threat_score/mean": 0.9993722438812256, "rewards/reward_low_threat_score/std": 0.0, "step": 364 } ], "logging_steps": 1, "max_steps": 364, "num_input_tokens_seen": 3578065, "num_train_epochs": 1, "save_steps": 54, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }