| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.032, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00032, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8027206659317017, | |
| "kl": 1.71183273778297e-05, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0, | |
| "num_tokens": 196560.0, | |
| "reward": 0.7936198115348816, | |
| "reward_std": 0.17962533235549927, | |
| "rewards/reward_len/mean": 0.7936197519302368, | |
| "rewards/reward_len/std": 0.3258915841579437, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00064, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.7602783441543579, | |
| "kl": 0.00532680656760931, | |
| "learning_rate": 4.999999859632295e-06, | |
| "loss": 0.0, | |
| "num_tokens": 393040.0, | |
| "reward": 0.7620443105697632, | |
| "reward_std": 0.08783292770385742, | |
| "rewards/reward_len/mean": 0.7620443105697632, | |
| "rewards/reward_len/std": 0.3672178089618683, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00096, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7249953150749207, | |
| "kl": 0.005223212763667107, | |
| "learning_rate": 4.9999994385291934e-06, | |
| "loss": 0.0, | |
| "num_tokens": 589344.0, | |
| "reward": 0.8811849355697632, | |
| "reward_std": 0.13462764024734497, | |
| "rewards/reward_len/mean": 0.8811849355697632, | |
| "rewards/reward_len/std": 0.2868101894855499, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00128, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.0998049974441528, | |
| "kl": 0.005302521400153637, | |
| "learning_rate": 4.9999987366907436e-06, | |
| "loss": 0.0, | |
| "num_tokens": 785824.0, | |
| "reward": 0.837890625, | |
| "reward_std": 0.17104606330394745, | |
| "rewards/reward_len/mean": 0.837890625, | |
| "rewards/reward_len/std": 0.274716854095459, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0016, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8081106543540955, | |
| "kl": 0.005471091717481613, | |
| "learning_rate": 4.999997754117024e-06, | |
| "loss": 0.0, | |
| "num_tokens": 982400.0, | |
| "reward": 0.7578125, | |
| "reward_std": 0.15783792734146118, | |
| "rewards/reward_len/mean": 0.7578125, | |
| "rewards/reward_len/std": 0.3556174635887146, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00192, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.7931028008460999, | |
| "kl": 0.005383472889661789, | |
| "learning_rate": 4.999996490808146e-06, | |
| "loss": 0.0, | |
| "num_tokens": 1178736.0, | |
| "reward": 0.8411458730697632, | |
| "reward_std": 0.15851536393165588, | |
| "rewards/reward_len/mean": 0.8411458730697632, | |
| "rewards/reward_len/std": 0.2852468192577362, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00224, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.855617880821228, | |
| "kl": 0.005938877817243338, | |
| "learning_rate": 4.9999949467642495e-06, | |
| "loss": 0.0, | |
| "num_tokens": 1375184.0, | |
| "reward": 0.7444661855697632, | |
| "reward_std": 0.18254899978637695, | |
| "rewards/reward_len/mean": 0.7444661855697632, | |
| "rewards/reward_len/std": 0.5324408411979675, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00256, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8006792068481445, | |
| "kl": 0.006116841919720173, | |
| "learning_rate": 4.999993121985509e-06, | |
| "loss": 0.0, | |
| "num_tokens": 1571472.0, | |
| "reward": 0.755859375, | |
| "reward_std": 0.14840292930603027, | |
| "rewards/reward_len/mean": 0.755859375, | |
| "rewards/reward_len/std": 0.41854918003082275, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00288, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.5651379823684692, | |
| "kl": 0.006282718852162361, | |
| "learning_rate": 4.99999101647213e-06, | |
| "loss": 0.0, | |
| "num_tokens": 1767696.0, | |
| "reward": 0.8597005605697632, | |
| "reward_std": 0.09105785191059113, | |
| "rewards/reward_len/mean": 0.8597005605697632, | |
| "rewards/reward_len/std": 0.3063139021396637, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0032, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7746672630310059, | |
| "kl": 0.006277492269873619, | |
| "learning_rate": 4.9999886302243486e-06, | |
| "loss": 0.0, | |
| "num_tokens": 1964352.0, | |
| "reward": 0.736328125, | |
| "reward_std": 0.16170179843902588, | |
| "rewards/reward_len/mean": 0.736328125, | |
| "rewards/reward_len/std": 0.35940179228782654, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00352, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8158230185508728, | |
| "kl": 0.0064939698204398155, | |
| "learning_rate": 4.999985963242432e-06, | |
| "loss": 0.0, | |
| "num_tokens": 2160848.0, | |
| "reward": 0.7190755605697632, | |
| "reward_std": 0.14770188927650452, | |
| "rewards/reward_len/mean": 0.7190755605697632, | |
| "rewards/reward_len/std": 0.39762866497039795, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00384, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.4778946042060852, | |
| "kl": 0.00701224897056818, | |
| "learning_rate": 4.99998301552668e-06, | |
| "loss": 0.0, | |
| "num_tokens": 2357440.0, | |
| "reward": 0.9007161855697632, | |
| "reward_std": 0.08900929242372513, | |
| "rewards/reward_len/mean": 0.9007161855697632, | |
| "rewards/reward_len/std": 0.2015477418899536, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00416, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7480363845825195, | |
| "kl": 0.006892648059874773, | |
| "learning_rate": 4.999979787077425e-06, | |
| "loss": 0.0, | |
| "num_tokens": 2553776.0, | |
| "reward": 0.736328125, | |
| "reward_std": 0.11627350747585297, | |
| "rewards/reward_len/mean": 0.736328125, | |
| "rewards/reward_len/std": 0.4063463509082794, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00448, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.6913530826568604, | |
| "kl": 0.007525671273469925, | |
| "learning_rate": 4.9999762778950265e-06, | |
| "loss": 0.0, | |
| "num_tokens": 2750176.0, | |
| "reward": 0.8733724355697632, | |
| "reward_std": 0.13304749131202698, | |
| "rewards/reward_len/mean": 0.8733724355697632, | |
| "rewards/reward_len/std": 0.260817289352417, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0048, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.9006426334381104, | |
| "kl": 0.007139429450035095, | |
| "learning_rate": 4.999972487979882e-06, | |
| "loss": 0.0, | |
| "num_tokens": 2946432.0, | |
| "reward": 0.7353515625, | |
| "reward_std": 0.18793734908103943, | |
| "rewards/reward_len/mean": 0.7353515625, | |
| "rewards/reward_len/std": 0.4250172972679138, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00512, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7369216084480286, | |
| "kl": 0.007592486217617989, | |
| "learning_rate": 4.999968417332415e-06, | |
| "loss": 0.0, | |
| "num_tokens": 3142960.0, | |
| "reward": 0.8525390625, | |
| "reward_std": 0.13952995836734772, | |
| "rewards/reward_len/mean": 0.8525390625, | |
| "rewards/reward_len/std": 0.2791503369808197, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00544, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7958468198776245, | |
| "kl": 0.007764819078147411, | |
| "learning_rate": 4.999964065953083e-06, | |
| "loss": 0.0, | |
| "num_tokens": 3339504.0, | |
| "reward": 0.7796224355697632, | |
| "reward_std": 0.13519705832004547, | |
| "rewards/reward_len/mean": 0.7796224355697632, | |
| "rewards/reward_len/std": 0.2986285090446472, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00576, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8955850005149841, | |
| "kl": 0.008127257227897644, | |
| "learning_rate": 4.999959433842374e-06, | |
| "loss": 0.0, | |
| "num_tokens": 3536064.0, | |
| "reward": 0.6917318105697632, | |
| "reward_std": 0.22921673953533173, | |
| "rewards/reward_len/mean": 0.6917317509651184, | |
| "rewards/reward_len/std": 0.3818399906158447, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00608, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.7979457974433899, | |
| "kl": 0.007617972791194916, | |
| "learning_rate": 4.999954521000811e-06, | |
| "loss": 0.0, | |
| "num_tokens": 3732256.0, | |
| "reward": 0.78515625, | |
| "reward_std": 0.1195596233010292, | |
| "rewards/reward_len/mean": 0.78515625, | |
| "rewards/reward_len/std": 0.32990217208862305, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0064, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7890306115150452, | |
| "kl": 0.007777344435453415, | |
| "learning_rate": 4.999949327428941e-06, | |
| "loss": 0.0, | |
| "num_tokens": 3928640.0, | |
| "reward": 0.7919921875, | |
| "reward_std": 0.17304885387420654, | |
| "rewards/reward_len/mean": 0.7919921875, | |
| "rewards/reward_len/std": 0.3526277244091034, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00672, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7711103558540344, | |
| "kl": 0.008527114056050777, | |
| "learning_rate": 4.999943853127351e-06, | |
| "loss": 0.0, | |
| "num_tokens": 4124784.0, | |
| "reward": 0.9065755605697632, | |
| "reward_std": 0.09724702686071396, | |
| "rewards/reward_len/mean": 0.9065755605697632, | |
| "rewards/reward_len/std": 0.252506285905838, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00704, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7866696119308472, | |
| "kl": 0.008756128139793873, | |
| "learning_rate": 4.999938098096655e-06, | |
| "loss": 0.0, | |
| "num_tokens": 4321184.0, | |
| "reward": 0.8844401240348816, | |
| "reward_std": 0.1446431577205658, | |
| "rewards/reward_len/mean": 0.8844401240348816, | |
| "rewards/reward_len/std": 0.2464737743139267, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00736, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.8422706723213196, | |
| "kl": 0.009292546659708023, | |
| "learning_rate": 4.999932062337498e-06, | |
| "loss": 0.0, | |
| "num_tokens": 4517488.0, | |
| "reward": 0.9410807490348816, | |
| "reward_std": 0.12082913517951965, | |
| "rewards/reward_len/mean": 0.9410807490348816, | |
| "rewards/reward_len/std": 0.16378451883792877, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00768, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7936801314353943, | |
| "kl": 0.009337708353996277, | |
| "learning_rate": 4.999925745850559e-06, | |
| "loss": 0.0, | |
| "num_tokens": 4713888.0, | |
| "reward": 0.7981771230697632, | |
| "reward_std": 0.1319822520017624, | |
| "rewards/reward_len/mean": 0.7981771230697632, | |
| "rewards/reward_len/std": 0.29971110820770264, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.008, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.8625982403755188, | |
| "kl": 0.009664739482104778, | |
| "learning_rate": 4.999919148636547e-06, | |
| "loss": 0.0, | |
| "num_tokens": 4910480.0, | |
| "reward": 0.7828776240348816, | |
| "reward_std": 0.15538470447063446, | |
| "rewards/reward_len/mean": 0.7828776240348816, | |
| "rewards/reward_len/std": 0.3985660672187805, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00832, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.7866320013999939, | |
| "kl": 0.009115578606724739, | |
| "learning_rate": 4.999912270696202e-06, | |
| "loss": 0.0, | |
| "num_tokens": 5107040.0, | |
| "reward": 0.837890625, | |
| "reward_std": 0.19406211376190186, | |
| "rewards/reward_len/mean": 0.837890625, | |
| "rewards/reward_len/std": 0.28484001755714417, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00864, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.7605833411216736, | |
| "kl": 0.010722242295742035, | |
| "learning_rate": 4.999905112030298e-06, | |
| "loss": 0.0, | |
| "num_tokens": 5303472.0, | |
| "reward": 0.8326823115348816, | |
| "reward_std": 0.16619378328323364, | |
| "rewards/reward_len/mean": 0.8326823115348816, | |
| "rewards/reward_len/std": 0.3326784670352936, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00896, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8247683048248291, | |
| "kl": 0.010293405503034592, | |
| "learning_rate": 4.999897672639636e-06, | |
| "loss": 0.0, | |
| "num_tokens": 5499968.0, | |
| "reward": 0.7662760019302368, | |
| "reward_std": 0.13208766281604767, | |
| "rewards/reward_len/mean": 0.7662760615348816, | |
| "rewards/reward_len/std": 0.3626876771450043, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00928, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.9354327917098999, | |
| "kl": 0.010291656479239464, | |
| "learning_rate": 4.9998899525250556e-06, | |
| "loss": 0.0, | |
| "num_tokens": 5696496.0, | |
| "reward": 0.6142578125, | |
| "reward_std": 0.2659006714820862, | |
| "rewards/reward_len/mean": 0.6142578125, | |
| "rewards/reward_len/std": 0.5169708728790283, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0096, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7617059946060181, | |
| "kl": 0.011677569709718227, | |
| "learning_rate": 4.99988195168742e-06, | |
| "loss": 0.0, | |
| "num_tokens": 5892672.0, | |
| "reward": 0.818359375, | |
| "reward_std": 0.16987799108028412, | |
| "rewards/reward_len/mean": 0.818359375, | |
| "rewards/reward_len/std": 0.2849595248699188, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.00992, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6369988322257996, | |
| "kl": 0.011780554428696632, | |
| "learning_rate": 4.99987367012763e-06, | |
| "loss": 0.0, | |
| "num_tokens": 6089264.0, | |
| "reward": 0.8307291865348816, | |
| "reward_std": 0.15030531585216522, | |
| "rewards/reward_len/mean": 0.8307291865348816, | |
| "rewards/reward_len/std": 0.3026638329029083, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01024, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.5698184370994568, | |
| "kl": 0.011647619307041168, | |
| "learning_rate": 4.9998651078466144e-06, | |
| "loss": 0.0, | |
| "num_tokens": 6285792.0, | |
| "reward": 0.8603515625, | |
| "reward_std": 0.06890285015106201, | |
| "rewards/reward_len/mean": 0.8603515625, | |
| "rewards/reward_len/std": 0.3024090826511383, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01056, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.7035416960716248, | |
| "kl": 0.01283220387995243, | |
| "learning_rate": 4.999856264845334e-06, | |
| "loss": 0.0, | |
| "num_tokens": 6482272.0, | |
| "reward": 0.8284505605697632, | |
| "reward_std": 0.0647423267364502, | |
| "rewards/reward_len/mean": 0.8284505605697632, | |
| "rewards/reward_len/std": 0.2645607888698578, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01088, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7955151200294495, | |
| "kl": 0.013212130405008793, | |
| "learning_rate": 4.999847141124784e-06, | |
| "loss": 0.0, | |
| "num_tokens": 6678928.0, | |
| "reward": 0.9065755605697632, | |
| "reward_std": 0.15104928612709045, | |
| "rewards/reward_len/mean": 0.9065755605697632, | |
| "rewards/reward_len/std": 0.2394418716430664, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0112, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7888926267623901, | |
| "kl": 0.01455118041485548, | |
| "learning_rate": 4.999837736685987e-06, | |
| "loss": 0.0, | |
| "num_tokens": 6875680.0, | |
| "reward": 0.8001302480697632, | |
| "reward_std": 0.18624618649482727, | |
| "rewards/reward_len/mean": 0.8001302480697632, | |
| "rewards/reward_len/std": 0.3228481709957123, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01152, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.7999125719070435, | |
| "kl": 0.016429290175437927, | |
| "learning_rate": 4.9998280515300006e-06, | |
| "loss": 0.0, | |
| "num_tokens": 7071792.0, | |
| "reward": 0.7805989980697632, | |
| "reward_std": 0.10644528269767761, | |
| "rewards/reward_len/mean": 0.7805989980697632, | |
| "rewards/reward_len/std": 0.3641582727432251, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01184, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.8360075950622559, | |
| "kl": 0.01572563126683235, | |
| "learning_rate": 4.999818085657911e-06, | |
| "loss": 0.0, | |
| "num_tokens": 7268400.0, | |
| "reward": 0.8115234375, | |
| "reward_std": 0.19910216331481934, | |
| "rewards/reward_len/mean": 0.8115234375, | |
| "rewards/reward_len/std": 0.3251221179962158, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01216, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8303148746490479, | |
| "kl": 0.015369746834039688, | |
| "learning_rate": 4.9998078390708375e-06, | |
| "loss": 0.0, | |
| "num_tokens": 7464832.0, | |
| "reward": 0.7236328125, | |
| "reward_std": 0.20254287123680115, | |
| "rewards/reward_len/mean": 0.7236328125, | |
| "rewards/reward_len/std": 0.49883776903152466, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01248, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8379618525505066, | |
| "kl": 0.015162697061896324, | |
| "learning_rate": 4.999797311769932e-06, | |
| "loss": 0.0, | |
| "num_tokens": 7661264.0, | |
| "reward": 0.8512369990348816, | |
| "reward_std": 0.1815636157989502, | |
| "rewards/reward_len/mean": 0.8512369990348816, | |
| "rewards/reward_len/std": 0.30639442801475525, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0128, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.9366811513900757, | |
| "kl": 0.01498054526746273, | |
| "learning_rate": 4.999786503756376e-06, | |
| "loss": 0.0, | |
| "num_tokens": 7857696.0, | |
| "reward": 0.8359375, | |
| "reward_std": 0.14262336492538452, | |
| "rewards/reward_len/mean": 0.8359375, | |
| "rewards/reward_len/std": 0.29101234674453735, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01312, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.8755704164505005, | |
| "kl": 0.017075425013899803, | |
| "learning_rate": 4.999775415031381e-06, | |
| "loss": 0.0, | |
| "num_tokens": 8054128.0, | |
| "reward": 0.8531901240348816, | |
| "reward_std": 0.19629715383052826, | |
| "rewards/reward_len/mean": 0.8531901240348816, | |
| "rewards/reward_len/std": 0.29085859656333923, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01344, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7478809356689453, | |
| "kl": 0.016752440482378006, | |
| "learning_rate": 4.999764045596195e-06, | |
| "loss": 0.0, | |
| "num_tokens": 8250448.0, | |
| "reward": 0.8297526240348816, | |
| "reward_std": 0.10785592347383499, | |
| "rewards/reward_len/mean": 0.8297526240348816, | |
| "rewards/reward_len/std": 0.3374955654144287, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01376, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.79532790184021, | |
| "kl": 0.01613219454884529, | |
| "learning_rate": 4.999752395452095e-06, | |
| "loss": 0.0, | |
| "num_tokens": 8446864.0, | |
| "reward": 0.798828125, | |
| "reward_std": 0.15989980101585388, | |
| "rewards/reward_len/mean": 0.798828125, | |
| "rewards/reward_len/std": 0.32186606526374817, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01408, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.757718563079834, | |
| "kl": 0.016810301691293716, | |
| "learning_rate": 4.999740464600386e-06, | |
| "loss": 0.0, | |
| "num_tokens": 8643536.0, | |
| "reward": 0.8186849355697632, | |
| "reward_std": 0.14407595992088318, | |
| "rewards/reward_len/mean": 0.8186849355697632, | |
| "rewards/reward_len/std": 0.3191888630390167, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0144, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6717040538787842, | |
| "kl": 0.019070003181695938, | |
| "learning_rate": 4.9997282530424114e-06, | |
| "loss": 0.0, | |
| "num_tokens": 8839776.0, | |
| "reward": 0.8984375, | |
| "reward_std": 0.15830302238464355, | |
| "rewards/reward_len/mean": 0.8984375, | |
| "rewards/reward_len/std": 0.24285823106765747, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01472, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.8255589008331299, | |
| "kl": 0.01899189129471779, | |
| "learning_rate": 4.999715760779541e-06, | |
| "loss": 0.0, | |
| "num_tokens": 9036368.0, | |
| "reward": 0.7731119990348816, | |
| "reward_std": 0.15324297547340393, | |
| "rewards/reward_len/mean": 0.7731119394302368, | |
| "rewards/reward_len/std": 0.3341211676597595, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01504, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.9899515509605408, | |
| "kl": 0.018817655742168427, | |
| "learning_rate": 4.9997029878131776e-06, | |
| "loss": 0.0, | |
| "num_tokens": 9232672.0, | |
| "reward": 0.7311198115348816, | |
| "reward_std": 0.198727086186409, | |
| "rewards/reward_len/mean": 0.7311197519302368, | |
| "rewards/reward_len/std": 0.39068886637687683, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01536, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7901699542999268, | |
| "kl": 0.017967861145734787, | |
| "learning_rate": 4.999689934144754e-06, | |
| "loss": 0.0, | |
| "num_tokens": 9429104.0, | |
| "reward": 0.8727213740348816, | |
| "reward_std": 0.14133989810943604, | |
| "rewards/reward_len/mean": 0.8727213144302368, | |
| "rewards/reward_len/std": 0.27691739797592163, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01568, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8071075677871704, | |
| "kl": 0.01948385313153267, | |
| "learning_rate": 4.99967659977574e-06, | |
| "loss": 0.0, | |
| "num_tokens": 9625744.0, | |
| "reward": 0.8323568105697632, | |
| "reward_std": 0.1779971718788147, | |
| "rewards/reward_len/mean": 0.8323567509651184, | |
| "rewards/reward_len/std": 0.301727294921875, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.016, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.7960187792778015, | |
| "kl": 0.019299190491437912, | |
| "learning_rate": 4.999662984707629e-06, | |
| "loss": 0.0, | |
| "num_tokens": 9822208.0, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.17278119921684265, | |
| "rewards/reward_len/mean": 0.7916666865348816, | |
| "rewards/reward_len/std": 0.3409265875816345, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01632, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.815869152545929, | |
| "kl": 0.01802164316177368, | |
| "learning_rate": 4.999649088941951e-06, | |
| "loss": 0.0, | |
| "num_tokens": 10018832.0, | |
| "reward": 0.7574869990348816, | |
| "reward_std": 0.16599968075752258, | |
| "rewards/reward_len/mean": 0.7574869990348816, | |
| "rewards/reward_len/std": 0.3090384006500244, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01664, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.824347734451294, | |
| "kl": 0.020238451659679413, | |
| "learning_rate": 4.999634912480268e-06, | |
| "loss": 0.0, | |
| "num_tokens": 10215008.0, | |
| "reward": 0.8421224355697632, | |
| "reward_std": 0.14578115940093994, | |
| "rewards/reward_len/mean": 0.8421224355697632, | |
| "rewards/reward_len/std": 0.27820974588394165, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01696, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7971190810203552, | |
| "kl": 0.021430665627121925, | |
| "learning_rate": 4.99962045532417e-06, | |
| "loss": 0.0, | |
| "num_tokens": 10411328.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1278911530971527, | |
| "rewards/reward_len/mean": 0.90625, | |
| "rewards/reward_len/std": 0.22140371799468994, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01728, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7196866273880005, | |
| "kl": 0.02385888248682022, | |
| "learning_rate": 4.999605717475281e-06, | |
| "loss": 0.0, | |
| "num_tokens": 10607744.0, | |
| "reward": 0.7164713144302368, | |
| "reward_std": 0.1332741230726242, | |
| "rewards/reward_len/mean": 0.7164713740348816, | |
| "rewards/reward_len/std": 0.4236203730106354, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0176, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.694108247756958, | |
| "kl": 0.019158754497766495, | |
| "learning_rate": 4.999590698935257e-06, | |
| "loss": 0.0, | |
| "num_tokens": 10804432.0, | |
| "reward": 0.83984375, | |
| "reward_std": 0.07713833451271057, | |
| "rewards/reward_len/mean": 0.83984375, | |
| "rewards/reward_len/std": 0.3475888669490814, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01792, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.7138301730155945, | |
| "kl": 0.021222909912467003, | |
| "learning_rate": 4.999575399705782e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11000960.0, | |
| "reward": 0.8362630605697632, | |
| "reward_std": 0.13972680270671844, | |
| "rewards/reward_len/mean": 0.8362630605697632, | |
| "rewards/reward_len/std": 0.3137500584125519, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01824, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.5983142852783203, | |
| "kl": 0.023208746686577797, | |
| "learning_rate": 4.999559819788578e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11197456.0, | |
| "reward": 0.8756510615348816, | |
| "reward_std": 0.09809666872024536, | |
| "rewards/reward_len/mean": 0.8756510615348816, | |
| "rewards/reward_len/std": 0.27435725927352905, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01856, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7307888865470886, | |
| "kl": 0.021232981234788895, | |
| "learning_rate": 4.999543959185391e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11394064.0, | |
| "reward": 0.8444010615348816, | |
| "reward_std": 0.11457288265228271, | |
| "rewards/reward_len/mean": 0.8444010019302368, | |
| "rewards/reward_len/std": 0.28051677346229553, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01888, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.813261866569519, | |
| "kl": 0.02231256291270256, | |
| "learning_rate": 4.999527817898004e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11590608.0, | |
| "reward": 0.853515625, | |
| "reward_std": 0.17535921931266785, | |
| "rewards/reward_len/mean": 0.853515625, | |
| "rewards/reward_len/std": 0.4461461901664734, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0192, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7521728873252869, | |
| "kl": 0.022967170923948288, | |
| "learning_rate": 4.999511395928228e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11787168.0, | |
| "reward": 0.701171875, | |
| "reward_std": 0.15171362459659576, | |
| "rewards/reward_len/mean": 0.701171875, | |
| "rewards/reward_len/std": 0.4368668794631958, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01952, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8079439997673035, | |
| "kl": 0.023756055161356926, | |
| "learning_rate": 4.9994946932779076e-06, | |
| "loss": 0.0, | |
| "num_tokens": 11983168.0, | |
| "reward": 0.9052734375, | |
| "reward_std": 0.11953707039356232, | |
| "rewards/reward_len/mean": 0.9052734375, | |
| "rewards/reward_len/std": 0.23069322109222412, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.01984, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6609861850738525, | |
| "kl": 0.02364611253142357, | |
| "learning_rate": 4.99947770994892e-06, | |
| "loss": 0.0, | |
| "num_tokens": 12179648.0, | |
| "reward": 0.912109375, | |
| "reward_std": 0.124283567070961, | |
| "rewards/reward_len/mean": 0.912109375, | |
| "rewards/reward_len/std": 0.2015216052532196, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02016, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6409368515014648, | |
| "kl": 0.02544046938419342, | |
| "learning_rate": 4.999460445943169e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 12375952.0, | |
| "reward": 0.7766927480697632, | |
| "reward_std": 0.15531742572784424, | |
| "rewards/reward_len/mean": 0.7766927480697632, | |
| "rewards/reward_len/std": 0.3684055209159851, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02048, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6801385283470154, | |
| "kl": 0.02245226688683033, | |
| "learning_rate": 4.999442901262598e-06, | |
| "loss": 0.0, | |
| "num_tokens": 12572432.0, | |
| "reward": 0.9143880605697632, | |
| "reward_std": 0.09244164079427719, | |
| "rewards/reward_len/mean": 0.9143880605697632, | |
| "rewards/reward_len/std": 0.18908163905143738, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0208, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.688310980796814, | |
| "kl": 0.025234345346689224, | |
| "learning_rate": 4.9994250759091725e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 12768864.0, | |
| "reward": 0.8714193105697632, | |
| "reward_std": 0.11069254577159882, | |
| "rewards/reward_len/mean": 0.8714193105697632, | |
| "rewards/reward_len/std": 0.2919362485408783, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02112, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.6891856789588928, | |
| "kl": 0.023964237421751022, | |
| "learning_rate": 4.999406969884897e-06, | |
| "loss": 0.0, | |
| "num_tokens": 12965328.0, | |
| "reward": 0.8665364980697632, | |
| "reward_std": 0.1188942939043045, | |
| "rewards/reward_len/mean": 0.8665364980697632, | |
| "rewards/reward_len/std": 0.26203906536102295, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02144, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 0.5637570023536682, | |
| "kl": 0.026914432644844055, | |
| "learning_rate": 4.9993885831918035e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 13161712.0, | |
| "reward": 0.8720703125, | |
| "reward_std": 0.09208361804485321, | |
| "rewards/reward_len/mean": 0.8720703125, | |
| "rewards/reward_len/std": 0.2989489436149597, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02176, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.5956972241401672, | |
| "kl": 0.026400156319141388, | |
| "learning_rate": 4.999369915831958e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 13358048.0, | |
| "reward": 0.7919921875, | |
| "reward_std": 0.08180129528045654, | |
| "rewards/reward_len/mean": 0.7919921875, | |
| "rewards/reward_len/std": 0.3576885163784027, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02208, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.5297041535377502, | |
| "kl": 0.02976173348724842, | |
| "learning_rate": 4.999350967807455e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 13554560.0, | |
| "reward": 0.9059244990348816, | |
| "reward_std": 0.08187607675790787, | |
| "rewards/reward_len/mean": 0.9059244990348816, | |
| "rewards/reward_len/std": 0.20148861408233643, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0224, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.5297337174415588, | |
| "kl": 0.029380029067397118, | |
| "learning_rate": 4.999331739120423e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 13750992.0, | |
| "reward": 0.8766276240348816, | |
| "reward_std": 0.08794420212507248, | |
| "rewards/reward_len/mean": 0.8766276240348816, | |
| "rewards/reward_len/std": 0.24631834030151367, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02272, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 0.5374413728713989, | |
| "kl": 0.03178559988737106, | |
| "learning_rate": 4.999312229773022e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 13947232.0, | |
| "reward": 0.96484375, | |
| "reward_std": 0.07383135706186295, | |
| "rewards/reward_len/mean": 0.96484375, | |
| "rewards/reward_len/std": 0.17232529819011688, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02304, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.6321916580200195, | |
| "kl": 0.02617986872792244, | |
| "learning_rate": 4.9992924397674414e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 14143712.0, | |
| "reward": 0.9111328125, | |
| "reward_std": 0.08529947698116302, | |
| "rewards/reward_len/mean": 0.9111328125, | |
| "rewards/reward_len/std": 0.2390221208333969, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02336, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.8528509140014648, | |
| "kl": 0.029244577512145042, | |
| "learning_rate": 4.999272369105904e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 14340208.0, | |
| "reward": 0.8619791865348816, | |
| "reward_std": 0.1358361542224884, | |
| "rewards/reward_len/mean": 0.8619791269302368, | |
| "rewards/reward_len/std": 0.3006777763366699, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02368, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7068591117858887, | |
| "kl": 0.038017794489860535, | |
| "learning_rate": 4.999252017790665e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 14536384.0, | |
| "reward": 0.8388671875, | |
| "reward_std": 0.126865953207016, | |
| "rewards/reward_len/mean": 0.8388671875, | |
| "rewards/reward_len/std": 0.2714577317237854, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.024, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.7906740307807922, | |
| "kl": 0.035286713391542435, | |
| "learning_rate": 4.999231385824008e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 14732848.0, | |
| "reward": 0.8375651240348816, | |
| "reward_std": 0.20768919587135315, | |
| "rewards/reward_len/mean": 0.8375651240348816, | |
| "rewards/reward_len/std": 0.327578604221344, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02432, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7323654890060425, | |
| "kl": 0.029418617486953735, | |
| "learning_rate": 4.99921047320825e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 14929392.0, | |
| "reward": 0.8792318105697632, | |
| "reward_std": 0.109856978058815, | |
| "rewards/reward_len/mean": 0.8792318105697632, | |
| "rewards/reward_len/std": 0.28465378284454346, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02464, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7487607598304749, | |
| "kl": 0.02889881655573845, | |
| "learning_rate": 4.999189279945741e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 15125728.0, | |
| "reward": 0.8938802480697632, | |
| "reward_std": 0.10382238030433655, | |
| "rewards/reward_len/mean": 0.8938801884651184, | |
| "rewards/reward_len/std": 0.26159700751304626, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02496, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7281383275985718, | |
| "kl": 0.03049377351999283, | |
| "learning_rate": 4.999167806038858e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 15322032.0, | |
| "reward": 0.8343099355697632, | |
| "reward_std": 0.14393801987171173, | |
| "rewards/reward_len/mean": 0.8343099355697632, | |
| "rewards/reward_len/std": 0.3518146574497223, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02528, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.6423137784004211, | |
| "kl": 0.03136272728443146, | |
| "learning_rate": 4.999146051490016e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 15518320.0, | |
| "reward": 0.9407552480697632, | |
| "reward_std": 0.0995391458272934, | |
| "rewards/reward_len/mean": 0.9407551884651184, | |
| "rewards/reward_len/std": 0.24368993937969208, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0256, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8017213940620422, | |
| "kl": 0.03368952497839928, | |
| "learning_rate": 4.999124016301654e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 15714848.0, | |
| "reward": 0.857421875, | |
| "reward_std": 0.14494842290878296, | |
| "rewards/reward_len/mean": 0.857421875, | |
| "rewards/reward_len/std": 0.286716103553772, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02592, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8540568947792053, | |
| "kl": 0.03586439788341522, | |
| "learning_rate": 4.99910170047625e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 15911296.0, | |
| "reward": 0.8414713740348816, | |
| "reward_std": 0.18263337016105652, | |
| "rewards/reward_len/mean": 0.8414713740348816, | |
| "rewards/reward_len/std": 0.3021599352359772, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02624, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8202486038208008, | |
| "kl": 0.04007921367883682, | |
| "learning_rate": 4.999079104016308e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 16107648.0, | |
| "reward": 0.8688151240348816, | |
| "reward_std": 0.1747693419456482, | |
| "rewards/reward_len/mean": 0.8688150644302368, | |
| "rewards/reward_len/std": 0.2946781814098358, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02656, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.697175920009613, | |
| "kl": 0.03438268229365349, | |
| "learning_rate": 4.999056226924366e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 16304064.0, | |
| "reward": 0.9052734375, | |
| "reward_std": 0.10430242866277695, | |
| "rewards/reward_len/mean": 0.9052734375, | |
| "rewards/reward_len/std": 0.24809814989566803, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02688, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.6200621724128723, | |
| "kl": 0.030285635963082314, | |
| "learning_rate": 4.999033069202992e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 16500336.0, | |
| "reward": 0.9410807490348816, | |
| "reward_std": 0.08145523071289062, | |
| "rewards/reward_len/mean": 0.9410807490348816, | |
| "rewards/reward_len/std": 0.16886034607887268, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0272, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6632947325706482, | |
| "kl": 0.03737508878111839, | |
| "learning_rate": 4.999009630854787e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 16696880.0, | |
| "reward": 0.8681640625, | |
| "reward_std": 0.09984727203845978, | |
| "rewards/reward_len/mean": 0.8681640625, | |
| "rewards/reward_len/std": 0.3139059841632843, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02752, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 0.5139328837394714, | |
| "kl": 0.033338725566864014, | |
| "learning_rate": 4.998985911882383e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 16893424.0, | |
| "reward": 0.9231771230697632, | |
| "reward_std": 0.064088836312294, | |
| "rewards/reward_len/mean": 0.9231771230697632, | |
| "rewards/reward_len/std": 0.20548295974731445, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02784, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.7163829803466797, | |
| "kl": 0.04579077661037445, | |
| "learning_rate": 4.998961912288445e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17089792.0, | |
| "reward": 0.9013671875, | |
| "reward_std": 0.14256593585014343, | |
| "rewards/reward_len/mean": 0.9013671875, | |
| "rewards/reward_len/std": 0.2612590491771698, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02816, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.6266374588012695, | |
| "kl": 0.03986838087439537, | |
| "learning_rate": 4.998937632075667e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17285984.0, | |
| "reward": 0.9016927480697632, | |
| "reward_std": 0.08280540257692337, | |
| "rewards/reward_len/mean": 0.9016926884651184, | |
| "rewards/reward_len/std": 0.24128234386444092, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02848, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.5966895222663879, | |
| "kl": 0.033690571784973145, | |
| "learning_rate": 4.998913071246774e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17482432.0, | |
| "reward": 0.9534505605697632, | |
| "reward_std": 0.08552976697683334, | |
| "rewards/reward_len/mean": 0.9534505009651184, | |
| "rewards/reward_len/std": 0.1531372368335724, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0288, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6456487774848938, | |
| "kl": 0.06617207825183868, | |
| "learning_rate": 4.998888229804526e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17678768.0, | |
| "reward": 0.8577474355697632, | |
| "reward_std": 0.10766549408435822, | |
| "rewards/reward_len/mean": 0.8577474355697632, | |
| "rewards/reward_len/std": 0.3121115267276764, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02912, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.6451205015182495, | |
| "kl": 0.03554663062095642, | |
| "learning_rate": 4.998863107751711e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17875056.0, | |
| "reward": 0.8850911855697632, | |
| "reward_std": 0.09045256674289703, | |
| "rewards/reward_len/mean": 0.8850911855697632, | |
| "rewards/reward_len/std": 0.2813292443752289, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02944, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6420301795005798, | |
| "kl": 0.03609941899776459, | |
| "learning_rate": 4.998837705091152e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 18071504.0, | |
| "reward": 0.9020182490348816, | |
| "reward_std": 0.12863239645957947, | |
| "rewards/reward_len/mean": 0.9020181894302368, | |
| "rewards/reward_len/std": 0.2481461763381958, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.02976, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.744374692440033, | |
| "kl": 0.03676271066069603, | |
| "learning_rate": 4.9988120218257e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 18267712.0, | |
| "reward": 0.8391927480697632, | |
| "reward_std": 0.13910341262817383, | |
| "rewards/reward_len/mean": 0.8391926884651184, | |
| "rewards/reward_len/std": 0.33368995785713196, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.03008, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6814514398574829, | |
| "kl": 0.04908795654773712, | |
| "learning_rate": 4.99878605795824e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 18464272.0, | |
| "reward": 0.8059896230697632, | |
| "reward_std": 0.10761559009552002, | |
| "rewards/reward_len/mean": 0.8059896230697632, | |
| "rewards/reward_len/std": 0.3403044641017914, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.0304, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6609547138214111, | |
| "kl": 0.04120853543281555, | |
| "learning_rate": 4.998759813491687e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 18660592.0, | |
| "reward": 0.8746744990348816, | |
| "reward_std": 0.12070801854133606, | |
| "rewards/reward_len/mean": 0.8746744990348816, | |
| "rewards/reward_len/std": 0.3313665986061096, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.03072, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6958045959472656, | |
| "kl": 0.051205702126026154, | |
| "learning_rate": 4.998733288428987e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 18857088.0, | |
| "reward": 0.8590494990348816, | |
| "reward_std": 0.12743408977985382, | |
| "rewards/reward_len/mean": 0.8590494990348816, | |
| "rewards/reward_len/std": 0.2551549971103668, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.03104, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.5829638838768005, | |
| "kl": 0.03792741894721985, | |
| "learning_rate": 4.998706482773121e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 19053536.0, | |
| "reward": 0.8645833730697632, | |
| "reward_std": 0.08903007954359055, | |
| "rewards/reward_len/mean": 0.8645833134651184, | |
| "rewards/reward_len/std": 0.27937448024749756, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.03136, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.6745487451553345, | |
| "kl": 0.04386503994464874, | |
| "learning_rate": 4.998679396527099e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 19250192.0, | |
| "reward": 0.8658854365348816, | |
| "reward_std": 0.11358411610126495, | |
| "rewards/reward_len/mean": 0.8658853769302368, | |
| "rewards/reward_len/std": 0.21724016964435577, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.03168, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.8316110968589783, | |
| "kl": 0.04867399483919144, | |
| "learning_rate": 4.99865202969396e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 19446624.0, | |
| "reward": 0.8932291865348816, | |
| "reward_std": 0.11117450892925262, | |
| "rewards/reward_len/mean": 0.8932291865348816, | |
| "rewards/reward_len/std": 0.20754754543304443, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 729.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 729.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 729.0, | |
| "completions/min_terminated_length": 0.0, | |
| "epoch": 0.032, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.9266502857208252, | |
| "kl": 0.04411531612277031, | |
| "learning_rate": 4.9986243822767795e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 19643296.0, | |
| "reward": 0.8235677480697632, | |
| "reward_std": 0.17513221502304077, | |
| "rewards/reward_len/mean": 0.8235677480697632, | |
| "rewards/reward_len/std": 0.28480416536331177, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 9375, | |
| "num_input_tokens_seen": 19643296, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |