jiuhai's picture
Add files using upload-large-folder tool
0f3fb16 verified
raw
history blame
77.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.032,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00032,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8027206659317017,
"kl": 1.71183273778297e-05,
"learning_rate": 5e-06,
"loss": 0.0,
"num_tokens": 196560.0,
"reward": 0.7936198115348816,
"reward_std": 0.17962533235549927,
"rewards/reward_len/mean": 0.7936197519302368,
"rewards/reward_len/std": 0.3258915841579437,
"step": 1
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00064,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.7602783441543579,
"kl": 0.00532680656760931,
"learning_rate": 4.999999859632295e-06,
"loss": 0.0,
"num_tokens": 393040.0,
"reward": 0.7620443105697632,
"reward_std": 0.08783292770385742,
"rewards/reward_len/mean": 0.7620443105697632,
"rewards/reward_len/std": 0.3672178089618683,
"step": 2
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00096,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7249953150749207,
"kl": 0.005223212763667107,
"learning_rate": 4.9999994385291934e-06,
"loss": 0.0,
"num_tokens": 589344.0,
"reward": 0.8811849355697632,
"reward_std": 0.13462764024734497,
"rewards/reward_len/mean": 0.8811849355697632,
"rewards/reward_len/std": 0.2868101894855499,
"step": 3
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00128,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.0998049974441528,
"kl": 0.005302521400153637,
"learning_rate": 4.9999987366907436e-06,
"loss": 0.0,
"num_tokens": 785824.0,
"reward": 0.837890625,
"reward_std": 0.17104606330394745,
"rewards/reward_len/mean": 0.837890625,
"rewards/reward_len/std": 0.274716854095459,
"step": 4
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0016,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8081106543540955,
"kl": 0.005471091717481613,
"learning_rate": 4.999997754117024e-06,
"loss": 0.0,
"num_tokens": 982400.0,
"reward": 0.7578125,
"reward_std": 0.15783792734146118,
"rewards/reward_len/mean": 0.7578125,
"rewards/reward_len/std": 0.3556174635887146,
"step": 5
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00192,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7931028008460999,
"kl": 0.005383472889661789,
"learning_rate": 4.999996490808146e-06,
"loss": 0.0,
"num_tokens": 1178736.0,
"reward": 0.8411458730697632,
"reward_std": 0.15851536393165588,
"rewards/reward_len/mean": 0.8411458730697632,
"rewards/reward_len/std": 0.2852468192577362,
"step": 6
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00224,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.855617880821228,
"kl": 0.005938877817243338,
"learning_rate": 4.9999949467642495e-06,
"loss": 0.0,
"num_tokens": 1375184.0,
"reward": 0.7444661855697632,
"reward_std": 0.18254899978637695,
"rewards/reward_len/mean": 0.7444661855697632,
"rewards/reward_len/std": 0.5324408411979675,
"step": 7
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00256,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8006792068481445,
"kl": 0.006116841919720173,
"learning_rate": 4.999993121985509e-06,
"loss": 0.0,
"num_tokens": 1571472.0,
"reward": 0.755859375,
"reward_std": 0.14840292930603027,
"rewards/reward_len/mean": 0.755859375,
"rewards/reward_len/std": 0.41854918003082275,
"step": 8
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00288,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.5651379823684692,
"kl": 0.006282718852162361,
"learning_rate": 4.99999101647213e-06,
"loss": 0.0,
"num_tokens": 1767696.0,
"reward": 0.8597005605697632,
"reward_std": 0.09105785191059113,
"rewards/reward_len/mean": 0.8597005605697632,
"rewards/reward_len/std": 0.3063139021396637,
"step": 9
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0032,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7746672630310059,
"kl": 0.006277492269873619,
"learning_rate": 4.9999886302243486e-06,
"loss": 0.0,
"num_tokens": 1964352.0,
"reward": 0.736328125,
"reward_std": 0.16170179843902588,
"rewards/reward_len/mean": 0.736328125,
"rewards/reward_len/std": 0.35940179228782654,
"step": 10
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00352,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8158230185508728,
"kl": 0.0064939698204398155,
"learning_rate": 4.999985963242432e-06,
"loss": 0.0,
"num_tokens": 2160848.0,
"reward": 0.7190755605697632,
"reward_std": 0.14770188927650452,
"rewards/reward_len/mean": 0.7190755605697632,
"rewards/reward_len/std": 0.39762866497039795,
"step": 11
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00384,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.4778946042060852,
"kl": 0.00701224897056818,
"learning_rate": 4.99998301552668e-06,
"loss": 0.0,
"num_tokens": 2357440.0,
"reward": 0.9007161855697632,
"reward_std": 0.08900929242372513,
"rewards/reward_len/mean": 0.9007161855697632,
"rewards/reward_len/std": 0.2015477418899536,
"step": 12
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00416,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7480363845825195,
"kl": 0.006892648059874773,
"learning_rate": 4.999979787077425e-06,
"loss": 0.0,
"num_tokens": 2553776.0,
"reward": 0.736328125,
"reward_std": 0.11627350747585297,
"rewards/reward_len/mean": 0.736328125,
"rewards/reward_len/std": 0.4063463509082794,
"step": 13
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00448,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.6913530826568604,
"kl": 0.007525671273469925,
"learning_rate": 4.9999762778950265e-06,
"loss": 0.0,
"num_tokens": 2750176.0,
"reward": 0.8733724355697632,
"reward_std": 0.13304749131202698,
"rewards/reward_len/mean": 0.8733724355697632,
"rewards/reward_len/std": 0.260817289352417,
"step": 14
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0048,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.9006426334381104,
"kl": 0.007139429450035095,
"learning_rate": 4.999972487979882e-06,
"loss": 0.0,
"num_tokens": 2946432.0,
"reward": 0.7353515625,
"reward_std": 0.18793734908103943,
"rewards/reward_len/mean": 0.7353515625,
"rewards/reward_len/std": 0.4250172972679138,
"step": 15
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00512,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7369216084480286,
"kl": 0.007592486217617989,
"learning_rate": 4.999968417332415e-06,
"loss": 0.0,
"num_tokens": 3142960.0,
"reward": 0.8525390625,
"reward_std": 0.13952995836734772,
"rewards/reward_len/mean": 0.8525390625,
"rewards/reward_len/std": 0.2791503369808197,
"step": 16
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00544,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7958468198776245,
"kl": 0.007764819078147411,
"learning_rate": 4.999964065953083e-06,
"loss": 0.0,
"num_tokens": 3339504.0,
"reward": 0.7796224355697632,
"reward_std": 0.13519705832004547,
"rewards/reward_len/mean": 0.7796224355697632,
"rewards/reward_len/std": 0.2986285090446472,
"step": 17
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00576,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8955850005149841,
"kl": 0.008127257227897644,
"learning_rate": 4.999959433842374e-06,
"loss": 0.0,
"num_tokens": 3536064.0,
"reward": 0.6917318105697632,
"reward_std": 0.22921673953533173,
"rewards/reward_len/mean": 0.6917317509651184,
"rewards/reward_len/std": 0.3818399906158447,
"step": 18
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00608,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.7979457974433899,
"kl": 0.007617972791194916,
"learning_rate": 4.999954521000811e-06,
"loss": 0.0,
"num_tokens": 3732256.0,
"reward": 0.78515625,
"reward_std": 0.1195596233010292,
"rewards/reward_len/mean": 0.78515625,
"rewards/reward_len/std": 0.32990217208862305,
"step": 19
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0064,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7890306115150452,
"kl": 0.007777344435453415,
"learning_rate": 4.999949327428941e-06,
"loss": 0.0,
"num_tokens": 3928640.0,
"reward": 0.7919921875,
"reward_std": 0.17304885387420654,
"rewards/reward_len/mean": 0.7919921875,
"rewards/reward_len/std": 0.3526277244091034,
"step": 20
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00672,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7711103558540344,
"kl": 0.008527114056050777,
"learning_rate": 4.999943853127351e-06,
"loss": 0.0,
"num_tokens": 4124784.0,
"reward": 0.9065755605697632,
"reward_std": 0.09724702686071396,
"rewards/reward_len/mean": 0.9065755605697632,
"rewards/reward_len/std": 0.252506285905838,
"step": 21
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00704,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7866696119308472,
"kl": 0.008756128139793873,
"learning_rate": 4.999938098096655e-06,
"loss": 0.0,
"num_tokens": 4321184.0,
"reward": 0.8844401240348816,
"reward_std": 0.1446431577205658,
"rewards/reward_len/mean": 0.8844401240348816,
"rewards/reward_len/std": 0.2464737743139267,
"step": 22
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00736,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.8422706723213196,
"kl": 0.009292546659708023,
"learning_rate": 4.999932062337498e-06,
"loss": 0.0,
"num_tokens": 4517488.0,
"reward": 0.9410807490348816,
"reward_std": 0.12082913517951965,
"rewards/reward_len/mean": 0.9410807490348816,
"rewards/reward_len/std": 0.16378451883792877,
"step": 23
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00768,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7936801314353943,
"kl": 0.009337708353996277,
"learning_rate": 4.999925745850559e-06,
"loss": 0.0,
"num_tokens": 4713888.0,
"reward": 0.7981771230697632,
"reward_std": 0.1319822520017624,
"rewards/reward_len/mean": 0.7981771230697632,
"rewards/reward_len/std": 0.29971110820770264,
"step": 24
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.8625982403755188,
"kl": 0.009664739482104778,
"learning_rate": 4.999919148636547e-06,
"loss": 0.0,
"num_tokens": 4910480.0,
"reward": 0.7828776240348816,
"reward_std": 0.15538470447063446,
"rewards/reward_len/mean": 0.7828776240348816,
"rewards/reward_len/std": 0.3985660672187805,
"step": 25
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00832,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.7866320013999939,
"kl": 0.009115578606724739,
"learning_rate": 4.999912270696202e-06,
"loss": 0.0,
"num_tokens": 5107040.0,
"reward": 0.837890625,
"reward_std": 0.19406211376190186,
"rewards/reward_len/mean": 0.837890625,
"rewards/reward_len/std": 0.28484001755714417,
"step": 26
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00864,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7605833411216736,
"kl": 0.010722242295742035,
"learning_rate": 4.999905112030298e-06,
"loss": 0.0,
"num_tokens": 5303472.0,
"reward": 0.8326823115348816,
"reward_std": 0.16619378328323364,
"rewards/reward_len/mean": 0.8326823115348816,
"rewards/reward_len/std": 0.3326784670352936,
"step": 27
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00896,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8247683048248291,
"kl": 0.010293405503034592,
"learning_rate": 4.999897672639636e-06,
"loss": 0.0,
"num_tokens": 5499968.0,
"reward": 0.7662760019302368,
"reward_std": 0.13208766281604767,
"rewards/reward_len/mean": 0.7662760615348816,
"rewards/reward_len/std": 0.3626876771450043,
"step": 28
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00928,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.9354327917098999,
"kl": 0.010291656479239464,
"learning_rate": 4.9998899525250556e-06,
"loss": 0.0,
"num_tokens": 5696496.0,
"reward": 0.6142578125,
"reward_std": 0.2659006714820862,
"rewards/reward_len/mean": 0.6142578125,
"rewards/reward_len/std": 0.5169708728790283,
"step": 29
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0096,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7617059946060181,
"kl": 0.011677569709718227,
"learning_rate": 4.99988195168742e-06,
"loss": 0.0,
"num_tokens": 5892672.0,
"reward": 0.818359375,
"reward_std": 0.16987799108028412,
"rewards/reward_len/mean": 0.818359375,
"rewards/reward_len/std": 0.2849595248699188,
"step": 30
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.00992,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6369988322257996,
"kl": 0.011780554428696632,
"learning_rate": 4.99987367012763e-06,
"loss": 0.0,
"num_tokens": 6089264.0,
"reward": 0.8307291865348816,
"reward_std": 0.15030531585216522,
"rewards/reward_len/mean": 0.8307291865348816,
"rewards/reward_len/std": 0.3026638329029083,
"step": 31
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01024,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.5698184370994568,
"kl": 0.011647619307041168,
"learning_rate": 4.9998651078466144e-06,
"loss": 0.0,
"num_tokens": 6285792.0,
"reward": 0.8603515625,
"reward_std": 0.06890285015106201,
"rewards/reward_len/mean": 0.8603515625,
"rewards/reward_len/std": 0.3024090826511383,
"step": 32
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01056,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.7035416960716248,
"kl": 0.01283220387995243,
"learning_rate": 4.999856264845334e-06,
"loss": 0.0,
"num_tokens": 6482272.0,
"reward": 0.8284505605697632,
"reward_std": 0.0647423267364502,
"rewards/reward_len/mean": 0.8284505605697632,
"rewards/reward_len/std": 0.2645607888698578,
"step": 33
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01088,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7955151200294495,
"kl": 0.013212130405008793,
"learning_rate": 4.999847141124784e-06,
"loss": 0.0,
"num_tokens": 6678928.0,
"reward": 0.9065755605697632,
"reward_std": 0.15104928612709045,
"rewards/reward_len/mean": 0.9065755605697632,
"rewards/reward_len/std": 0.2394418716430664,
"step": 34
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0112,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7888926267623901,
"kl": 0.01455118041485548,
"learning_rate": 4.999837736685987e-06,
"loss": 0.0,
"num_tokens": 6875680.0,
"reward": 0.8001302480697632,
"reward_std": 0.18624618649482727,
"rewards/reward_len/mean": 0.8001302480697632,
"rewards/reward_len/std": 0.3228481709957123,
"step": 35
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01152,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.7999125719070435,
"kl": 0.016429290175437927,
"learning_rate": 4.9998280515300006e-06,
"loss": 0.0,
"num_tokens": 7071792.0,
"reward": 0.7805989980697632,
"reward_std": 0.10644528269767761,
"rewards/reward_len/mean": 0.7805989980697632,
"rewards/reward_len/std": 0.3641582727432251,
"step": 36
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01184,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.8360075950622559,
"kl": 0.01572563126683235,
"learning_rate": 4.999818085657911e-06,
"loss": 0.0,
"num_tokens": 7268400.0,
"reward": 0.8115234375,
"reward_std": 0.19910216331481934,
"rewards/reward_len/mean": 0.8115234375,
"rewards/reward_len/std": 0.3251221179962158,
"step": 37
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01216,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8303148746490479,
"kl": 0.015369746834039688,
"learning_rate": 4.9998078390708375e-06,
"loss": 0.0,
"num_tokens": 7464832.0,
"reward": 0.7236328125,
"reward_std": 0.20254287123680115,
"rewards/reward_len/mean": 0.7236328125,
"rewards/reward_len/std": 0.49883776903152466,
"step": 38
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01248,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8379618525505066,
"kl": 0.015162697061896324,
"learning_rate": 4.999797311769932e-06,
"loss": 0.0,
"num_tokens": 7661264.0,
"reward": 0.8512369990348816,
"reward_std": 0.1815636157989502,
"rewards/reward_len/mean": 0.8512369990348816,
"rewards/reward_len/std": 0.30639442801475525,
"step": 39
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0128,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.9366811513900757,
"kl": 0.01498054526746273,
"learning_rate": 4.999786503756376e-06,
"loss": 0.0,
"num_tokens": 7857696.0,
"reward": 0.8359375,
"reward_std": 0.14262336492538452,
"rewards/reward_len/mean": 0.8359375,
"rewards/reward_len/std": 0.29101234674453735,
"step": 40
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01312,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.8755704164505005,
"kl": 0.017075425013899803,
"learning_rate": 4.999775415031381e-06,
"loss": 0.0,
"num_tokens": 8054128.0,
"reward": 0.8531901240348816,
"reward_std": 0.19629715383052826,
"rewards/reward_len/mean": 0.8531901240348816,
"rewards/reward_len/std": 0.29085859656333923,
"step": 41
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01344,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7478809356689453,
"kl": 0.016752440482378006,
"learning_rate": 4.999764045596195e-06,
"loss": 0.0,
"num_tokens": 8250448.0,
"reward": 0.8297526240348816,
"reward_std": 0.10785592347383499,
"rewards/reward_len/mean": 0.8297526240348816,
"rewards/reward_len/std": 0.3374955654144287,
"step": 42
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01376,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.79532790184021,
"kl": 0.01613219454884529,
"learning_rate": 4.999752395452095e-06,
"loss": 0.0,
"num_tokens": 8446864.0,
"reward": 0.798828125,
"reward_std": 0.15989980101585388,
"rewards/reward_len/mean": 0.798828125,
"rewards/reward_len/std": 0.32186606526374817,
"step": 43
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01408,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.757718563079834,
"kl": 0.016810301691293716,
"learning_rate": 4.999740464600386e-06,
"loss": 0.0,
"num_tokens": 8643536.0,
"reward": 0.8186849355697632,
"reward_std": 0.14407595992088318,
"rewards/reward_len/mean": 0.8186849355697632,
"rewards/reward_len/std": 0.3191888630390167,
"step": 44
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0144,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6717040538787842,
"kl": 0.019070003181695938,
"learning_rate": 4.9997282530424114e-06,
"loss": 0.0,
"num_tokens": 8839776.0,
"reward": 0.8984375,
"reward_std": 0.15830302238464355,
"rewards/reward_len/mean": 0.8984375,
"rewards/reward_len/std": 0.24285823106765747,
"step": 45
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01472,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.8255589008331299,
"kl": 0.01899189129471779,
"learning_rate": 4.999715760779541e-06,
"loss": 0.0,
"num_tokens": 9036368.0,
"reward": 0.7731119990348816,
"reward_std": 0.15324297547340393,
"rewards/reward_len/mean": 0.7731119394302368,
"rewards/reward_len/std": 0.3341211676597595,
"step": 46
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01504,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9899515509605408,
"kl": 0.018817655742168427,
"learning_rate": 4.9997029878131776e-06,
"loss": 0.0,
"num_tokens": 9232672.0,
"reward": 0.7311198115348816,
"reward_std": 0.198727086186409,
"rewards/reward_len/mean": 0.7311197519302368,
"rewards/reward_len/std": 0.39068886637687683,
"step": 47
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01536,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7901699542999268,
"kl": 0.017967861145734787,
"learning_rate": 4.999689934144754e-06,
"loss": 0.0,
"num_tokens": 9429104.0,
"reward": 0.8727213740348816,
"reward_std": 0.14133989810943604,
"rewards/reward_len/mean": 0.8727213144302368,
"rewards/reward_len/std": 0.27691739797592163,
"step": 48
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01568,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8071075677871704,
"kl": 0.01948385313153267,
"learning_rate": 4.99967659977574e-06,
"loss": 0.0,
"num_tokens": 9625744.0,
"reward": 0.8323568105697632,
"reward_std": 0.1779971718788147,
"rewards/reward_len/mean": 0.8323567509651184,
"rewards/reward_len/std": 0.301727294921875,
"step": 49
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7960187792778015,
"kl": 0.019299190491437912,
"learning_rate": 4.999662984707629e-06,
"loss": 0.0,
"num_tokens": 9822208.0,
"reward": 0.7916666865348816,
"reward_std": 0.17278119921684265,
"rewards/reward_len/mean": 0.7916666865348816,
"rewards/reward_len/std": 0.3409265875816345,
"step": 50
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01632,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.815869152545929,
"kl": 0.01802164316177368,
"learning_rate": 4.999649088941951e-06,
"loss": 0.0,
"num_tokens": 10018832.0,
"reward": 0.7574869990348816,
"reward_std": 0.16599968075752258,
"rewards/reward_len/mean": 0.7574869990348816,
"rewards/reward_len/std": 0.3090384006500244,
"step": 51
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01664,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.824347734451294,
"kl": 0.020238451659679413,
"learning_rate": 4.999634912480268e-06,
"loss": 0.0,
"num_tokens": 10215008.0,
"reward": 0.8421224355697632,
"reward_std": 0.14578115940093994,
"rewards/reward_len/mean": 0.8421224355697632,
"rewards/reward_len/std": 0.27820974588394165,
"step": 52
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01696,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7971190810203552,
"kl": 0.021430665627121925,
"learning_rate": 4.99962045532417e-06,
"loss": 0.0,
"num_tokens": 10411328.0,
"reward": 0.90625,
"reward_std": 0.1278911530971527,
"rewards/reward_len/mean": 0.90625,
"rewards/reward_len/std": 0.22140371799468994,
"step": 53
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01728,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7196866273880005,
"kl": 0.02385888248682022,
"learning_rate": 4.999605717475281e-06,
"loss": 0.0,
"num_tokens": 10607744.0,
"reward": 0.7164713144302368,
"reward_std": 0.1332741230726242,
"rewards/reward_len/mean": 0.7164713740348816,
"rewards/reward_len/std": 0.4236203730106354,
"step": 54
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0176,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.694108247756958,
"kl": 0.019158754497766495,
"learning_rate": 4.999590698935257e-06,
"loss": 0.0,
"num_tokens": 10804432.0,
"reward": 0.83984375,
"reward_std": 0.07713833451271057,
"rewards/reward_len/mean": 0.83984375,
"rewards/reward_len/std": 0.3475888669490814,
"step": 55
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01792,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.7138301730155945,
"kl": 0.021222909912467003,
"learning_rate": 4.999575399705782e-06,
"loss": 0.0,
"num_tokens": 11000960.0,
"reward": 0.8362630605697632,
"reward_std": 0.13972680270671844,
"rewards/reward_len/mean": 0.8362630605697632,
"rewards/reward_len/std": 0.3137500584125519,
"step": 56
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01824,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.5983142852783203,
"kl": 0.023208746686577797,
"learning_rate": 4.999559819788578e-06,
"loss": 0.0,
"num_tokens": 11197456.0,
"reward": 0.8756510615348816,
"reward_std": 0.09809666872024536,
"rewards/reward_len/mean": 0.8756510615348816,
"rewards/reward_len/std": 0.27435725927352905,
"step": 57
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01856,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7307888865470886,
"kl": 0.021232981234788895,
"learning_rate": 4.999543959185391e-06,
"loss": 0.0,
"num_tokens": 11394064.0,
"reward": 0.8444010615348816,
"reward_std": 0.11457288265228271,
"rewards/reward_len/mean": 0.8444010019302368,
"rewards/reward_len/std": 0.28051677346229553,
"step": 58
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01888,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.813261866569519,
"kl": 0.02231256291270256,
"learning_rate": 4.999527817898004e-06,
"loss": 0.0,
"num_tokens": 11590608.0,
"reward": 0.853515625,
"reward_std": 0.17535921931266785,
"rewards/reward_len/mean": 0.853515625,
"rewards/reward_len/std": 0.4461461901664734,
"step": 59
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0192,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7521728873252869,
"kl": 0.022967170923948288,
"learning_rate": 4.999511395928228e-06,
"loss": 0.0,
"num_tokens": 11787168.0,
"reward": 0.701171875,
"reward_std": 0.15171362459659576,
"rewards/reward_len/mean": 0.701171875,
"rewards/reward_len/std": 0.4368668794631958,
"step": 60
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01952,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8079439997673035,
"kl": 0.023756055161356926,
"learning_rate": 4.9994946932779076e-06,
"loss": 0.0,
"num_tokens": 11983168.0,
"reward": 0.9052734375,
"reward_std": 0.11953707039356232,
"rewards/reward_len/mean": 0.9052734375,
"rewards/reward_len/std": 0.23069322109222412,
"step": 61
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.01984,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6609861850738525,
"kl": 0.02364611253142357,
"learning_rate": 4.99947770994892e-06,
"loss": 0.0,
"num_tokens": 12179648.0,
"reward": 0.912109375,
"reward_std": 0.124283567070961,
"rewards/reward_len/mean": 0.912109375,
"rewards/reward_len/std": 0.2015216052532196,
"step": 62
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02016,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6409368515014648,
"kl": 0.02544046938419342,
"learning_rate": 4.999460445943169e-06,
"loss": 0.0001,
"num_tokens": 12375952.0,
"reward": 0.7766927480697632,
"reward_std": 0.15531742572784424,
"rewards/reward_len/mean": 0.7766927480697632,
"rewards/reward_len/std": 0.3684055209159851,
"step": 63
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02048,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6801385283470154,
"kl": 0.02245226688683033,
"learning_rate": 4.999442901262598e-06,
"loss": 0.0,
"num_tokens": 12572432.0,
"reward": 0.9143880605697632,
"reward_std": 0.09244164079427719,
"rewards/reward_len/mean": 0.9143880605697632,
"rewards/reward_len/std": 0.18908163905143738,
"step": 64
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0208,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.688310980796814,
"kl": 0.025234345346689224,
"learning_rate": 4.9994250759091725e-06,
"loss": 0.0001,
"num_tokens": 12768864.0,
"reward": 0.8714193105697632,
"reward_std": 0.11069254577159882,
"rewards/reward_len/mean": 0.8714193105697632,
"rewards/reward_len/std": 0.2919362485408783,
"step": 65
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02112,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.6891856789588928,
"kl": 0.023964237421751022,
"learning_rate": 4.999406969884897e-06,
"loss": 0.0,
"num_tokens": 12965328.0,
"reward": 0.8665364980697632,
"reward_std": 0.1188942939043045,
"rewards/reward_len/mean": 0.8665364980697632,
"rewards/reward_len/std": 0.26203906536102295,
"step": 66
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02144,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.5637570023536682,
"kl": 0.026914432644844055,
"learning_rate": 4.9993885831918035e-06,
"loss": 0.0001,
"num_tokens": 13161712.0,
"reward": 0.8720703125,
"reward_std": 0.09208361804485321,
"rewards/reward_len/mean": 0.8720703125,
"rewards/reward_len/std": 0.2989489436149597,
"step": 67
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02176,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.5956972241401672,
"kl": 0.026400156319141388,
"learning_rate": 4.999369915831958e-06,
"loss": 0.0001,
"num_tokens": 13358048.0,
"reward": 0.7919921875,
"reward_std": 0.08180129528045654,
"rewards/reward_len/mean": 0.7919921875,
"rewards/reward_len/std": 0.3576885163784027,
"step": 68
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02208,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.5297041535377502,
"kl": 0.02976173348724842,
"learning_rate": 4.999350967807455e-06,
"loss": 0.0001,
"num_tokens": 13554560.0,
"reward": 0.9059244990348816,
"reward_std": 0.08187607675790787,
"rewards/reward_len/mean": 0.9059244990348816,
"rewards/reward_len/std": 0.20148861408233643,
"step": 69
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0224,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.5297337174415588,
"kl": 0.029380029067397118,
"learning_rate": 4.999331739120423e-06,
"loss": 0.0001,
"num_tokens": 13750992.0,
"reward": 0.8766276240348816,
"reward_std": 0.08794420212507248,
"rewards/reward_len/mean": 0.8766276240348816,
"rewards/reward_len/std": 0.24631834030151367,
"step": 70
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02272,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.5374413728713989,
"kl": 0.03178559988737106,
"learning_rate": 4.999312229773022e-06,
"loss": 0.0001,
"num_tokens": 13947232.0,
"reward": 0.96484375,
"reward_std": 0.07383135706186295,
"rewards/reward_len/mean": 0.96484375,
"rewards/reward_len/std": 0.17232529819011688,
"step": 71
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02304,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.6321916580200195,
"kl": 0.02617986872792244,
"learning_rate": 4.9992924397674414e-06,
"loss": 0.0001,
"num_tokens": 14143712.0,
"reward": 0.9111328125,
"reward_std": 0.08529947698116302,
"rewards/reward_len/mean": 0.9111328125,
"rewards/reward_len/std": 0.2390221208333969,
"step": 72
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02336,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.8528509140014648,
"kl": 0.029244577512145042,
"learning_rate": 4.999272369105904e-06,
"loss": 0.0001,
"num_tokens": 14340208.0,
"reward": 0.8619791865348816,
"reward_std": 0.1358361542224884,
"rewards/reward_len/mean": 0.8619791269302368,
"rewards/reward_len/std": 0.3006777763366699,
"step": 73
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02368,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7068591117858887,
"kl": 0.038017794489860535,
"learning_rate": 4.999252017790665e-06,
"loss": 0.0001,
"num_tokens": 14536384.0,
"reward": 0.8388671875,
"reward_std": 0.126865953207016,
"rewards/reward_len/mean": 0.8388671875,
"rewards/reward_len/std": 0.2714577317237854,
"step": 74
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.7906740307807922,
"kl": 0.035286713391542435,
"learning_rate": 4.999231385824008e-06,
"loss": 0.0001,
"num_tokens": 14732848.0,
"reward": 0.8375651240348816,
"reward_std": 0.20768919587135315,
"rewards/reward_len/mean": 0.8375651240348816,
"rewards/reward_len/std": 0.327578604221344,
"step": 75
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02432,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7323654890060425,
"kl": 0.029418617486953735,
"learning_rate": 4.99921047320825e-06,
"loss": 0.0001,
"num_tokens": 14929392.0,
"reward": 0.8792318105697632,
"reward_std": 0.109856978058815,
"rewards/reward_len/mean": 0.8792318105697632,
"rewards/reward_len/std": 0.28465378284454346,
"step": 76
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02464,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7487607598304749,
"kl": 0.02889881655573845,
"learning_rate": 4.999189279945741e-06,
"loss": 0.0001,
"num_tokens": 15125728.0,
"reward": 0.8938802480697632,
"reward_std": 0.10382238030433655,
"rewards/reward_len/mean": 0.8938801884651184,
"rewards/reward_len/std": 0.26159700751304626,
"step": 77
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02496,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7281383275985718,
"kl": 0.03049377351999283,
"learning_rate": 4.999167806038858e-06,
"loss": 0.0001,
"num_tokens": 15322032.0,
"reward": 0.8343099355697632,
"reward_std": 0.14393801987171173,
"rewards/reward_len/mean": 0.8343099355697632,
"rewards/reward_len/std": 0.3518146574497223,
"step": 78
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02528,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.6423137784004211,
"kl": 0.03136272728443146,
"learning_rate": 4.999146051490016e-06,
"loss": 0.0001,
"num_tokens": 15518320.0,
"reward": 0.9407552480697632,
"reward_std": 0.0995391458272934,
"rewards/reward_len/mean": 0.9407551884651184,
"rewards/reward_len/std": 0.24368993937969208,
"step": 79
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0256,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8017213940620422,
"kl": 0.03368952497839928,
"learning_rate": 4.999124016301654e-06,
"loss": 0.0001,
"num_tokens": 15714848.0,
"reward": 0.857421875,
"reward_std": 0.14494842290878296,
"rewards/reward_len/mean": 0.857421875,
"rewards/reward_len/std": 0.286716103553772,
"step": 80
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02592,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8540568947792053,
"kl": 0.03586439788341522,
"learning_rate": 4.99910170047625e-06,
"loss": 0.0001,
"num_tokens": 15911296.0,
"reward": 0.8414713740348816,
"reward_std": 0.18263337016105652,
"rewards/reward_len/mean": 0.8414713740348816,
"rewards/reward_len/std": 0.3021599352359772,
"step": 81
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02624,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8202486038208008,
"kl": 0.04007921367883682,
"learning_rate": 4.999079104016308e-06,
"loss": 0.0001,
"num_tokens": 16107648.0,
"reward": 0.8688151240348816,
"reward_std": 0.1747693419456482,
"rewards/reward_len/mean": 0.8688150644302368,
"rewards/reward_len/std": 0.2946781814098358,
"step": 82
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02656,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.697175920009613,
"kl": 0.03438268229365349,
"learning_rate": 4.999056226924366e-06,
"loss": 0.0001,
"num_tokens": 16304064.0,
"reward": 0.9052734375,
"reward_std": 0.10430242866277695,
"rewards/reward_len/mean": 0.9052734375,
"rewards/reward_len/std": 0.24809814989566803,
"step": 83
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02688,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.6200621724128723,
"kl": 0.030285635963082314,
"learning_rate": 4.999033069202992e-06,
"loss": 0.0001,
"num_tokens": 16500336.0,
"reward": 0.9410807490348816,
"reward_std": 0.08145523071289062,
"rewards/reward_len/mean": 0.9410807490348816,
"rewards/reward_len/std": 0.16886034607887268,
"step": 84
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0272,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6632947325706482,
"kl": 0.03737508878111839,
"learning_rate": 4.999009630854787e-06,
"loss": 0.0001,
"num_tokens": 16696880.0,
"reward": 0.8681640625,
"reward_std": 0.09984727203845978,
"rewards/reward_len/mean": 0.8681640625,
"rewards/reward_len/std": 0.3139059841632843,
"step": 85
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02752,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.5139328837394714,
"kl": 0.033338725566864014,
"learning_rate": 4.998985911882383e-06,
"loss": 0.0001,
"num_tokens": 16893424.0,
"reward": 0.9231771230697632,
"reward_std": 0.064088836312294,
"rewards/reward_len/mean": 0.9231771230697632,
"rewards/reward_len/std": 0.20548295974731445,
"step": 86
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02784,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.7163829803466797,
"kl": 0.04579077661037445,
"learning_rate": 4.998961912288445e-06,
"loss": 0.0001,
"num_tokens": 17089792.0,
"reward": 0.9013671875,
"reward_std": 0.14256593585014343,
"rewards/reward_len/mean": 0.9013671875,
"rewards/reward_len/std": 0.2612590491771698,
"step": 87
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02816,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.6266374588012695,
"kl": 0.03986838087439537,
"learning_rate": 4.998937632075667e-06,
"loss": 0.0001,
"num_tokens": 17285984.0,
"reward": 0.9016927480697632,
"reward_std": 0.08280540257692337,
"rewards/reward_len/mean": 0.9016926884651184,
"rewards/reward_len/std": 0.24128234386444092,
"step": 88
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02848,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.5966895222663879,
"kl": 0.033690571784973145,
"learning_rate": 4.998913071246774e-06,
"loss": 0.0001,
"num_tokens": 17482432.0,
"reward": 0.9534505605697632,
"reward_std": 0.08552976697683334,
"rewards/reward_len/mean": 0.9534505009651184,
"rewards/reward_len/std": 0.1531372368335724,
"step": 89
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0288,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6456487774848938,
"kl": 0.06617207825183868,
"learning_rate": 4.998888229804526e-06,
"loss": 0.0001,
"num_tokens": 17678768.0,
"reward": 0.8577474355697632,
"reward_std": 0.10766549408435822,
"rewards/reward_len/mean": 0.8577474355697632,
"rewards/reward_len/std": 0.3121115267276764,
"step": 90
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02912,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.6451205015182495,
"kl": 0.03554663062095642,
"learning_rate": 4.998863107751711e-06,
"loss": 0.0001,
"num_tokens": 17875056.0,
"reward": 0.8850911855697632,
"reward_std": 0.09045256674289703,
"rewards/reward_len/mean": 0.8850911855697632,
"rewards/reward_len/std": 0.2813292443752289,
"step": 91
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02944,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6420301795005798,
"kl": 0.03609941899776459,
"learning_rate": 4.998837705091152e-06,
"loss": 0.0001,
"num_tokens": 18071504.0,
"reward": 0.9020182490348816,
"reward_std": 0.12863239645957947,
"rewards/reward_len/mean": 0.9020181894302368,
"rewards/reward_len/std": 0.2481461763381958,
"step": 92
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.02976,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.744374692440033,
"kl": 0.03676271066069603,
"learning_rate": 4.9988120218257e-06,
"loss": 0.0001,
"num_tokens": 18267712.0,
"reward": 0.8391927480697632,
"reward_std": 0.13910341262817383,
"rewards/reward_len/mean": 0.8391926884651184,
"rewards/reward_len/std": 0.33368995785713196,
"step": 93
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03008,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6814514398574829,
"kl": 0.04908795654773712,
"learning_rate": 4.99878605795824e-06,
"loss": 0.0001,
"num_tokens": 18464272.0,
"reward": 0.8059896230697632,
"reward_std": 0.10761559009552002,
"rewards/reward_len/mean": 0.8059896230697632,
"rewards/reward_len/std": 0.3403044641017914,
"step": 94
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0304,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6609547138214111,
"kl": 0.04120853543281555,
"learning_rate": 4.998759813491687e-06,
"loss": 0.0001,
"num_tokens": 18660592.0,
"reward": 0.8746744990348816,
"reward_std": 0.12070801854133606,
"rewards/reward_len/mean": 0.8746744990348816,
"rewards/reward_len/std": 0.3313665986061096,
"step": 95
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03072,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6958045959472656,
"kl": 0.051205702126026154,
"learning_rate": 4.998733288428987e-06,
"loss": 0.0001,
"num_tokens": 18857088.0,
"reward": 0.8590494990348816,
"reward_std": 0.12743408977985382,
"rewards/reward_len/mean": 0.8590494990348816,
"rewards/reward_len/std": 0.2551549971103668,
"step": 96
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03104,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.5829638838768005,
"kl": 0.03792741894721985,
"learning_rate": 4.998706482773121e-06,
"loss": 0.0001,
"num_tokens": 19053536.0,
"reward": 0.8645833730697632,
"reward_std": 0.08903007954359055,
"rewards/reward_len/mean": 0.8645833134651184,
"rewards/reward_len/std": 0.27937448024749756,
"step": 97
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03136,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.6745487451553345,
"kl": 0.04386503994464874,
"learning_rate": 4.998679396527099e-06,
"loss": 0.0001,
"num_tokens": 19250192.0,
"reward": 0.8658854365348816,
"reward_std": 0.11358411610126495,
"rewards/reward_len/mean": 0.8658853769302368,
"rewards/reward_len/std": 0.21724016964435577,
"step": 98
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03168,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.8316110968589783,
"kl": 0.04867399483919144,
"learning_rate": 4.99865202969396e-06,
"loss": 0.0001,
"num_tokens": 19446624.0,
"reward": 0.8932291865348816,
"reward_std": 0.11117450892925262,
"rewards/reward_len/mean": 0.8932291865348816,
"rewards/reward_len/std": 0.20754754543304443,
"step": 99
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 729.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 729.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9266502857208252,
"kl": 0.04411531612277031,
"learning_rate": 4.9986243822767795e-06,
"loss": 0.0001,
"num_tokens": 19643296.0,
"reward": 0.8235677480697632,
"reward_std": 0.17513221502304077,
"rewards/reward_len/mean": 0.8235677480697632,
"rewards/reward_len/std": 0.28480416536331177,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 9375,
"num_input_tokens_seen": 19643296,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}