{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.032, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00032, "frac_reward_zero_std": 0.25, "grad_norm": 0.8027206659317017, "kl": 1.71183273778297e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 196560.0, "reward": 0.7936198115348816, "reward_std": 0.17962533235549927, "rewards/reward_len/mean": 0.7936197519302368, "rewards/reward_len/std": 0.3258915841579437, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00064, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7602783441543579, "kl": 0.00532680656760931, "learning_rate": 4.999999859632295e-06, "loss": 0.0, "num_tokens": 393040.0, "reward": 0.7620443105697632, "reward_std": 0.08783292770385742, "rewards/reward_len/mean": 0.7620443105697632, "rewards/reward_len/std": 0.3672178089618683, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00096, "frac_reward_zero_std": 0.5, "grad_norm": 0.7249953150749207, "kl": 0.005223212763667107, "learning_rate": 4.9999994385291934e-06, "loss": 0.0, "num_tokens": 589344.0, "reward": 0.8811849355697632, "reward_std": 0.13462764024734497, "rewards/reward_len/mean": 0.8811849355697632, "rewards/reward_len/std": 0.2868101894855499, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00128, "frac_reward_zero_std": 0.25, "grad_norm": 1.0998049974441528, "kl": 0.005302521400153637, "learning_rate": 4.9999987366907436e-06, "loss": 0.0, "num_tokens": 785824.0, "reward": 0.837890625, "reward_std": 0.17104606330394745, "rewards/reward_len/mean": 0.837890625, "rewards/reward_len/std": 0.274716854095459, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0016, "frac_reward_zero_std": 0.375, "grad_norm": 0.8081106543540955, "kl": 0.005471091717481613, "learning_rate": 4.999997754117024e-06, "loss": 0.0, "num_tokens": 982400.0, "reward": 0.7578125, "reward_std": 0.15783792734146118, "rewards/reward_len/mean": 0.7578125, "rewards/reward_len/std": 0.3556174635887146, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00192, "frac_reward_zero_std": 0.25, "grad_norm": 0.7931028008460999, "kl": 0.005383472889661789, "learning_rate": 4.999996490808146e-06, "loss": 0.0, "num_tokens": 1178736.0, "reward": 0.8411458730697632, "reward_std": 0.15851536393165588, "rewards/reward_len/mean": 0.8411458730697632, "rewards/reward_len/std": 0.2852468192577362, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00224, "frac_reward_zero_std": 0.375, "grad_norm": 0.855617880821228, "kl": 0.005938877817243338, "learning_rate": 4.9999949467642495e-06, "loss": 0.0, "num_tokens": 1375184.0, "reward": 0.7444661855697632, "reward_std": 0.18254899978637695, "rewards/reward_len/mean": 0.7444661855697632, "rewards/reward_len/std": 0.5324408411979675, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00256, "frac_reward_zero_std": 0.375, "grad_norm": 0.8006792068481445, "kl": 0.006116841919720173, "learning_rate": 4.999993121985509e-06, "loss": 0.0, "num_tokens": 1571472.0, "reward": 0.755859375, "reward_std": 0.14840292930603027, "rewards/reward_len/mean": 0.755859375, "rewards/reward_len/std": 0.41854918003082275, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00288, "frac_reward_zero_std": 0.625, "grad_norm": 0.5651379823684692, "kl": 0.006282718852162361, "learning_rate": 4.99999101647213e-06, "loss": 0.0, "num_tokens": 1767696.0, "reward": 0.8597005605697632, "reward_std": 0.09105785191059113, "rewards/reward_len/mean": 0.8597005605697632, "rewards/reward_len/std": 0.3063139021396637, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0032, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7746672630310059, "kl": 0.006277492269873619, "learning_rate": 4.9999886302243486e-06, "loss": 0.0, "num_tokens": 1964352.0, "reward": 0.736328125, "reward_std": 0.16170179843902588, "rewards/reward_len/mean": 0.736328125, "rewards/reward_len/std": 0.35940179228782654, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00352, "frac_reward_zero_std": 0.25, "grad_norm": 0.8158230185508728, "kl": 0.0064939698204398155, "learning_rate": 4.999985963242432e-06, "loss": 0.0, "num_tokens": 2160848.0, "reward": 0.7190755605697632, "reward_std": 0.14770188927650452, "rewards/reward_len/mean": 0.7190755605697632, "rewards/reward_len/std": 0.39762866497039795, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00384, "frac_reward_zero_std": 0.625, "grad_norm": 0.4778946042060852, "kl": 0.00701224897056818, "learning_rate": 4.99998301552668e-06, "loss": 0.0, "num_tokens": 2357440.0, "reward": 0.9007161855697632, "reward_std": 0.08900929242372513, "rewards/reward_len/mean": 0.9007161855697632, "rewards/reward_len/std": 0.2015477418899536, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00416, "frac_reward_zero_std": 0.5, "grad_norm": 0.7480363845825195, "kl": 0.006892648059874773, "learning_rate": 4.999979787077425e-06, "loss": 0.0, "num_tokens": 2553776.0, "reward": 0.736328125, "reward_std": 0.11627350747585297, "rewards/reward_len/mean": 0.736328125, "rewards/reward_len/std": 0.4063463509082794, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00448, "frac_reward_zero_std": 0.375, "grad_norm": 0.6913530826568604, "kl": 0.007525671273469925, "learning_rate": 4.9999762778950265e-06, "loss": 0.0, "num_tokens": 2750176.0, "reward": 0.8733724355697632, "reward_std": 0.13304749131202698, "rewards/reward_len/mean": 0.8733724355697632, "rewards/reward_len/std": 0.260817289352417, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0048, "frac_reward_zero_std": 0.3125, "grad_norm": 0.9006426334381104, "kl": 0.007139429450035095, "learning_rate": 4.999972487979882e-06, "loss": 0.0, "num_tokens": 2946432.0, "reward": 0.7353515625, "reward_std": 0.18793734908103943, "rewards/reward_len/mean": 0.7353515625, "rewards/reward_len/std": 0.4250172972679138, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00512, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7369216084480286, "kl": 0.007592486217617989, "learning_rate": 4.999968417332415e-06, "loss": 0.0, "num_tokens": 3142960.0, "reward": 0.8525390625, "reward_std": 0.13952995836734772, "rewards/reward_len/mean": 0.8525390625, "rewards/reward_len/std": 0.2791503369808197, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00544, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7958468198776245, "kl": 0.007764819078147411, "learning_rate": 4.999964065953083e-06, "loss": 0.0, "num_tokens": 3339504.0, "reward": 0.7796224355697632, "reward_std": 0.13519705832004547, "rewards/reward_len/mean": 0.7796224355697632, "rewards/reward_len/std": 0.2986285090446472, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00576, "frac_reward_zero_std": 0.0, "grad_norm": 0.8955850005149841, "kl": 0.008127257227897644, "learning_rate": 4.999959433842374e-06, "loss": 0.0, "num_tokens": 3536064.0, "reward": 0.6917318105697632, "reward_std": 0.22921673953533173, "rewards/reward_len/mean": 0.6917317509651184, "rewards/reward_len/std": 0.3818399906158447, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00608, "frac_reward_zero_std": 0.375, "grad_norm": 0.7979457974433899, "kl": 0.007617972791194916, "learning_rate": 4.999954521000811e-06, "loss": 0.0, "num_tokens": 3732256.0, "reward": 0.78515625, "reward_std": 0.1195596233010292, "rewards/reward_len/mean": 0.78515625, "rewards/reward_len/std": 0.32990217208862305, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0064, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7890306115150452, "kl": 0.007777344435453415, "learning_rate": 4.999949327428941e-06, "loss": 0.0, "num_tokens": 3928640.0, "reward": 0.7919921875, "reward_std": 0.17304885387420654, "rewards/reward_len/mean": 0.7919921875, "rewards/reward_len/std": 0.3526277244091034, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00672, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7711103558540344, "kl": 0.008527114056050777, "learning_rate": 4.999943853127351e-06, "loss": 0.0, "num_tokens": 4124784.0, "reward": 0.9065755605697632, "reward_std": 0.09724702686071396, "rewards/reward_len/mean": 0.9065755605697632, "rewards/reward_len/std": 0.252506285905838, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00704, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7866696119308472, "kl": 0.008756128139793873, "learning_rate": 4.999938098096655e-06, "loss": 0.0, "num_tokens": 4321184.0, "reward": 0.8844401240348816, "reward_std": 0.1446431577205658, "rewards/reward_len/mean": 0.8844401240348816, "rewards/reward_len/std": 0.2464737743139267, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00736, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8422706723213196, "kl": 0.009292546659708023, "learning_rate": 4.999932062337498e-06, "loss": 0.0, "num_tokens": 4517488.0, "reward": 0.9410807490348816, "reward_std": 0.12082913517951965, "rewards/reward_len/mean": 0.9410807490348816, "rewards/reward_len/std": 0.16378451883792877, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00768, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7936801314353943, "kl": 0.009337708353996277, "learning_rate": 4.999925745850559e-06, "loss": 0.0, "num_tokens": 4713888.0, "reward": 0.7981771230697632, "reward_std": 0.1319822520017624, "rewards/reward_len/mean": 0.7981771230697632, "rewards/reward_len/std": 0.29971110820770264, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.008, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8625982403755188, "kl": 0.009664739482104778, "learning_rate": 4.999919148636547e-06, "loss": 0.0, "num_tokens": 4910480.0, "reward": 0.7828776240348816, "reward_std": 0.15538470447063446, "rewards/reward_len/mean": 0.7828776240348816, "rewards/reward_len/std": 0.3985660672187805, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00832, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7866320013999939, "kl": 0.009115578606724739, "learning_rate": 4.999912270696202e-06, "loss": 0.0, "num_tokens": 5107040.0, "reward": 0.837890625, "reward_std": 0.19406211376190186, "rewards/reward_len/mean": 0.837890625, "rewards/reward_len/std": 0.28484001755714417, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00864, "frac_reward_zero_std": 0.25, "grad_norm": 0.7605833411216736, "kl": 0.010722242295742035, "learning_rate": 4.999905112030298e-06, "loss": 0.0, "num_tokens": 5303472.0, "reward": 0.8326823115348816, "reward_std": 0.16619378328323364, "rewards/reward_len/mean": 0.8326823115348816, "rewards/reward_len/std": 0.3326784670352936, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00896, "frac_reward_zero_std": 0.375, "grad_norm": 0.8247683048248291, "kl": 0.010293405503034592, "learning_rate": 4.999897672639636e-06, "loss": 0.0, "num_tokens": 5499968.0, "reward": 0.7662760019302368, "reward_std": 0.13208766281604767, "rewards/reward_len/mean": 0.7662760615348816, "rewards/reward_len/std": 0.3626876771450043, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00928, "frac_reward_zero_std": 0.1875, "grad_norm": 0.9354327917098999, "kl": 0.010291656479239464, "learning_rate": 4.9998899525250556e-06, "loss": 0.0, "num_tokens": 5696496.0, "reward": 0.6142578125, "reward_std": 0.2659006714820862, "rewards/reward_len/mean": 0.6142578125, "rewards/reward_len/std": 0.5169708728790283, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0096, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7617059946060181, "kl": 0.011677569709718227, "learning_rate": 4.99988195168742e-06, "loss": 0.0, "num_tokens": 5892672.0, "reward": 0.818359375, "reward_std": 0.16987799108028412, "rewards/reward_len/mean": 0.818359375, "rewards/reward_len/std": 0.2849595248699188, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00992, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6369988322257996, "kl": 0.011780554428696632, "learning_rate": 4.99987367012763e-06, "loss": 0.0, "num_tokens": 6089264.0, "reward": 0.8307291865348816, "reward_std": 0.15030531585216522, "rewards/reward_len/mean": 0.8307291865348816, "rewards/reward_len/std": 0.3026638329029083, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01024, "frac_reward_zero_std": 0.625, "grad_norm": 0.5698184370994568, "kl": 0.011647619307041168, "learning_rate": 4.9998651078466144e-06, "loss": 0.0, "num_tokens": 6285792.0, "reward": 0.8603515625, "reward_std": 0.06890285015106201, "rewards/reward_len/mean": 0.8603515625, "rewards/reward_len/std": 0.3024090826511383, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01056, "frac_reward_zero_std": 0.625, "grad_norm": 0.7035416960716248, "kl": 0.01283220387995243, "learning_rate": 4.999856264845334e-06, "loss": 0.0, "num_tokens": 6482272.0, "reward": 0.8284505605697632, "reward_std": 0.0647423267364502, "rewards/reward_len/mean": 0.8284505605697632, "rewards/reward_len/std": 0.2645607888698578, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01088, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7955151200294495, "kl": 0.013212130405008793, "learning_rate": 4.999847141124784e-06, "loss": 0.0, "num_tokens": 6678928.0, "reward": 0.9065755605697632, "reward_std": 0.15104928612709045, "rewards/reward_len/mean": 0.9065755605697632, "rewards/reward_len/std": 0.2394418716430664, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0112, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7888926267623901, "kl": 0.01455118041485548, "learning_rate": 4.999837736685987e-06, "loss": 0.0, "num_tokens": 6875680.0, "reward": 0.8001302480697632, "reward_std": 0.18624618649482727, "rewards/reward_len/mean": 0.8001302480697632, "rewards/reward_len/std": 0.3228481709957123, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01152, "frac_reward_zero_std": 0.375, "grad_norm": 0.7999125719070435, "kl": 0.016429290175437927, "learning_rate": 4.9998280515300006e-06, "loss": 0.0, "num_tokens": 7071792.0, "reward": 0.7805989980697632, "reward_std": 0.10644528269767761, "rewards/reward_len/mean": 0.7805989980697632, "rewards/reward_len/std": 0.3641582727432251, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01184, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8360075950622559, "kl": 0.01572563126683235, "learning_rate": 4.999818085657911e-06, "loss": 0.0, "num_tokens": 7268400.0, "reward": 0.8115234375, "reward_std": 0.19910216331481934, "rewards/reward_len/mean": 0.8115234375, "rewards/reward_len/std": 0.3251221179962158, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01216, "frac_reward_zero_std": 0.25, "grad_norm": 0.8303148746490479, "kl": 0.015369746834039688, "learning_rate": 4.9998078390708375e-06, "loss": 0.0, "num_tokens": 7464832.0, "reward": 0.7236328125, "reward_std": 0.20254287123680115, "rewards/reward_len/mean": 0.7236328125, "rewards/reward_len/std": 0.49883776903152466, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01248, "frac_reward_zero_std": 0.25, "grad_norm": 0.8379618525505066, "kl": 0.015162697061896324, "learning_rate": 4.999797311769932e-06, "loss": 0.0, "num_tokens": 7661264.0, "reward": 0.8512369990348816, "reward_std": 0.1815636157989502, "rewards/reward_len/mean": 0.8512369990348816, "rewards/reward_len/std": 0.30639442801475525, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0128, "frac_reward_zero_std": 0.25, "grad_norm": 0.9366811513900757, "kl": 0.01498054526746273, "learning_rate": 4.999786503756376e-06, "loss": 0.0, "num_tokens": 7857696.0, "reward": 0.8359375, "reward_std": 0.14262336492538452, "rewards/reward_len/mean": 0.8359375, "rewards/reward_len/std": 0.29101234674453735, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01312, "frac_reward_zero_std": 0.125, "grad_norm": 0.8755704164505005, "kl": 0.017075425013899803, "learning_rate": 4.999775415031381e-06, "loss": 0.0, "num_tokens": 8054128.0, "reward": 0.8531901240348816, "reward_std": 0.19629715383052826, "rewards/reward_len/mean": 0.8531901240348816, "rewards/reward_len/std": 0.29085859656333923, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01344, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7478809356689453, "kl": 0.016752440482378006, "learning_rate": 4.999764045596195e-06, "loss": 0.0, "num_tokens": 8250448.0, "reward": 0.8297526240348816, "reward_std": 0.10785592347383499, "rewards/reward_len/mean": 0.8297526240348816, "rewards/reward_len/std": 0.3374955654144287, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01376, "frac_reward_zero_std": 0.375, "grad_norm": 0.79532790184021, "kl": 0.01613219454884529, "learning_rate": 4.999752395452095e-06, "loss": 0.0, "num_tokens": 8446864.0, "reward": 0.798828125, "reward_std": 0.15989980101585388, "rewards/reward_len/mean": 0.798828125, "rewards/reward_len/std": 0.32186606526374817, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01408, "frac_reward_zero_std": 0.4375, "grad_norm": 0.757718563079834, "kl": 0.016810301691293716, "learning_rate": 4.999740464600386e-06, "loss": 0.0, "num_tokens": 8643536.0, "reward": 0.8186849355697632, "reward_std": 0.14407595992088318, "rewards/reward_len/mean": 0.8186849355697632, "rewards/reward_len/std": 0.3191888630390167, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0144, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6717040538787842, "kl": 0.019070003181695938, "learning_rate": 4.9997282530424114e-06, "loss": 0.0, "num_tokens": 8839776.0, "reward": 0.8984375, "reward_std": 0.15830302238464355, "rewards/reward_len/mean": 0.8984375, "rewards/reward_len/std": 0.24285823106765747, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01472, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8255589008331299, "kl": 0.01899189129471779, "learning_rate": 4.999715760779541e-06, "loss": 0.0, "num_tokens": 9036368.0, "reward": 0.7731119990348816, "reward_std": 0.15324297547340393, "rewards/reward_len/mean": 0.7731119394302368, "rewards/reward_len/std": 0.3341211676597595, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01504, "frac_reward_zero_std": 0.125, "grad_norm": 0.9899515509605408, "kl": 0.018817655742168427, "learning_rate": 4.9997029878131776e-06, "loss": 0.0, "num_tokens": 9232672.0, "reward": 0.7311198115348816, "reward_std": 0.198727086186409, "rewards/reward_len/mean": 0.7311197519302368, "rewards/reward_len/std": 0.39068886637687683, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01536, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7901699542999268, "kl": 0.017967861145734787, "learning_rate": 4.999689934144754e-06, "loss": 0.0, "num_tokens": 9429104.0, "reward": 0.8727213740348816, "reward_std": 0.14133989810943604, "rewards/reward_len/mean": 0.8727213144302368, "rewards/reward_len/std": 0.27691739797592163, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01568, "frac_reward_zero_std": 0.25, "grad_norm": 0.8071075677871704, "kl": 0.01948385313153267, "learning_rate": 4.99967659977574e-06, "loss": 0.0, "num_tokens": 9625744.0, "reward": 0.8323568105697632, "reward_std": 0.1779971718788147, "rewards/reward_len/mean": 0.8323567509651184, "rewards/reward_len/std": 0.301727294921875, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.016, "frac_reward_zero_std": 0.25, "grad_norm": 0.7960187792778015, "kl": 0.019299190491437912, "learning_rate": 4.999662984707629e-06, "loss": 0.0, "num_tokens": 9822208.0, "reward": 0.7916666865348816, "reward_std": 0.17278119921684265, "rewards/reward_len/mean": 0.7916666865348816, "rewards/reward_len/std": 0.3409265875816345, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01632, "frac_reward_zero_std": 0.3125, "grad_norm": 0.815869152545929, "kl": 0.01802164316177368, "learning_rate": 4.999649088941951e-06, "loss": 0.0, "num_tokens": 10018832.0, "reward": 0.7574869990348816, "reward_std": 0.16599968075752258, "rewards/reward_len/mean": 0.7574869990348816, "rewards/reward_len/std": 0.3090384006500244, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01664, "frac_reward_zero_std": 0.25, "grad_norm": 0.824347734451294, "kl": 0.020238451659679413, "learning_rate": 4.999634912480268e-06, "loss": 0.0, "num_tokens": 10215008.0, "reward": 0.8421224355697632, "reward_std": 0.14578115940093994, "rewards/reward_len/mean": 0.8421224355697632, "rewards/reward_len/std": 0.27820974588394165, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01696, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7971190810203552, "kl": 0.021430665627121925, "learning_rate": 4.99962045532417e-06, "loss": 0.0, "num_tokens": 10411328.0, "reward": 0.90625, "reward_std": 0.1278911530971527, "rewards/reward_len/mean": 0.90625, "rewards/reward_len/std": 0.22140371799468994, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01728, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7196866273880005, "kl": 0.02385888248682022, "learning_rate": 4.999605717475281e-06, "loss": 0.0, "num_tokens": 10607744.0, "reward": 0.7164713144302368, "reward_std": 0.1332741230726242, "rewards/reward_len/mean": 0.7164713740348816, "rewards/reward_len/std": 0.4236203730106354, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0176, "frac_reward_zero_std": 0.5, "grad_norm": 0.694108247756958, "kl": 0.019158754497766495, "learning_rate": 4.999590698935257e-06, "loss": 0.0, "num_tokens": 10804432.0, "reward": 0.83984375, "reward_std": 0.07713833451271057, "rewards/reward_len/mean": 0.83984375, "rewards/reward_len/std": 0.3475888669490814, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01792, "frac_reward_zero_std": 0.375, "grad_norm": 0.7138301730155945, "kl": 0.021222909912467003, "learning_rate": 4.999575399705782e-06, "loss": 0.0, "num_tokens": 11000960.0, "reward": 0.8362630605697632, "reward_std": 0.13972680270671844, "rewards/reward_len/mean": 0.8362630605697632, "rewards/reward_len/std": 0.3137500584125519, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01824, "frac_reward_zero_std": 0.5, "grad_norm": 0.5983142852783203, "kl": 0.023208746686577797, "learning_rate": 4.999559819788578e-06, "loss": 0.0, "num_tokens": 11197456.0, "reward": 0.8756510615348816, "reward_std": 0.09809666872024536, "rewards/reward_len/mean": 0.8756510615348816, "rewards/reward_len/std": 0.27435725927352905, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7307888865470886, "kl": 0.021232981234788895, "learning_rate": 4.999543959185391e-06, "loss": 0.0, "num_tokens": 11394064.0, "reward": 0.8444010615348816, "reward_std": 0.11457288265228271, "rewards/reward_len/mean": 0.8444010019302368, "rewards/reward_len/std": 0.28051677346229553, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01888, "frac_reward_zero_std": 0.375, "grad_norm": 0.813261866569519, "kl": 0.02231256291270256, "learning_rate": 4.999527817898004e-06, "loss": 0.0, "num_tokens": 11590608.0, "reward": 0.853515625, "reward_std": 0.17535921931266785, "rewards/reward_len/mean": 0.853515625, "rewards/reward_len/std": 0.4461461901664734, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0192, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7521728873252869, "kl": 0.022967170923948288, "learning_rate": 4.999511395928228e-06, "loss": 0.0, "num_tokens": 11787168.0, "reward": 0.701171875, "reward_std": 0.15171362459659576, "rewards/reward_len/mean": 0.701171875, "rewards/reward_len/std": 0.4368668794631958, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01952, "frac_reward_zero_std": 0.375, "grad_norm": 0.8079439997673035, "kl": 0.023756055161356926, "learning_rate": 4.9994946932779076e-06, "loss": 0.0, "num_tokens": 11983168.0, "reward": 0.9052734375, "reward_std": 0.11953707039356232, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.23069322109222412, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01984, "frac_reward_zero_std": 0.5, "grad_norm": 0.6609861850738525, "kl": 0.02364611253142357, "learning_rate": 4.99947770994892e-06, "loss": 0.0, "num_tokens": 12179648.0, "reward": 0.912109375, "reward_std": 0.124283567070961, "rewards/reward_len/mean": 0.912109375, "rewards/reward_len/std": 0.2015216052532196, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02016, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6409368515014648, "kl": 0.02544046938419342, "learning_rate": 4.999460445943169e-06, "loss": 0.0001, "num_tokens": 12375952.0, "reward": 0.7766927480697632, "reward_std": 0.15531742572784424, "rewards/reward_len/mean": 0.7766927480697632, "rewards/reward_len/std": 0.3684055209159851, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02048, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6801385283470154, "kl": 0.02245226688683033, "learning_rate": 4.999442901262598e-06, "loss": 0.0, "num_tokens": 12572432.0, "reward": 0.9143880605697632, "reward_std": 0.09244164079427719, "rewards/reward_len/mean": 0.9143880605697632, "rewards/reward_len/std": 0.18908163905143738, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0208, "frac_reward_zero_std": 0.5, "grad_norm": 0.688310980796814, "kl": 0.025234345346689224, "learning_rate": 4.9994250759091725e-06, "loss": 0.0001, "num_tokens": 12768864.0, "reward": 0.8714193105697632, "reward_std": 0.11069254577159882, "rewards/reward_len/mean": 0.8714193105697632, "rewards/reward_len/std": 0.2919362485408783, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02112, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6891856789588928, "kl": 0.023964237421751022, "learning_rate": 4.999406969884897e-06, "loss": 0.0, "num_tokens": 12965328.0, "reward": 0.8665364980697632, "reward_std": 0.1188942939043045, "rewards/reward_len/mean": 0.8665364980697632, "rewards/reward_len/std": 0.26203906536102295, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02144, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5637570023536682, "kl": 0.026914432644844055, "learning_rate": 4.9993885831918035e-06, "loss": 0.0001, "num_tokens": 13161712.0, "reward": 0.8720703125, "reward_std": 0.09208361804485321, "rewards/reward_len/mean": 0.8720703125, "rewards/reward_len/std": 0.2989489436149597, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02176, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5956972241401672, "kl": 0.026400156319141388, "learning_rate": 4.999369915831958e-06, "loss": 0.0001, "num_tokens": 13358048.0, "reward": 0.7919921875, "reward_std": 0.08180129528045654, "rewards/reward_len/mean": 0.7919921875, "rewards/reward_len/std": 0.3576885163784027, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02208, "frac_reward_zero_std": 0.625, "grad_norm": 0.5297041535377502, "kl": 0.02976173348724842, "learning_rate": 4.999350967807455e-06, "loss": 0.0001, "num_tokens": 13554560.0, "reward": 0.9059244990348816, "reward_std": 0.08187607675790787, "rewards/reward_len/mean": 0.9059244990348816, "rewards/reward_len/std": 0.20148861408233643, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0224, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5297337174415588, "kl": 0.029380029067397118, "learning_rate": 4.999331739120423e-06, "loss": 0.0001, "num_tokens": 13750992.0, "reward": 0.8766276240348816, "reward_std": 0.08794420212507248, "rewards/reward_len/mean": 0.8766276240348816, "rewards/reward_len/std": 0.24631834030151367, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02272, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5374413728713989, "kl": 0.03178559988737106, "learning_rate": 4.999312229773022e-06, "loss": 0.0001, "num_tokens": 13947232.0, "reward": 0.96484375, "reward_std": 0.07383135706186295, "rewards/reward_len/mean": 0.96484375, "rewards/reward_len/std": 0.17232529819011688, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02304, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6321916580200195, "kl": 0.02617986872792244, "learning_rate": 4.9992924397674414e-06, "loss": 0.0001, "num_tokens": 14143712.0, "reward": 0.9111328125, "reward_std": 0.08529947698116302, "rewards/reward_len/mean": 0.9111328125, "rewards/reward_len/std": 0.2390221208333969, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02336, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8528509140014648, "kl": 0.029244577512145042, "learning_rate": 4.999272369105904e-06, "loss": 0.0001, "num_tokens": 14340208.0, "reward": 0.8619791865348816, "reward_std": 0.1358361542224884, "rewards/reward_len/mean": 0.8619791269302368, "rewards/reward_len/std": 0.3006777763366699, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02368, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7068591117858887, "kl": 0.038017794489860535, "learning_rate": 4.999252017790665e-06, "loss": 0.0001, "num_tokens": 14536384.0, "reward": 0.8388671875, "reward_std": 0.126865953207016, "rewards/reward_len/mean": 0.8388671875, "rewards/reward_len/std": 0.2714577317237854, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.024, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7906740307807922, "kl": 0.035286713391542435, "learning_rate": 4.999231385824008e-06, "loss": 0.0001, "num_tokens": 14732848.0, "reward": 0.8375651240348816, "reward_std": 0.20768919587135315, "rewards/reward_len/mean": 0.8375651240348816, "rewards/reward_len/std": 0.327578604221344, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02432, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7323654890060425, "kl": 0.029418617486953735, "learning_rate": 4.99921047320825e-06, "loss": 0.0001, "num_tokens": 14929392.0, "reward": 0.8792318105697632, "reward_std": 0.109856978058815, "rewards/reward_len/mean": 0.8792318105697632, "rewards/reward_len/std": 0.28465378284454346, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02464, "frac_reward_zero_std": 0.5, "grad_norm": 0.7487607598304749, "kl": 0.02889881655573845, "learning_rate": 4.999189279945741e-06, "loss": 0.0001, "num_tokens": 15125728.0, "reward": 0.8938802480697632, "reward_std": 0.10382238030433655, "rewards/reward_len/mean": 0.8938801884651184, "rewards/reward_len/std": 0.26159700751304626, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02496, "frac_reward_zero_std": 0.5, "grad_norm": 0.7281383275985718, "kl": 0.03049377351999283, "learning_rate": 4.999167806038858e-06, "loss": 0.0001, "num_tokens": 15322032.0, "reward": 0.8343099355697632, "reward_std": 0.14393801987171173, "rewards/reward_len/mean": 0.8343099355697632, "rewards/reward_len/std": 0.3518146574497223, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02528, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6423137784004211, "kl": 0.03136272728443146, "learning_rate": 4.999146051490016e-06, "loss": 0.0001, "num_tokens": 15518320.0, "reward": 0.9407552480697632, "reward_std": 0.0995391458272934, "rewards/reward_len/mean": 0.9407551884651184, "rewards/reward_len/std": 0.24368993937969208, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0256, "frac_reward_zero_std": 0.375, "grad_norm": 0.8017213940620422, "kl": 0.03368952497839928, "learning_rate": 4.999124016301654e-06, "loss": 0.0001, "num_tokens": 15714848.0, "reward": 0.857421875, "reward_std": 0.14494842290878296, "rewards/reward_len/mean": 0.857421875, "rewards/reward_len/std": 0.286716103553772, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02592, "frac_reward_zero_std": 0.25, "grad_norm": 0.8540568947792053, "kl": 0.03586439788341522, "learning_rate": 4.99910170047625e-06, "loss": 0.0001, "num_tokens": 15911296.0, "reward": 0.8414713740348816, "reward_std": 0.18263337016105652, "rewards/reward_len/mean": 0.8414713740348816, "rewards/reward_len/std": 0.3021599352359772, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02624, "frac_reward_zero_std": 0.25, "grad_norm": 0.8202486038208008, "kl": 0.04007921367883682, "learning_rate": 4.999079104016308e-06, "loss": 0.0001, "num_tokens": 16107648.0, "reward": 0.8688151240348816, "reward_std": 0.1747693419456482, "rewards/reward_len/mean": 0.8688150644302368, "rewards/reward_len/std": 0.2946781814098358, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02656, "frac_reward_zero_std": 0.625, "grad_norm": 0.697175920009613, "kl": 0.03438268229365349, "learning_rate": 4.999056226924366e-06, "loss": 0.0001, "num_tokens": 16304064.0, "reward": 0.9052734375, "reward_std": 0.10430242866277695, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.24809814989566803, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02688, "frac_reward_zero_std": 0.625, "grad_norm": 0.6200621724128723, "kl": 0.030285635963082314, "learning_rate": 4.999033069202992e-06, "loss": 0.0001, "num_tokens": 16500336.0, "reward": 0.9410807490348816, "reward_std": 0.08145523071289062, "rewards/reward_len/mean": 0.9410807490348816, "rewards/reward_len/std": 0.16886034607887268, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0272, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6632947325706482, "kl": 0.03737508878111839, "learning_rate": 4.999009630854787e-06, "loss": 0.0001, "num_tokens": 16696880.0, "reward": 0.8681640625, "reward_std": 0.09984727203845978, "rewards/reward_len/mean": 0.8681640625, "rewards/reward_len/std": 0.3139059841632843, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02752, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5139328837394714, "kl": 0.033338725566864014, "learning_rate": 4.998985911882383e-06, "loss": 0.0001, "num_tokens": 16893424.0, "reward": 0.9231771230697632, "reward_std": 0.064088836312294, "rewards/reward_len/mean": 0.9231771230697632, "rewards/reward_len/std": 0.20548295974731445, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02784, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7163829803466797, "kl": 0.04579077661037445, "learning_rate": 4.998961912288445e-06, "loss": 0.0001, "num_tokens": 17089792.0, "reward": 0.9013671875, "reward_std": 0.14256593585014343, "rewards/reward_len/mean": 0.9013671875, "rewards/reward_len/std": 0.2612590491771698, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02816, "frac_reward_zero_std": 0.625, "grad_norm": 0.6266374588012695, "kl": 0.03986838087439537, "learning_rate": 4.998937632075667e-06, "loss": 0.0001, "num_tokens": 17285984.0, "reward": 0.9016927480697632, "reward_std": 0.08280540257692337, "rewards/reward_len/mean": 0.9016926884651184, "rewards/reward_len/std": 0.24128234386444092, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02848, "frac_reward_zero_std": 0.5, "grad_norm": 0.5966895222663879, "kl": 0.033690571784973145, "learning_rate": 4.998913071246774e-06, "loss": 0.0001, "num_tokens": 17482432.0, "reward": 0.9534505605697632, "reward_std": 0.08552976697683334, "rewards/reward_len/mean": 0.9534505009651184, "rewards/reward_len/std": 0.1531372368335724, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0288, "frac_reward_zero_std": 0.5, "grad_norm": 0.6456487774848938, "kl": 0.06617207825183868, "learning_rate": 4.998888229804526e-06, "loss": 0.0001, "num_tokens": 17678768.0, "reward": 0.8577474355697632, "reward_std": 0.10766549408435822, "rewards/reward_len/mean": 0.8577474355697632, "rewards/reward_len/std": 0.3121115267276764, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02912, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6451205015182495, "kl": 0.03554663062095642, "learning_rate": 4.998863107751711e-06, "loss": 0.0001, "num_tokens": 17875056.0, "reward": 0.8850911855697632, "reward_std": 0.09045256674289703, "rewards/reward_len/mean": 0.8850911855697632, "rewards/reward_len/std": 0.2813292443752289, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02944, "frac_reward_zero_std": 0.5, "grad_norm": 0.6420301795005798, "kl": 0.03609941899776459, "learning_rate": 4.998837705091152e-06, "loss": 0.0001, "num_tokens": 18071504.0, "reward": 0.9020182490348816, "reward_std": 0.12863239645957947, "rewards/reward_len/mean": 0.9020181894302368, "rewards/reward_len/std": 0.2481461763381958, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02976, "frac_reward_zero_std": 0.4375, "grad_norm": 0.744374692440033, "kl": 0.03676271066069603, "learning_rate": 4.9988120218257e-06, "loss": 0.0001, "num_tokens": 18267712.0, "reward": 0.8391927480697632, "reward_std": 0.13910341262817383, "rewards/reward_len/mean": 0.8391926884651184, "rewards/reward_len/std": 0.33368995785713196, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03008, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6814514398574829, "kl": 0.04908795654773712, "learning_rate": 4.99878605795824e-06, "loss": 0.0001, "num_tokens": 18464272.0, "reward": 0.8059896230697632, "reward_std": 0.10761559009552002, "rewards/reward_len/mean": 0.8059896230697632, "rewards/reward_len/std": 0.3403044641017914, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0304, "frac_reward_zero_std": 0.5, "grad_norm": 0.6609547138214111, "kl": 0.04120853543281555, "learning_rate": 4.998759813491687e-06, "loss": 0.0001, "num_tokens": 18660592.0, "reward": 0.8746744990348816, "reward_std": 0.12070801854133606, "rewards/reward_len/mean": 0.8746744990348816, "rewards/reward_len/std": 0.3313665986061096, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03072, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6958045959472656, "kl": 0.051205702126026154, "learning_rate": 4.998733288428987e-06, "loss": 0.0001, "num_tokens": 18857088.0, "reward": 0.8590494990348816, "reward_std": 0.12743408977985382, "rewards/reward_len/mean": 0.8590494990348816, "rewards/reward_len/std": 0.2551549971103668, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03104, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5829638838768005, "kl": 0.03792741894721985, "learning_rate": 4.998706482773121e-06, "loss": 0.0001, "num_tokens": 19053536.0, "reward": 0.8645833730697632, "reward_std": 0.08903007954359055, "rewards/reward_len/mean": 0.8645833134651184, "rewards/reward_len/std": 0.27937448024749756, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03136, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6745487451553345, "kl": 0.04386503994464874, "learning_rate": 4.998679396527099e-06, "loss": 0.0001, "num_tokens": 19250192.0, "reward": 0.8658854365348816, "reward_std": 0.11358411610126495, "rewards/reward_len/mean": 0.8658853769302368, "rewards/reward_len/std": 0.21724016964435577, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03168, "frac_reward_zero_std": 0.375, "grad_norm": 0.8316110968589783, "kl": 0.04867399483919144, "learning_rate": 4.99865202969396e-06, "loss": 0.0001, "num_tokens": 19446624.0, "reward": 0.8932291865348816, "reward_std": 0.11117450892925262, "rewards/reward_len/mean": 0.8932291865348816, "rewards/reward_len/std": 0.20754754543304443, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.032, "frac_reward_zero_std": 0.125, "grad_norm": 0.9266502857208252, "kl": 0.04411531612277031, "learning_rate": 4.9986243822767795e-06, "loss": 0.0001, "num_tokens": 19643296.0, "reward": 0.8235677480697632, "reward_std": 0.17513221502304077, "rewards/reward_len/mean": 0.8235677480697632, "rewards/reward_len/std": 0.28480416536331177, "step": 100 } ], "logging_steps": 1, "max_steps": 9375, "num_input_tokens_seen": 19643296, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }