{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1568627450980392,
  "eval_steps": 500,
  "global_step": 30,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "accuracy_delta": -0.03125,
      "baseline_accuracy": 0.5625,
      "completion_length": 1660.1953125,
      "degradation_rate": 0.1875,
      "epoch": 0.00522875816993464,
      "grad_norm": 1.8285036167620026,
      "improvement_rate": 0.15625,
      "kl": 0.0,
      "learning_rate": 2e-06,
      "loss": 0.0,
      "reward": 0.24004681408405304,
      "reward_std": 0.25635848194360733,
      "rewards/AdaptiveTeachingReward": 0.24004681408405304,
      "step": 1,
      "student_accuracy": 0.53125,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1349.5,
      "teaching_length_std": 1533.2601122433693,
      "token_efficiency": 0.01778511088825826
    },
    {
      "accuracy_delta": -0.21875,
      "baseline_accuracy": 1.0,
      "completion_length": 2174.171875,
      "degradation_rate": 0.21875,
      "epoch": 0.01045751633986928,
      "grad_norm": 1.2225533588422857,
      "improvement_rate": 0.0,
      "kl": 0.002572178840637207,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2656950503587723,
      "reward_std": 0.17467603832483292,
      "rewards/AdaptiveTeachingReward": 0.2656950503587723,
      "step": 2,
      "student_accuracy": 0.78125,
      "student_approach_length": 489.875,
      "teaching_length_mean": 1645.90625,
      "teaching_length_std": 1777.2254866782437,
      "token_efficiency": 0.016061369340544678
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.0625,
      "completion_length": 2889.65625,
      "degradation_rate": 0.0625,
      "epoch": 0.01568627450980392,
      "grad_norm": 0.4912196165940296,
      "improvement_rate": 0.0,
      "kl": 0.0021309852600097656,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.1566198617219925,
      "reward_std": 0.17452973127365112,
      "rewards/AdaptiveTeachingReward": 0.1566198617219925,
      "step": 3,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3435.71875,
      "teaching_length_std": 874.6175996443687,
      "token_efficiency": 0.005029539554335019
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 1765.4765625,
      "degradation_rate": 0.0,
      "epoch": 0.02091503267973856,
      "grad_norm": 0.014919439029800962,
      "improvement_rate": 0.0,
      "kl": 0.0024797916412353516,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/AdaptiveTeachingReward": 0.0,
      "step": 4,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1644.53125,
      "teaching_length_std": 1544.9233390329066,
      "token_efficiency": 0.0
    },
    {
      "accuracy_delta": 0.1875,
      "baseline_accuracy": 0.375,
      "completion_length": 2330.296875,
      "degradation_rate": 0.09375,
      "epoch": 0.026143790849673203,
      "grad_norm": 1.6480657403271404,
      "improvement_rate": 0.28125,
      "kl": 0.002542257308959961,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.293629452586174,
      "reward_std": 0.31043318659067154,
      "rewards/AdaptiveTeachingReward": 0.293629452586174,
      "step": 5,
      "student_accuracy": 0.5625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2543.6875,
      "teaching_length_std": 1157.76463240177,
      "token_efficiency": 0.015775285003662067
    },
    {
      "accuracy_delta": -0.125,
      "baseline_accuracy": 0.375,
      "completion_length": 2799.2265625,
      "degradation_rate": 0.21875,
      "epoch": 0.03137254901960784,
      "grad_norm": 1.3626772466174568,
      "improvement_rate": 0.09375,
      "kl": 0.0024237632751464844,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.297846183180809,
      "reward_std": 0.1792445182800293,
      "rewards/AdaptiveTeachingReward": 0.297846183180809,
      "step": 6,
      "student_accuracy": 0.25,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3300.46875,
      "teaching_length_std": 1322.463354762313,
      "token_efficiency": 0.008984860526379333
    },
    {
      "accuracy_delta": 0.15625,
      "baseline_accuracy": 0.09375,
      "completion_length": 2839.59375,
      "degradation_rate": 0.0,
      "epoch": 0.036601307189542485,
      "grad_norm": 1.0148015671601693,
      "improvement_rate": 0.15625,
      "kl": 0.0022563934326171875,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.22244123369455338,
      "reward_std": 0.32708095014095306,
      "rewards/AdaptiveTeachingReward": 0.22244123369455338,
      "step": 7,
      "student_accuracy": 0.25,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2663.28125,
      "teaching_length_std": 1263.471554191286,
      "token_efficiency": 0.009322577760442718
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.34375,
      "completion_length": 2997.0234375,
      "degradation_rate": 0.0625,
      "epoch": 0.04183006535947712,
      "grad_norm": 1.0933058629769363,
      "improvement_rate": 0.09375,
      "kl": 0.00222015380859375,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.4145798534154892,
      "reward_std": 0.348370686173439,
      "rewards/AdaptiveTeachingReward": 0.4145798534154892,
      "step": 8,
      "student_accuracy": 0.375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2859.9375,
      "teaching_length_std": 1548.3925018635746,
      "token_efficiency": 0.014895284304358753
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2069.3515625,
      "degradation_rate": 0.0,
      "epoch": 0.047058823529411764,
      "grad_norm": 0.6907373861546955,
      "improvement_rate": 0.0,
      "kl": 0.0022356510162353516,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.13787749409675598,
      "reward_std": 0.08163860440254211,
      "rewards/AdaptiveTeachingReward": 0.13787749409675598,
      "step": 9,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1586.78125,
      "teaching_length_std": 1393.5538468081056,
      "token_efficiency": 0.009196431155361413
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2533.3203125,
      "degradation_rate": 0.0,
      "epoch": 0.05228758169934641,
      "grad_norm": 0.9569337200656036,
      "improvement_rate": 0.0,
      "kl": 0.002455472946166992,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2650887817144394,
      "reward_std": 0.1834174394607544,
      "rewards/AdaptiveTeachingReward": 0.2650887817144394,
      "step": 10,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2718.53125,
      "teaching_length_std": 1545.6357044636015,
      "token_efficiency": 0.00983657689669804
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2366.4609375,
      "degradation_rate": 0.0,
      "epoch": 0.05751633986928104,
      "grad_norm": 0.1992844149700769,
      "improvement_rate": 0.0,
      "kl": 0.0023772716522216797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.00799931213259697,
      "reward_std": 0.045250944793224335,
      "rewards/AdaptiveTeachingReward": 0.00799931213259697,
      "step": 11,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2560.28125,
      "teaching_length_std": 1264.0513369895032,
      "token_efficiency": 0.000244351732796639
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2429.25,
      "degradation_rate": 0.0,
      "epoch": 0.06274509803921569,
      "grad_norm": 0.02438642631727396,
      "improvement_rate": 0.0,
      "kl": 0.0028073787689208984,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/AdaptiveTeachingReward": 0.0,
      "step": 12,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2684.0,
      "teaching_length_std": 1537.9383137400418,
      "token_efficiency": 0.0
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.03125,
      "completion_length": 2985.0234375,
      "degradation_rate": 0.03125,
      "epoch": 0.06797385620915032,
      "grad_norm": 1.9377263264415499,
      "improvement_rate": 0.0625,
      "kl": 0.002295255661010742,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3229658156633377,
      "reward_std": 0.2836003005504608,
      "rewards/AdaptiveTeachingReward": 0.3229658156633377,
      "step": 13,
      "student_accuracy": 0.0625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2764.78125,
      "teaching_length_std": 1359.6938149421062,
      "token_efficiency": 0.011864912871618307
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.40625,
      "completion_length": 2860.7890625,
      "degradation_rate": 0.03125,
      "epoch": 0.07320261437908497,
      "grad_norm": 0.7397348398480618,
      "improvement_rate": 0.03125,
      "kl": 0.0022339820861816406,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3183840811252594,
      "reward_std": 0.2730633243918419,
      "rewards/AdaptiveTeachingReward": 0.3183840811252594,
      "step": 14,
      "student_accuracy": 0.40625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3300.75,
      "teaching_length_std": 1258.5522551939885,
      "token_efficiency": 0.009677591351820222
    },
    {
      "accuracy_delta": -0.25,
      "baseline_accuracy": 0.59375,
      "completion_length": 2667.375,
      "degradation_rate": 0.375,
      "epoch": 0.0784313725490196,
      "grad_norm": 0.591091766464445,
      "improvement_rate": 0.125,
      "kl": 0.0020873546600341797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.14508739858865738,
      "reward_std": 0.1915995106101036,
      "rewards/AdaptiveTeachingReward": 0.14508739858865738,
      "step": 15,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2580.1875,
      "teaching_length_std": 1411.9177183341415,
      "token_efficiency": 0.006345218750950054
    },
    {
      "accuracy_delta": -0.125,
      "baseline_accuracy": 0.78125,
      "completion_length": 2397.8046875,
      "degradation_rate": 0.28125,
      "epoch": 0.08366013071895424,
      "grad_norm": 1.0467841728187868,
      "improvement_rate": 0.15625,
      "kl": 0.0020101070404052734,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.29722827672958374,
      "reward_std": 0.29581306129693985,
      "rewards/AdaptiveTeachingReward": 0.29722827672958374,
      "step": 16,
      "student_accuracy": 0.65625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1867.65625,
      "teaching_length_std": 1645.2406135166107,
      "token_efficiency": 0.016315762169946447
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.4375,
      "completion_length": 2681.4296875,
      "degradation_rate": 0.0625,
      "epoch": 0.08888888888888889,
      "grad_norm": 1.0178368263353954,
      "improvement_rate": 0.0625,
      "kl": 0.0022170543670654297,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.27442795038223267,
      "reward_std": 0.24267160892486572,
      "rewards/AdaptiveTeachingReward": 0.27442795038223267,
      "step": 17,
      "student_accuracy": 0.4375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1946.4375,
      "teaching_length_std": 1595.4173133278073,
      "token_efficiency": 0.01725690617086827
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2260.1484375,
      "degradation_rate": 0.0,
      "epoch": 0.09411764705882353,
      "grad_norm": 0.39094525161531923,
      "improvement_rate": 0.0,
      "kl": 0.0026335716247558594,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.15987491607666016,
      "reward_std": 0.12795361876487732,
      "rewards/AdaptiveTeachingReward": 0.15987491607666016,
      "step": 18,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2452.8125,
      "teaching_length_std": 1687.6677585883976,
      "token_efficiency": 0.00733118954839666
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.34375,
      "completion_length": 2509.359375,
      "degradation_rate": 0.0,
      "epoch": 0.09934640522875816,
      "grad_norm": 1.112027848483532,
      "improvement_rate": 0.09375,
      "kl": 0.0021845102310180664,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.32069824635982513,
      "reward_std": 0.1779022440314293,
      "rewards/AdaptiveTeachingReward": 0.32069824635982513,
      "step": 19,
      "student_accuracy": 0.4375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1688.25,
      "teaching_length_std": 1591.1127810270257,
      "token_efficiency": 0.02000900419695598
    },
    {
      "accuracy_delta": 0.15625,
      "baseline_accuracy": 0.25,
      "completion_length": 2566.109375,
      "degradation_rate": 0.125,
      "epoch": 0.10457516339869281,
      "grad_norm": 0.635668741879516,
      "improvement_rate": 0.28125,
      "kl": 0.0020592212677001953,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3237999305129051,
      "reward_std": 0.29250405728816986,
      "rewards/AdaptiveTeachingReward": 0.3237999305129051,
      "step": 20,
      "student_accuracy": 0.40625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3000.8125,
      "teaching_length_std": 1146.9664495361749,
      "token_efficiency": 0.009248973352977438
    },
    {
      "accuracy_delta": -0.15625,
      "baseline_accuracy": 0.5,
      "completion_length": 2821.34375,
      "degradation_rate": 0.15625,
      "epoch": 0.10980392156862745,
      "grad_norm": 1.038632983891241,
      "improvement_rate": 0.0,
      "kl": 0.0022677183151245117,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.26167380064725876,
      "reward_std": 0.20054005086421967,
      "rewards/AdaptiveTeachingReward": 0.26167380064725876,
      "step": 21,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2324.625,
      "teaching_length_std": 1783.1465396131703,
      "token_efficiency": 0.011391752427342916
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.6875,
      "completion_length": 1981.4453125,
      "degradation_rate": 0.21875,
      "epoch": 0.11503267973856209,
      "grad_norm": 1.0591629807323937,
      "improvement_rate": 0.15625,
      "kl": 0.0025501251220703125,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3476671576499939,
      "reward_std": 0.32001765072345734,
      "rewards/AdaptiveTeachingReward": 0.3476671576499939,
      "step": 22,
      "student_accuracy": 0.625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1747.6875,
      "teaching_length_std": 1544.1982674617702,
      "token_efficiency": 0.019930595836054183
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.6875,
      "completion_length": 2491.2109375,
      "degradation_rate": 0.15625,
      "epoch": 0.12026143790849673,
      "grad_norm": 0.8736384977021919,
      "improvement_rate": 0.15625,
      "kl": 0.002077817916870117,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3966591954231262,
      "reward_std": 0.35394637286663055,
      "rewards/AdaptiveTeachingReward": 0.3966591954231262,
      "step": 23,
      "student_accuracy": 0.6875,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2244.40625,
      "teaching_length_std": 1464.1512714006012,
      "token_efficiency": 0.020170202633726
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.40625,
      "completion_length": 3183.6796875,
      "degradation_rate": 0.09375,
      "epoch": 0.12549019607843137,
      "grad_norm": 0.731937384244321,
      "improvement_rate": 0.03125,
      "kl": 0.002083301544189453,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.24638524651527405,
      "reward_std": 0.25337880849838257,
      "rewards/AdaptiveTeachingReward": 0.24638524651527405,
      "step": 24,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2833.75,
      "teaching_length_std": 883.8529811162587,
      "token_efficiency": 0.0154971457828618
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.09375,
      "completion_length": 2996.8203125,
      "degradation_rate": 0.0625,
      "epoch": 0.13071895424836602,
      "grad_norm": 1.0366727211421645,
      "improvement_rate": 0.15625,
      "kl": 0.002071857452392578,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.10002126544713974,
      "reward_std": 0.12362907081842422,
      "rewards/AdaptiveTeachingReward": 0.10002126544713974,
      "step": 25,
      "student_accuracy": 0.1875,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2241.40625,
      "teaching_length_std": 1680.2583046873353,
      "token_efficiency": 0.005314979233325259
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.125,
      "completion_length": 2699.5078125,
      "degradation_rate": 0.125,
      "epoch": 0.13594771241830064,
      "grad_norm": 0.8849958564728577,
      "improvement_rate": 0.15625,
      "kl": 0.00244140625,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.21003766357898712,
      "reward_std": 0.27024491131305695,
      "rewards/AdaptiveTeachingReward": 0.21003766357898712,
      "step": 26,
      "student_accuracy": 0.15625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2865.75,
      "teaching_length_std": 1575.6950824020187,
      "token_efficiency": 0.00747702067329278
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.0,
      "completion_length": 2216.78125,
      "degradation_rate": 0.0,
      "epoch": 0.1411764705882353,
      "grad_norm": 0.6451950468247998,
      "improvement_rate": 0.09375,
      "kl": 0.0021767616271972656,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2089657336473465,
      "reward_std": 0.21514078974723816,
      "rewards/AdaptiveTeachingReward": 0.2089657336473465,
      "step": 27,
      "student_accuracy": 0.09375,
      "student_approach_length": 499.96875,
      "teaching_length_mean": 2619.40625,
      "teaching_length_std": 1265.4369118665845,
      "token_efficiency": 0.0060904088353781515
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2668.4140625,
      "degradation_rate": 0.0,
      "epoch": 0.14640522875816994,
      "grad_norm": 0.7896767066978999,
      "improvement_rate": 0.0,
      "kl": 0.002083301544189453,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.14528799057006836,
      "reward_std": 0.09566954523324966,
      "rewards/AdaptiveTeachingReward": 0.14528799057006836,
      "step": 28,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3113.90625,
      "teaching_length_std": 884.9974614855895,
      "token_efficiency": 0.0035470700822770596
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.0,
      "completion_length": 2467.4140625,
      "degradation_rate": 0.0,
      "epoch": 0.15163398692810456,
      "grad_norm": 0.22872066233966626,
      "improvement_rate": 0.03125,
      "kl": 0.0025146007537841797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.007998689077794552,
      "reward_std": 0.04524742066860199,
      "rewards/AdaptiveTeachingReward": 0.007998689077794552,
      "step": 29,
      "student_accuracy": 0.03125,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2764.9375,
      "teaching_length_std": 1560.6896673437209,
      "token_efficiency": 0.0002591402932910396
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2854.59375,
      "degradation_rate": 0.0,
      "epoch": 0.1568627450980392,
      "grad_norm": 0.5744778046050318,
      "improvement_rate": 0.0,
      "kl": 0.0022513866424560547,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.16005077958106995,
      "reward_std": 0.10021104663610458,
      "rewards/AdaptiveTeachingReward": 0.16005077958106995,
      "step": 30,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2139.71875,
      "teaching_length_std": 1711.9645016733543,
      "token_efficiency": 0.007949871083127776
    }
  ],
  "logging_steps": 1,
  "max_steps": 250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}