{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11968880909634949, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "answer_log_prob_mean": -0.13323402404785156, "answer_log_prob_min": -9.0472412109375, "completion_length": 5898.6240234375, "epoch": 0.0009575104727707959, "grad_norm": 0.3625124822064271, "kl": 0.0, "kl_reward": -0.9189935177564621, "kl_reward_no_entropy": -1.1447369274683297, "kl_scores_no_entropy": 6.0738525390625, "learning_rate": 1e-06, "loss": -0.0, "match_reward": -0.1796875, "no_entropy_reasoning_kl_max": 11.8837890625, "no_entropy_reasoning_kl_mean": 0.2627410888671875, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.2595282644033432e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.1497793197631836, "no_entropy_unprocessed_thought_kl/_max": 11.8837890625, "no_entropy_unprocessed_thought_kl/_mean": 0.2627410888671875, "no_entropy_unprocessed_thought_kl/_median": 0.00572239700704813, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2385.328125, "processed_kl_no_entropy": 0.2626495361328125, "reasoning_kl_max": 11.197998046875, "reasoning_kl_mean": 0.1943511962890625, "reward": -1.8610057830810547, "reward_std": 0.25214052200317383, "rewards/TeacherKLBasedReward": -1.8610057830810547, "solution_log_prob_reward": -0.22370643375325017, "step": 1, "thought_kl_scores": 5.6986083984375, "thought_processed_kl": 0.19428634643554688, "total_teacher_likelihood_reward": -1.3223874527029693, "total_tl_reward_no_entropy": -1.5481308568269014, "unprocessed_answer_log_prob/_first_quartile": -0.0033911094069480896, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.13323402404785156, "unprocessed_answer_log_prob/_median": -6.970949470996857e-06, "unprocessed_answer_log_prob/_min": -9.0472412109375, "unprocessed_answer_log_prob/_sum": -82.123291015625, "unprocessed_thought_kl/_first_quartile": 3.4284312278032303e-06, "unprocessed_thought_kl/_last_quartile": 0.0922614336013794, "unprocessed_thought_kl/_max": 11.197998046875, "unprocessed_thought_kl/_mean": 0.1943511962890625, "unprocessed_thought_kl/_median": 0.002026565372943878, "unprocessed_thought_kl/_min": -4.5313720703125, "unprocessed_thought_kl/_sum": 1614.546875 }, { "answer_log_prob_mean": -0.09152984619140625, "answer_log_prob_min": -7.069091796875, "completion_length": 3600.64453125, "epoch": 0.0019150209455415918, "grad_norm": 1.825712629893242, "kl": 0.004577010869979858, "kl_reward": -1.2828991496935487, "kl_reward_no_entropy": -1.3174108685925603, "kl_scores_no_entropy": 5.42529296875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 10.518310546875, "no_entropy_reasoning_kl_mean": 0.333953857421875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.09464216232299805, "no_entropy_unprocessed_thought_kl/_max": 10.518310546875, "no_entropy_unprocessed_thought_kl/_mean": 0.333953857421875, "no_entropy_unprocessed_thought_kl/_median": 0.00017149094492197037, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 456.4375, "processed_kl_no_entropy": 0.333984375, "reasoning_kl_max": 10.466552734375, "reasoning_kl_mean": 0.322967529296875, "reward": -1.7575204372406006, "reward_std": 0.1848515421152115, "rewards/TeacherKLBasedReward": -1.7575204372406006, "solution_log_prob_reward": -0.1622207605978474, "step": 2, "thought_kl_scores": 5.3963623046875, "thought_processed_kl": 0.3229522705078125, "total_teacher_likelihood_reward": -1.4451198922470212, "total_tl_reward_no_entropy": -1.4796316111460328, "unprocessed_answer_log_prob/_first_quartile": -8.97119753062725e-06, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.09152984619140625, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -7.069091796875, "unprocessed_answer_log_prob/_sum": -44.41796875, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.0897374153137207, "unprocessed_thought_kl/_max": 10.466552734375, "unprocessed_thought_kl/_mean": 0.322967529296875, "unprocessed_thought_kl/_median": 0.0001523839309811592, "unprocessed_thought_kl/_min": -1.289729722775519, "unprocessed_thought_kl/_sum": 437.7109375 }, { "answer_log_prob_mean": -0.09611892700195312, "answer_log_prob_min": -5.025146484375, "completion_length": 6044.8134765625, "epoch": 0.002872531418312388, "grad_norm": 70.50210328915206, "kl": 0.04650622606277466, "kl_reward": -1.4351806631311774, "kl_reward_no_entropy": -1.5199090577661991, "kl_scores_no_entropy": 6.0830078125, "learning_rate": 1e-06, "loss": 0.0019, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.7705078125, "no_entropy_reasoning_kl_mean": 0.3889312744140625, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.869645297527313e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.18077754974365234, "no_entropy_unprocessed_thought_kl/_max": 11.7705078125, "no_entropy_unprocessed_thought_kl/_mean": 0.3889312744140625, "no_entropy_unprocessed_thought_kl/_median": 0.0021131178364157677, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 634.75, "processed_kl_no_entropy": 0.3890228271484375, "reasoning_kl_max": 11.41357421875, "reasoning_kl_mean": 0.3642578125, "reward": -1.9357988834381104, "reward_std": 0.25202324986457825, "rewards/TeacherKLBasedReward": -1.9357988834381104, "solution_log_prob_reward": -0.14637039101216942, "step": 3, "thought_kl_scores": 5.8917236328125, "thought_processed_kl": 0.36438751220703125, "total_teacher_likelihood_reward": -1.5815510656684637, "total_tl_reward_no_entropy": -1.6662794621661305, "unprocessed_answer_log_prob/_first_quartile": -0.00015422911383211613, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.09611892700195312, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -5.025146484375, "unprocessed_answer_log_prob/_sum": -33.6337890625, "unprocessed_thought_kl/_first_quartile": 1.5133991837501526e-07, "unprocessed_thought_kl/_last_quartile": 0.1580348014831543, "unprocessed_thought_kl/_max": 11.41357421875, "unprocessed_thought_kl/_mean": 0.3642578125, "unprocessed_thought_kl/_median": 0.0013374527916312218, "unprocessed_thought_kl/_min": -2.0825806008651853, "unprocessed_thought_kl/_sum": 589.25 }, { "answer_log_prob_mean": -0.31824493408203125, "answer_log_prob_min": -14.246826171875, "completion_length": 8281.794921875, "epoch": 0.0038300418910831835, "grad_norm": 2.448095061716343, "kl": 0.008541226387023926, "kl_reward": -1.321987595409155, "kl_reward_no_entropy": -1.6329766735434532, "kl_scores_no_entropy": 8.635986328125, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.265625, "no_entropy_reasoning_kl_max": 16.8974609375, "no_entropy_reasoning_kl_mean": 0.3753509521484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.0682269930839539e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19900989532470703, "no_entropy_unprocessed_thought_kl/_max": 16.8974609375, "no_entropy_unprocessed_thought_kl/_mean": 0.3753509521484375, "no_entropy_unprocessed_thought_kl/_median": 0.00232035294175148, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4047.0625, "processed_kl_no_entropy": 0.3753662109375, "reasoning_kl_max": 15.15771484375, "reasoning_kl_mean": 0.28908538818359375, "reward": -1.9182785749435425, "reward_std": 0.29039067029953003, "rewards/TeacherKLBasedReward": -1.9182785749435425, "solution_log_prob_reward": -0.4607131944503635, "step": 4, "thought_kl_scores": 7.72021484375, "thought_processed_kl": 0.28916168212890625, "total_teacher_likelihood_reward": -2.0483257863670588, "total_tl_reward_no_entropy": -2.35931486915797, "unprocessed_answer_log_prob/_first_quartile": -0.01492345705628395, "unprocessed_answer_log_prob/_last_quartile": -1.8649734556674957e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.31824493408203125, "unprocessed_answer_log_prob/_median": -4.415679723024368e-05, "unprocessed_answer_log_prob/_min": -14.246826171875, "unprocessed_answer_log_prob/_sum": -168.85546875, "unprocessed_thought_kl/_first_quartile": 2.4028122425079346e-07, "unprocessed_thought_kl/_last_quartile": 0.12807273864746094, "unprocessed_thought_kl/_max": 15.15771484375, "unprocessed_thought_kl/_mean": 0.28908538818359375, "unprocessed_thought_kl/_median": 0.0007351292297244072, "unprocessed_thought_kl/_min": -6.631591796875, "unprocessed_thought_kl/_sum": 2983.0625 }, { "answer_log_prob_mean": -0.26096153259277344, "answer_log_prob_min": -12.0439453125, "completion_length": 3251.951171875, "epoch": 0.0047875523638539795, "grad_norm": 0.4774380153509563, "kl": 0.0027310550212860107, "kl_reward": -1.686850898899138, "kl_reward_no_entropy": -1.771528935059905, "kl_scores_no_entropy": 6.1517333984375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.835693359375, "no_entropy_reasoning_kl_mean": 0.4721527099609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00021180231124162674, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.28435373306274414, "no_entropy_unprocessed_thought_kl/_max": 11.835693359375, "no_entropy_unprocessed_thought_kl/_mean": 0.4721527099609375, "no_entropy_unprocessed_thought_kl/_median": 0.017834552563726902, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 418.3359375, "processed_kl_no_entropy": 0.4715118408203125, "reasoning_kl_max": 11.615478515625, "reasoning_kl_mean": 0.44612884521484375, "reward": -1.9870548248291016, "reward_std": 0.23138150572776794, "rewards/TeacherKLBasedReward": -1.9870548248291016, "solution_log_prob_reward": -0.38140098471194506, "step": 5, "thought_kl_scores": 6.0299072265625, "thought_processed_kl": 0.44589996337890625, "total_teacher_likelihood_reward": -2.0682518975809216, "total_tl_reward_no_entropy": -2.1529299281537533, "unprocessed_answer_log_prob/_first_quartile": -0.07230664975941181, "unprocessed_answer_log_prob/_last_quartile": -3.152526915073395e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.26096153259277344, "unprocessed_answer_log_prob/_median": -0.0010148733854293823, "unprocessed_answer_log_prob/_min": -12.0439453125, "unprocessed_answer_log_prob/_sum": -95.8603515625, "unprocessed_thought_kl/_first_quartile": 0.00010441080667078495, "unprocessed_thought_kl/_last_quartile": 0.2566502094268799, "unprocessed_thought_kl/_max": 11.615478515625, "unprocessed_thought_kl/_mean": 0.44612884521484375, "unprocessed_thought_kl/_median": 0.013556391932070255, "unprocessed_thought_kl/_min": -1.5325891645625234, "unprocessed_thought_kl/_sum": 394.703125 }, { "answer_log_prob_mean": -0.09277153015136719, "answer_log_prob_min": -7.357666015625, "completion_length": 6157.0107421875, "epoch": 0.005745062836624776, "grad_norm": 0.5244994913116167, "kl": 0.004882097244262695, "kl_reward": -1.6301532089710236, "kl_reward_no_entropy": -1.6699109002947807, "kl_scores_no_entropy": 6.17578125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.93505859375, "no_entropy_reasoning_kl_mean": 0.437286376953125, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.5890767574310303e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.18334102630615234, "no_entropy_unprocessed_thought_kl/_max": 11.93505859375, "no_entropy_unprocessed_thought_kl/_mean": 0.437286376953125, "no_entropy_unprocessed_thought_kl/_median": 0.0010866150259971619, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 654.796875, "processed_kl_no_entropy": 0.4373626708984375, "reasoning_kl_max": 11.911376953125, "reasoning_kl_mean": 0.4242706298828125, "reward": -2.0006051063537598, "reward_std": 0.2536393702030182, "rewards/TeacherKLBasedReward": -2.0006051063537598, "solution_log_prob_reward": -0.16634819097816944, "step": 6, "thought_kl_scores": 6.158447265625, "thought_processed_kl": 0.4235076904296875, "total_teacher_likelihood_reward": -1.7965014325454831, "total_tl_reward_no_entropy": -1.8362591210752726, "unprocessed_answer_log_prob/_first_quartile": -0.0010709823109209538, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.09277153015136719, "unprocessed_answer_log_prob/_median": -1.4062970876693726e-07, "unprocessed_answer_log_prob/_min": -7.357666015625, "unprocessed_answer_log_prob/_sum": -33.4794921875, "unprocessed_thought_kl/_first_quartile": 1.648440957069397e-07, "unprocessed_thought_kl/_last_quartile": 0.17230224609375, "unprocessed_thought_kl/_max": 11.911376953125, "unprocessed_thought_kl/_mean": 0.4242706298828125, "unprocessed_thought_kl/_median": 0.0008143484592437744, "unprocessed_thought_kl/_min": -1.1654206197708845, "unprocessed_thought_kl/_sum": 607.84375 }, { "answer_log_prob_mean": -0.16521453857421875, "answer_log_prob_min": -9.48858642578125, "completion_length": 6025.6123046875, "epoch": 0.0067025733093955715, "grad_norm": 0.42208221471259977, "kl": 0.0035770758986473083, "kl_reward": -1.5983633329160511, "kl_reward_no_entropy": -1.9597009206190705, "kl_scores_no_entropy": 8.6220703125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.3984375, "no_entropy_reasoning_kl_max": 16.7607421875, "no_entropy_reasoning_kl_mean": 0.485626220703125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00010567600838840008, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.34444713592529297, "no_entropy_unprocessed_thought_kl/_max": 16.7607421875, "no_entropy_unprocessed_thought_kl/_mean": 0.485626220703125, "no_entropy_unprocessed_thought_kl/_median": 0.01706552691757679, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4759.25, "processed_kl_no_entropy": 0.48565673828125, "reasoning_kl_max": 15.1875, "reasoning_kl_mean": 0.38091278076171875, "reward": -2.0146963596343994, "reward_std": 0.2640538811683655, "rewards/TeacherKLBasedReward": -2.0146963596343994, "solution_log_prob_reward": -0.260100401181262, "step": 7, "thought_kl_scores": 7.7855224609375, "thought_processed_kl": 0.38097381591796875, "total_teacher_likelihood_reward": -2.2569012455642223, "total_tl_reward_no_entropy": -2.6182388197630644, "unprocessed_answer_log_prob/_first_quartile": -0.0025912949349731207, "unprocessed_answer_log_prob/_last_quartile": -9.080395102500916e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.16521453857421875, "unprocessed_answer_log_prob/_median": -5.684792995452881e-06, "unprocessed_answer_log_prob/_min": -9.48858642578125, "unprocessed_answer_log_prob/_sum": -168.5849609375, "unprocessed_thought_kl/_first_quartile": 3.7187710404396057e-06, "unprocessed_thought_kl/_last_quartile": 0.23334050178527832, "unprocessed_thought_kl/_max": 15.1875, "unprocessed_thought_kl/_mean": 0.38091278076171875, "unprocessed_thought_kl/_median": 0.0050048260018229485, "unprocessed_thought_kl/_min": -6.457489013671875, "unprocessed_thought_kl/_sum": 3207.78125 }, { "answer_log_prob_mean": -0.4315948486328125, "answer_log_prob_min": -16.97314453125, "completion_length": 6141.6669921875, "epoch": 0.007660083782166367, "grad_norm": 12.867072708581977, "kl": 0.010605990886688232, "kl_reward": -1.3891973793506622, "kl_reward_no_entropy": -1.7548535093665123, "kl_scores_no_entropy": 8.1630859375, "learning_rate": 1e-06, "loss": 0.0004, "match_reward": -0.0703125, "no_entropy_reasoning_kl_max": 15.898681640625, "no_entropy_reasoning_kl_mean": 0.42596435546875, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.242981620132923e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.30156898498535156, "no_entropy_unprocessed_thought_kl/_max": 15.898681640625, "no_entropy_unprocessed_thought_kl/_mean": 0.42596435546875, "no_entropy_unprocessed_thought_kl/_median": 0.012257397174835205, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3241.921875, "processed_kl_no_entropy": 0.42596435546875, "reasoning_kl_max": 13.83740234375, "reasoning_kl_mean": 0.3246917724609375, "reward": -1.9992852210998535, "reward_std": 0.2350505292415619, "rewards/TeacherKLBasedReward": -1.9992852210998535, "solution_log_prob_reward": -0.6013262888882309, "step": 8, "thought_kl_scores": 7.078857421875, "thought_processed_kl": 0.3245391845703125, "total_teacher_likelihood_reward": -2.0608361708000302, "total_tl_reward_no_entropy": -2.426492290571332, "unprocessed_answer_log_prob/_first_quartile": -0.0407942533493042, "unprocessed_answer_log_prob/_last_quartile": -9.173527359962463e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4315948486328125, "unprocessed_answer_log_prob/_median": -7.983855903148651e-05, "unprocessed_answer_log_prob/_min": -16.97314453125, "unprocessed_answer_log_prob/_sum": -183.6015625, "unprocessed_thought_kl/_first_quartile": 6.504356861114502e-06, "unprocessed_thought_kl/_last_quartile": 0.20567703247070312, "unprocessed_thought_kl/_max": 13.83740234375, "unprocessed_thought_kl/_mean": 0.3246917724609375, "unprocessed_thought_kl/_median": 0.005722761154174805, "unprocessed_thought_kl/_min": -5.7874321937561035, "unprocessed_thought_kl/_sum": 2332.140625 }, { "answer_log_prob_mean": -0.34268951416015625, "answer_log_prob_min": -15.44921875, "completion_length": 5702.8154296875, "epoch": 0.008617594254937163, "grad_norm": 1.08297615134172, "kl": 0.00522458553314209, "kl_reward": -1.4341094847768545, "kl_reward_no_entropy": -1.9114654501900077, "kl_scores_no_entropy": 8.275390625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.375, "no_entropy_reasoning_kl_max": 16.0791015625, "no_entropy_reasoning_kl_mean": 0.4763641357421875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00021449732594192028, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4163398742675781, "no_entropy_unprocessed_thought_kl/_max": 16.0791015625, "no_entropy_unprocessed_thought_kl/_mean": 0.4763641357421875, "no_entropy_unprocessed_thought_kl/_median": 0.029049158096313477, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4758.28125, "processed_kl_no_entropy": 0.4763641357421875, "reasoning_kl_max": 14.8095703125, "reasoning_kl_mean": 0.3299407958984375, "reward": -2.014585018157959, "reward_std": 0.2206641286611557, "rewards/TeacherKLBasedReward": -2.014585018157959, "solution_log_prob_reward": -0.4971816958859563, "step": 9, "thought_kl_scores": 7.570068359375, "thought_processed_kl": 0.329986572265625, "total_teacher_likelihood_reward": -2.306291177868843, "total_tl_reward_no_entropy": -2.7836471451446414, "unprocessed_answer_log_prob/_first_quartile": -0.04085034132003784, "unprocessed_answer_log_prob/_last_quartile": -1.6693957149982452e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.34268951416015625, "unprocessed_answer_log_prob/_median": -0.0001163305714726448, "unprocessed_answer_log_prob/_min": -15.44921875, "unprocessed_answer_log_prob/_sum": -254.94921875, "unprocessed_thought_kl/_first_quartile": 7.7798031270504e-06, "unprocessed_thought_kl/_last_quartile": 0.25427818298339844, "unprocessed_thought_kl/_max": 14.8095703125, "unprocessed_thought_kl/_mean": 0.3299407958984375, "unprocessed_thought_kl/_median": 0.008814811706542969, "unprocessed_thought_kl/_min": -6.12347412109375, "unprocessed_thought_kl/_sum": 2989.21875 }, { "answer_log_prob_mean": -0.28643798828125, "answer_log_prob_min": -15.88720703125, "completion_length": 5843.2451171875, "epoch": 0.009575104727707959, "grad_norm": 0.42439434929793096, "kl": 0.0049599409103393555, "kl_reward": -1.2793679791502655, "kl_reward_no_entropy": -1.560272821225226, "kl_scores_no_entropy": 7.661865234375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0859375, "no_entropy_reasoning_kl_max": 14.953125, "no_entropy_reasoning_kl_mean": 0.3705596923828125, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.1870480850338936e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19979572296142578, "no_entropy_unprocessed_thought_kl/_max": 14.953125, "no_entropy_unprocessed_thought_kl/_mean": 0.3705596923828125, "no_entropy_unprocessed_thought_kl/_median": 0.005754400976002216, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2533.625, "processed_kl_no_entropy": 0.3704681396484375, "reasoning_kl_max": 13.92626953125, "reasoning_kl_mean": 0.28719329833984375, "reward": -1.9180649518966675, "reward_std": 0.1902327537536621, "rewards/TeacherKLBasedReward": -1.9180649518966675, "solution_log_prob_reward": -0.4453100565588102, "step": 10, "thought_kl_scores": 7.106201171875, "thought_processed_kl": 0.287078857421875, "total_teacher_likelihood_reward": -1.8106155386194587, "total_tl_reward_no_entropy": -2.0915203876793385, "unprocessed_answer_log_prob/_first_quartile": -0.011770589277148247, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.28643798828125, "unprocessed_answer_log_prob/_median": -8.062459528446198e-06, "unprocessed_answer_log_prob/_min": -15.88720703125, "unprocessed_answer_log_prob/_sum": -173.234375, "unprocessed_thought_kl/_first_quartile": 1.7320271581411362e-06, "unprocessed_thought_kl/_last_quartile": 0.12961626052856445, "unprocessed_thought_kl/_max": 13.92626953125, "unprocessed_thought_kl/_mean": 0.28719329833984375, "unprocessed_thought_kl/_median": 0.0019216211512684822, "unprocessed_thought_kl/_min": -5.5621795654296875, "unprocessed_thought_kl/_sum": 1796.84375 }, { "answer_log_prob_mean": -0.29549407958984375, "answer_log_prob_min": -11.053466796875, "completion_length": 4243.08984375, "epoch": 0.010532615200478756, "grad_norm": 0.4171901979649654, "kl": 0.0024015307426452637, "kl_reward": -1.5211880570277572, "kl_reward_no_entropy": -1.8712069736793637, "kl_scores_no_entropy": 7.1923828125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.3203125, "no_entropy_reasoning_kl_max": 13.910888671875, "no_entropy_reasoning_kl_mean": 0.48462677001953125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0004536018241196871, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.40421558171510696, "no_entropy_unprocessed_thought_kl/_max": 13.910888671875, "no_entropy_unprocessed_thought_kl/_mean": 0.48462677001953125, "no_entropy_unprocessed_thought_kl/_median": 0.03569460287690163, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3667.09375, "processed_kl_no_entropy": 0.484954833984375, "reasoning_kl_max": 12.785888671875, "reasoning_kl_mean": 0.37920379638671875, "reward": -2.020846366882324, "reward_std": 0.2275008261203766, "rewards/TeacherKLBasedReward": -2.020846366882324, "solution_log_prob_reward": -0.40602874639444053, "step": 11, "thought_kl_scores": 6.5787353515625, "thought_processed_kl": 0.3796844482421875, "total_teacher_likelihood_reward": -2.2475293008610606, "total_tl_reward_no_entropy": -2.5975482231006026, "unprocessed_answer_log_prob/_first_quartile": -0.031216979026794434, "unprocessed_answer_log_prob/_last_quartile": -1.2712553143501282e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.29549407958984375, "unprocessed_answer_log_prob/_median": -0.00019325688481330872, "unprocessed_answer_log_prob/_min": -11.053466796875, "unprocessed_answer_log_prob/_sum": -158.80078125, "unprocessed_thought_kl/_first_quartile": 8.913874626159668e-05, "unprocessed_thought_kl/_last_quartile": 0.28575200331397355, "unprocessed_thought_kl/_max": 12.785888671875, "unprocessed_thought_kl/_mean": 0.37920379638671875, "unprocessed_thought_kl/_median": 0.017132284119725227, "unprocessed_thought_kl/_min": -5.247555077075958, "unprocessed_thought_kl/_sum": 2364.78125 }, { "answer_log_prob_mean": -0.27300262451171875, "answer_log_prob_min": -11.240234375, "completion_length": 5592.818359375, "epoch": 0.011490125673249552, "grad_norm": 0.5228941346396662, "kl": 0.004792213439941406, "kl_reward": -1.2675018389709294, "kl_reward_no_entropy": -1.5875317426398396, "kl_scores_no_entropy": 6.264404296875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.2109375, "no_entropy_reasoning_kl_max": 12.1279296875, "no_entropy_reasoning_kl_mean": 0.40789794921875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0002993866801261902, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2911710739135742, "no_entropy_unprocessed_thought_kl/_max": 12.1279296875, "no_entropy_unprocessed_thought_kl/_mean": 0.40789794921875, "no_entropy_unprocessed_thought_kl/_median": 0.026454031467437744, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3557.59375, "processed_kl_no_entropy": 0.4078216552734375, "reasoning_kl_max": 11.3525390625, "reasoning_kl_mean": 0.3089752197265625, "reward": -1.9440778493881226, "reward_std": 0.26418378949165344, "rewards/TeacherKLBasedReward": -1.9440778493881226, "solution_log_prob_reward": -0.38540496036875993, "step": 12, "thought_kl_scores": 5.831298828125, "thought_processed_kl": 0.308929443359375, "total_teacher_likelihood_reward": -1.863844320178032, "total_tl_reward_no_entropy": -2.1838742149993777, "unprocessed_answer_log_prob/_first_quartile": -0.03647028421983123, "unprocessed_answer_log_prob/_last_quartile": -5.8673322200775146e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.27300262451171875, "unprocessed_answer_log_prob/_median": -0.0001435745507478714, "unprocessed_answer_log_prob/_min": -11.240234375, "unprocessed_answer_log_prob/_sum": -151.365234375, "unprocessed_thought_kl/_first_quartile": 1.0839663445949554e-05, "unprocessed_thought_kl/_last_quartile": 0.1751551628112793, "unprocessed_thought_kl/_max": 11.3525390625, "unprocessed_thought_kl/_mean": 0.3089752197265625, "unprocessed_thought_kl/_median": 0.007678680121898651, "unprocessed_thought_kl/_min": -4.1248779296875, "unprocessed_thought_kl/_sum": 2250.4453125 }, { "answer_log_prob_mean": -0.4579620361328125, "answer_log_prob_min": -18.0771484375, "completion_length": 6216.0517578125, "epoch": 0.012447636146020347, "grad_norm": 2.0875285935847576, "kl": 0.006413817405700684, "kl_reward": -1.3939837580546737, "kl_reward_no_entropy": -1.9854180729016662, "kl_scores_no_entropy": 9.125244140625, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.640625, "no_entropy_reasoning_kl_max": 17.7568359375, "no_entropy_reasoning_kl_mean": 0.4842376708984375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00020413286983966827, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.42238616943359375, "no_entropy_unprocessed_thought_kl/_max": 17.7568359375, "no_entropy_unprocessed_thought_kl/_mean": 0.4842376708984375, "no_entropy_unprocessed_thought_kl/_median": 0.028687000274658203, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 6815.75, "processed_kl_no_entropy": 0.4842529296875, "reasoning_kl_max": 16.294921875, "reasoning_kl_mean": 0.3017120361328125, "reward": -2.019016981124878, "reward_std": 0.23926706612110138, "rewards/TeacherKLBasedReward": -2.019016981124878, "solution_log_prob_reward": -0.6387335169129074, "step": 13, "thought_kl_scores": 8.296875, "thought_processed_kl": 0.30173492431640625, "total_teacher_likelihood_reward": -2.6733422726392746, "total_tl_reward_no_entropy": -3.2647765893489122, "unprocessed_answer_log_prob/_first_quartile": -0.1476963758468628, "unprocessed_answer_log_prob/_last_quartile": -5.450332537293434e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4579620361328125, "unprocessed_answer_log_prob/_median": -0.0023719854652881622, "unprocessed_answer_log_prob/_min": -18.0771484375, "unprocessed_answer_log_prob/_sum": -332.984375, "unprocessed_thought_kl/_first_quartile": 9.409384801983833e-06, "unprocessed_thought_kl/_last_quartile": 0.2228717803955078, "unprocessed_thought_kl/_max": 16.294921875, "unprocessed_thought_kl/_mean": 0.3017120361328125, "unprocessed_thought_kl/_median": 0.007037997245788574, "unprocessed_thought_kl/_min": -7.56298828125, "unprocessed_thought_kl/_sum": 4142.625 }, { "answer_log_prob_mean": -0.3228302001953125, "answer_log_prob_min": -13.8759765625, "completion_length": 4029.490234375, "epoch": 0.013405146618791143, "grad_norm": 0.4947697466793597, "kl": 0.0037463903427124023, "kl_reward": -1.7880413690581918, "kl_reward_no_entropy": -2.0693902475759387, "kl_scores_no_entropy": 7.5377197265625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.03125, "no_entropy_reasoning_kl_max": 14.525634765625, "no_entropy_reasoning_kl_mean": 0.5445404052734375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00017479760572314262, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4893913269042969, "no_entropy_unprocessed_thought_kl/_max": 14.525634765625, "no_entropy_unprocessed_thought_kl/_mean": 0.5445404052734375, "no_entropy_unprocessed_thought_kl/_median": 0.03736543655395508, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2166.0625, "processed_kl_no_entropy": 0.544189453125, "reasoning_kl_max": 12.8896484375, "reasoning_kl_mean": 0.4671173095703125, "reward": -1.8813114166259766, "reward_std": 0.20121660828590393, "rewards/TeacherKLBasedReward": -1.8813114166259766, "solution_log_prob_reward": -0.46158996294252574, "step": 14, "thought_kl_scores": 6.679443359375, "thought_processed_kl": 0.466949462890625, "total_teacher_likelihood_reward": -2.2808813378214836, "total_tl_reward_no_entropy": -2.56223020888865, "unprocessed_answer_log_prob/_first_quartile": -0.04482126235961914, "unprocessed_answer_log_prob/_last_quartile": -4.7963112592697144e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3228302001953125, "unprocessed_answer_log_prob/_median": -0.00012182258069515228, "unprocessed_answer_log_prob/_min": -13.8759765625, "unprocessed_answer_log_prob/_sum": -176.4609375, "unprocessed_thought_kl/_first_quartile": 7.401499897241592e-05, "unprocessed_thought_kl/_last_quartile": 0.40399169921875, "unprocessed_thought_kl/_max": 12.8896484375, "unprocessed_thought_kl/_mean": 0.4671173095703125, "unprocessed_thought_kl/_median": 0.02642536163330078, "unprocessed_thought_kl/_min": -4.784263610839844, "unprocessed_thought_kl/_sum": 1735.375 }, { "answer_log_prob_mean": -0.3506927490234375, "answer_log_prob_min": -13.74658203125, "completion_length": 7723.8369140625, "epoch": 0.01436265709156194, "grad_norm": 1.2429999476199531, "kl": 0.006841421127319336, "kl_reward": -1.4535982990637422, "kl_reward_no_entropy": -1.8151849089190364, "kl_scores_no_entropy": 7.43701171875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.4765625, "no_entropy_reasoning_kl_max": 14.40478515625, "no_entropy_reasoning_kl_mean": 0.4610137939453125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00026752427220344543, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3070793151855469, "no_entropy_unprocessed_thought_kl/_max": 14.40478515625, "no_entropy_unprocessed_thought_kl/_mean": 0.4610137939453125, "no_entropy_unprocessed_thought_kl/_median": 0.022836029529571533, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4505.65625, "processed_kl_no_entropy": 0.460968017578125, "reasoning_kl_max": 13.8486328125, "reasoning_kl_mean": 0.34604644775390625, "reward": -2.1683297157287598, "reward_std": 0.27772092819213867, "rewards/TeacherKLBasedReward": -2.1683297157287598, "solution_log_prob_reward": -0.4881585657130927, "step": 15, "thought_kl_scores": 7.099853515625, "thought_processed_kl": 0.34595489501953125, "total_teacher_likelihood_reward": -2.418319339863956, "total_tl_reward_no_entropy": -2.779905959032476, "unprocessed_answer_log_prob/_first_quartile": -0.05823177099227905, "unprocessed_answer_log_prob/_last_quartile": -3.203749656677246e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3506927490234375, "unprocessed_answer_log_prob/_median": -0.00032545533031225204, "unprocessed_answer_log_prob/_min": -13.74658203125, "unprocessed_answer_log_prob/_sum": -251.1640625, "unprocessed_thought_kl/_first_quartile": 3.3748801797628403e-06, "unprocessed_thought_kl/_last_quartile": 0.1771240234375, "unprocessed_thought_kl/_max": 13.8486328125, "unprocessed_thought_kl/_mean": 0.34604644775390625, "unprocessed_thought_kl/_median": 0.004260599613189697, "unprocessed_thought_kl/_min": -3.626769721508026, "unprocessed_thought_kl/_sum": 2633.1875 }, { "answer_log_prob_mean": -0.11906242370605469, "answer_log_prob_min": -6.3876953125, "completion_length": 4247.1669921875, "epoch": 0.015320167564332734, "grad_norm": 148.39839420451506, "kl": 0.2813307046890259, "kl_reward": -1.4238143782131374, "kl_reward_no_entropy": -1.527155444957316, "kl_scores_no_entropy": 7.32763671875, "learning_rate": 1e-06, "loss": 0.0113, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 14.29248046875, "no_entropy_reasoning_kl_mean": 0.36612701416015625, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.193264663219452e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.13061046600341797, "no_entropy_unprocessed_thought_kl/_max": 14.29248046875, "no_entropy_unprocessed_thought_kl/_mean": 0.36612701416015625, "no_entropy_unprocessed_thought_kl/_median": 0.0011534560471773148, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 734.6484375, "processed_kl_no_entropy": 0.3660736083984375, "reasoning_kl_max": 12.93212890625, "reasoning_kl_mean": 0.34528350830078125, "reward": -1.989772081375122, "reward_std": 0.2041010558605194, "rewards/TeacherKLBasedReward": -1.989772081375122, "solution_log_prob_reward": -0.1829393751686439, "step": 16, "thought_kl_scores": 6.63671875, "thought_processed_kl": 0.34534454345703125, "total_teacher_likelihood_reward": -1.60675376560539, "total_tl_reward_no_entropy": -1.7100948290899396, "unprocessed_answer_log_prob/_first_quartile": -0.0010185868013650179, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.11906242370605469, "unprocessed_answer_log_prob/_median": -1.3969838619232178e-08, "unprocessed_answer_log_prob/_min": -6.3876953125, "unprocessed_answer_log_prob/_sum": -50.01611328125, "unprocessed_thought_kl/_first_quartile": 1.441221684217453e-07, "unprocessed_thought_kl/_last_quartile": 0.12040829658508301, "unprocessed_thought_kl/_max": 12.93212890625, "unprocessed_thought_kl/_mean": 0.34528350830078125, "unprocessed_thought_kl/_median": 0.0009243860840797424, "unprocessed_thought_kl/_min": -2.3444442749023438, "unprocessed_thought_kl/_sum": 695.953125 }, { "answer_log_prob_mean": -0.24367523193359375, "answer_log_prob_min": -8.1015625, "completion_length": 5488.5869140625, "epoch": 0.016277678037103532, "grad_norm": 13.429506415548953, "kl": 0.019090473651885986, "kl_reward": -1.5284454505890608, "kl_reward_no_entropy": -1.8621753081679344, "kl_scores_no_entropy": 7.89306640625, "learning_rate": 1e-06, "loss": 0.0008, "match_reward": -0.1796875, "no_entropy_reasoning_kl_max": 15.3134765625, "no_entropy_reasoning_kl_mean": 0.46759033203125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00030121393501758575, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.34670162200927734, "no_entropy_unprocessed_thought_kl/_max": 15.3134765625, "no_entropy_unprocessed_thought_kl/_mean": 0.46759033203125, "no_entropy_unprocessed_thought_kl/_median": 0.027063626796007156, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3533.90625, "processed_kl_no_entropy": 0.4676971435546875, "reasoning_kl_max": 14.57275390625, "reasoning_kl_mean": 0.3637542724609375, "reward": -2.060959815979004, "reward_std": 0.23366352915763855, "rewards/TeacherKLBasedReward": -2.060959815979004, "solution_log_prob_reward": -0.3246908556902781, "step": 17, "thought_kl_scores": 7.47265625, "thought_processed_kl": 0.36383056640625, "total_teacher_likelihood_reward": -2.032823826186359, "total_tl_reward_no_entropy": -2.3665536819025874, "unprocessed_answer_log_prob/_first_quartile": -0.025334147736430168, "unprocessed_answer_log_prob/_last_quartile": -1.6530975699424744e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.24367523193359375, "unprocessed_answer_log_prob/_median": -7.30929896235466e-05, "unprocessed_answer_log_prob/_min": -8.1015625, "unprocessed_answer_log_prob/_sum": -128.4765625, "unprocessed_thought_kl/_first_quartile": 1.1405907571315765e-05, "unprocessed_thought_kl/_last_quartile": 0.22740459442138672, "unprocessed_thought_kl/_max": 14.57275390625, "unprocessed_thought_kl/_mean": 0.3637542724609375, "unprocessed_thought_kl/_median": 0.00824187695980072, "unprocessed_thought_kl/_min": -3.991635501384735, "unprocessed_thought_kl/_sum": 2327.84375 }, { "answer_log_prob_mean": -0.17073822021484375, "answer_log_prob_min": -5.10009765625, "completion_length": 5616.2021484375, "epoch": 0.017235188509874325, "grad_norm": 0.9686466063469117, "kl": 0.004296600818634033, "kl_reward": -2.113500344567001, "kl_reward_no_entropy": -2.166639386676252, "kl_scores_no_entropy": 7.1910400390625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 13.81982421875, "no_entropy_reasoning_kl_mean": 0.584014892578125, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.017763659358025e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.44476318359375, "no_entropy_unprocessed_thought_kl/_max": 13.81982421875, "no_entropy_unprocessed_thought_kl/_mean": 0.584014892578125, "no_entropy_unprocessed_thought_kl/_median": 0.011654853820800781, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1151.21875, "processed_kl_no_entropy": 0.585418701171875, "reasoning_kl_max": 13.636962890625, "reasoning_kl_mean": 0.5681304931640625, "reward": -1.9615497589111328, "reward_std": 0.19835737347602844, "rewards/TeacherKLBasedReward": -1.9615497589111328, "solution_log_prob_reward": -0.22173919761553407, "step": 18, "thought_kl_scores": 7.093017578125, "thought_processed_kl": 0.5684051513671875, "total_teacher_likelihood_reward": -2.3352394783869386, "total_tl_reward_no_entropy": -2.388378519564867, "unprocessed_answer_log_prob/_first_quartile": -0.013521671295166016, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.17073822021484375, "unprocessed_answer_log_prob/_median": -1.2969598174095154e-05, "unprocessed_answer_log_prob/_min": -5.10009765625, "unprocessed_answer_log_prob/_sum": -43.83203125, "unprocessed_thought_kl/_first_quartile": 8.213566616177559e-06, "unprocessed_thought_kl/_last_quartile": 0.4307842254638672, "unprocessed_thought_kl/_max": 13.636962890625, "unprocessed_thought_kl/_mean": 0.5681304931640625, "unprocessed_thought_kl/_median": 0.010908067226409912, "unprocessed_thought_kl/_min": -1.4371651411056519, "unprocessed_thought_kl/_sum": 1118.953125 }, { "answer_log_prob_mean": -0.14633750915527344, "answer_log_prob_min": -10.156005859375, "completion_length": 8511.8369140625, "epoch": 0.01819269898264512, "grad_norm": 1247.3636041424622, "kl": 0.818266749382019, "kl_reward": -1.2140359343029559, "kl_reward_no_entropy": -1.522982470691204, "kl_scores_no_entropy": 7.2470703125, "learning_rate": 1e-06, "loss": 0.0328, "match_reward": -0.1484375, "no_entropy_reasoning_kl_max": 14.109130859375, "no_entropy_reasoning_kl_mean": 0.36656951904296875, "no_entropy_unprocessed_thought_kl/_first_quartile": 8.113915100693703e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.18887710571289062, "no_entropy_unprocessed_thought_kl/_max": 14.109130859375, "no_entropy_unprocessed_thought_kl/_mean": 0.36656951904296875, "no_entropy_unprocessed_thought_kl/_median": 0.005923178978264332, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3218.140625, "processed_kl_no_entropy": 0.3665008544921875, "reasoning_kl_max": 12.5908203125, "reasoning_kl_mean": 0.27877044677734375, "reward": -1.9576257467269897, "reward_std": 0.2793550491333008, "rewards/TeacherKLBasedReward": -1.9576257467269897, "solution_log_prob_reward": -0.24789756548125297, "step": 19, "thought_kl_scores": 6.4373779296875, "thought_processed_kl": 0.278656005859375, "total_teacher_likelihood_reward": -1.6103710141032934, "total_tl_reward_no_entropy": -1.919317552819848, "unprocessed_answer_log_prob/_first_quartile": -0.0011936761438846588, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.14633750915527344, "unprocessed_answer_log_prob/_median": -9.406358003616333e-08, "unprocessed_answer_log_prob/_min": -10.156005859375, "unprocessed_answer_log_prob/_sum": -78.603515625, "unprocessed_thought_kl/_first_quartile": 3.7229619920253754e-07, "unprocessed_thought_kl/_last_quartile": 0.10306715965270996, "unprocessed_thought_kl/_max": 12.5908203125, "unprocessed_thought_kl/_mean": 0.27877044677734375, "unprocessed_thought_kl/_median": 0.0009911563247442245, "unprocessed_thought_kl/_min": -4.360961948521435, "unprocessed_thought_kl/_sum": 2094.328125 }, { "answer_log_prob_mean": -0.21501922607421875, "answer_log_prob_min": -9.920166015625, "completion_length": 6702.2109375, "epoch": 0.019150209455415918, "grad_norm": 2388.592050906485, "kl": 3.919822096824646, "kl_reward": -1.4240249460563064, "kl_reward_no_entropy": -1.6249108770862222, "kl_scores_no_entropy": 7.476806640625, "learning_rate": 1e-06, "loss": 0.1568, "match_reward": -0.078125, "no_entropy_reasoning_kl_max": 14.548828125, "no_entropy_reasoning_kl_mean": 0.396148681640625, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.644956111907959e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.17458081245422363, "no_entropy_unprocessed_thought_kl/_max": 14.548828125, "no_entropy_unprocessed_thought_kl/_mean": 0.396148681640625, "no_entropy_unprocessed_thought_kl/_median": 0.0013595661148428917, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2106.375, "processed_kl_no_entropy": 0.39599609375, "reasoning_kl_max": 13.55712890625, "reasoning_kl_mean": 0.33910369873046875, "reward": -2.04057240486145, "reward_std": 0.23849233984947205, "rewards/TeacherKLBasedReward": -2.04057240486145, "solution_log_prob_reward": -0.31422088376712054, "step": 20, "thought_kl_scores": 6.95068359375, "thought_processed_kl": 0.33880615234375, "total_teacher_likelihood_reward": -1.8163708318024874, "total_tl_reward_no_entropy": -2.0172567637637258, "unprocessed_answer_log_prob/_first_quartile": -0.007244325242936611, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.21501922607421875, "unprocessed_answer_log_prob/_median": -9.853392839431763e-07, "unprocessed_answer_log_prob/_min": -9.920166015625, "unprocessed_answer_log_prob/_sum": -122.04296875, "unprocessed_thought_kl/_first_quartile": 1.0896474123001099e-07, "unprocessed_thought_kl/_last_quartile": 0.12926393747329712, "unprocessed_thought_kl/_max": 13.55712890625, "unprocessed_thought_kl/_mean": 0.33910369873046875, "unprocessed_thought_kl/_median": 0.000605105422437191, "unprocessed_thought_kl/_min": -4.138885498046875, "unprocessed_thought_kl/_sum": 1591.28125 }, { "answer_log_prob_mean": -0.374298095703125, "answer_log_prob_min": -14.32568359375, "completion_length": 4703.1376953125, "epoch": 0.020107719928186715, "grad_norm": 2.0454181475363353, "kl": 0.007532268762588501, "kl_reward": -1.7092684945091605, "kl_reward_no_entropy": -2.2083288598805666, "kl_scores_no_entropy": 9.456787109375, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.28125, "no_entropy_reasoning_kl_max": 18.35888671875, "no_entropy_reasoning_kl_mean": 0.552520751953125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0004247426986694336, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.5294532775878906, "no_entropy_unprocessed_thought_kl/_max": 18.35888671875, "no_entropy_unprocessed_thought_kl/_mean": 0.552520751953125, "no_entropy_unprocessed_thought_kl/_median": 0.045750620774924755, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 5443.875, "processed_kl_no_entropy": 0.5523681640625, "reasoning_kl_max": 17.57666015625, "reasoning_kl_mean": 0.39398956298828125, "reward": -1.9534462690353394, "reward_std": 0.2375689148902893, "rewards/TeacherKLBasedReward": -1.9534462690353394, "solution_log_prob_reward": -0.5175549278501421, "step": 21, "thought_kl_scores": 8.984619140625, "thought_processed_kl": 0.39411163330078125, "total_teacher_likelihood_reward": -2.5080734295770526, "total_tl_reward_no_entropy": -3.0071337893605232, "unprocessed_answer_log_prob/_first_quartile": -0.048117876052856445, "unprocessed_answer_log_prob/_last_quartile": -2.635642886161804e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.374298095703125, "unprocessed_answer_log_prob/_median": -0.00022349506616592407, "unprocessed_answer_log_prob/_min": -14.32568359375, "unprocessed_answer_log_prob/_sum": -214.09375, "unprocessed_thought_kl/_first_quartile": 6.750458851456642e-05, "unprocessed_thought_kl/_last_quartile": 0.34681153297424316, "unprocessed_thought_kl/_max": 17.57666015625, "unprocessed_thought_kl/_mean": 0.39398956298828125, "unprocessed_thought_kl/_median": 0.018993856385350227, "unprocessed_thought_kl/_min": -6.343725204467773, "unprocessed_thought_kl/_sum": 3468.125 }, { "answer_log_prob_mean": -0.20286941528320312, "answer_log_prob_min": -5.9503173828125, "completion_length": 3941.0947265625, "epoch": 0.02106523040095751, "grad_norm": 0.46078064661862567, "kl": 0.0030318796634674072, "kl_reward": -1.7982732523232698, "kl_reward_no_entropy": -1.8195775737985969, "kl_scores_no_entropy": 6.1917724609375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.8916015625, "no_entropy_reasoning_kl_mean": 0.48760986328125, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.0769890397787094e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.23627972602844238, "no_entropy_unprocessed_thought_kl/_max": 11.8916015625, "no_entropy_unprocessed_thought_kl/_mean": 0.48760986328125, "no_entropy_unprocessed_thought_kl/_median": 0.010665877722203732, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 567.765625, "processed_kl_no_entropy": 0.4878692626953125, "reasoning_kl_max": 11.7841796875, "reasoning_kl_mean": 0.4815826416015625, "reward": -1.8680000305175781, "reward_std": 0.1958393007516861, "rewards/TeacherKLBasedReward": -1.8680000305175781, "solution_log_prob_reward": -0.2623725866433233, "step": 22, "thought_kl_scores": 6.135986328125, "thought_processed_kl": 0.481597900390625, "total_teacher_likelihood_reward": -2.060645886696875, "total_tl_reward_no_entropy": -2.0819502091035247, "unprocessed_answer_log_prob/_first_quartile": -0.03213741770014167, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.20286941528320312, "unprocessed_answer_log_prob/_median": -1.0778196156024933e-05, "unprocessed_answer_log_prob/_min": -5.9503173828125, "unprocessed_answer_log_prob/_sum": -53.751953125, "unprocessed_thought_kl/_first_quartile": 2.0625535398721695e-05, "unprocessed_thought_kl/_last_quartile": 0.23336410522460938, "unprocessed_thought_kl/_max": 11.7841796875, "unprocessed_thought_kl/_mean": 0.4815826416015625, "unprocessed_thought_kl/_median": 0.010526307858526707, "unprocessed_thought_kl/_min": -0.7774462252855301, "unprocessed_thought_kl/_sum": 554.52734375 }, { "answer_log_prob_mean": -0.08954238891601562, "answer_log_prob_min": -6.91162109375, "completion_length": 6865.12109375, "epoch": 0.022022740873728307, "grad_norm": 0.44806766437187434, "kl": 0.004395961761474609, "kl_reward": -1.4020522912032902, "kl_reward_no_entropy": -1.5013513015583158, "kl_scores_no_entropy": 6.37890625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.38330078125, "no_entropy_reasoning_kl_mean": 0.376617431640625, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.073364496231079e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.14332962036132812, "no_entropy_unprocessed_thought_kl/_max": 12.38330078125, "no_entropy_unprocessed_thought_kl/_mean": 0.376617431640625, "no_entropy_unprocessed_thought_kl/_median": 0.0006490945816040039, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 941.21875, "processed_kl_no_entropy": 0.37567901611328125, "reasoning_kl_max": 11.964111328125, "reasoning_kl_mean": 0.34770965576171875, "reward": -1.8904714584350586, "reward_std": 0.22687342762947083, "rewards/TeacherKLBasedReward": -1.8904714584350586, "solution_log_prob_reward": -0.15865859808400273, "step": 23, "thought_kl_scores": 6.15478515625, "thought_processed_kl": 0.3467864990234375, "total_teacher_likelihood_reward": -1.5607108678668737, "total_tl_reward_no_entropy": -1.6600098796188831, "unprocessed_answer_log_prob/_first_quartile": -2.639833837747574e-06, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.08954238891601562, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -6.91162109375, "unprocessed_answer_log_prob/_sum": -54.642578125, "unprocessed_thought_kl/_first_quartile": 4.6566128730773926e-09, "unprocessed_thought_kl/_last_quartile": 0.12801259756088257, "unprocessed_thought_kl/_max": 11.964111328125, "unprocessed_thought_kl/_mean": 0.34770965576171875, "unprocessed_thought_kl/_median": 0.0005800928920507431, "unprocessed_thought_kl/_min": -2.5282602505758405, "unprocessed_thought_kl/_sum": 816.09375 }, { "answer_log_prob_mean": -0.3697967529296875, "answer_log_prob_min": -11.93896484375, "completion_length": 5782.232421875, "epoch": 0.022980251346499104, "grad_norm": 0.3438790636197873, "kl": 0.005478262901306152, "kl_reward": -1.4897625474259257, "kl_reward_no_entropy": -1.8700213404372334, "kl_scores_no_entropy": 7.4130859375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.2890625, "no_entropy_reasoning_kl_max": 14.3466796875, "no_entropy_reasoning_kl_mean": 0.4798736572265625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00022043706849217415, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4119300842285156, "no_entropy_unprocessed_thought_kl/_max": 14.3466796875, "no_entropy_unprocessed_thought_kl/_mean": 0.4798736572265625, "no_entropy_unprocessed_thought_kl/_median": 0.02840423583984375, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4301.078125, "processed_kl_no_entropy": 0.4798583984375, "reasoning_kl_max": 13.493896484375, "reasoning_kl_mean": 0.3616485595703125, "reward": -1.8042333126068115, "reward_std": 0.21678432822227478, "rewards/TeacherKLBasedReward": -1.8042333126068115, "solution_log_prob_reward": -0.4891863917000592, "step": 24, "thought_kl_scores": 6.9334716796875, "thought_processed_kl": 0.3609771728515625, "total_teacher_likelihood_reward": -2.268011483363807, "total_tl_reward_no_entropy": -2.6482702791690826, "unprocessed_answer_log_prob/_first_quartile": -0.08701872825622559, "unprocessed_answer_log_prob/_last_quartile": -3.003515303134918e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3697967529296875, "unprocessed_answer_log_prob/_median": -0.00032783858478069305, "unprocessed_answer_log_prob/_min": -11.93896484375, "unprocessed_answer_log_prob/_sum": -198.078125, "unprocessed_thought_kl/_first_quartile": 1.200544647872448e-05, "unprocessed_thought_kl/_last_quartile": 0.2749767303466797, "unprocessed_thought_kl/_max": 13.493896484375, "unprocessed_thought_kl/_mean": 0.3616485595703125, "unprocessed_thought_kl/_median": 0.009480953216552734, "unprocessed_thought_kl/_min": -3.8502273559570312, "unprocessed_thought_kl/_sum": 2592.6875 }, { "answer_log_prob_mean": -0.23607635498046875, "answer_log_prob_min": -7.951171875, "completion_length": 5439.7587890625, "epoch": 0.023937761819269897, "grad_norm": 24.481123571866128, "kl": 0.012595534324645996, "kl_reward": -1.5988540388643742, "kl_reward_no_entropy": -1.9292138535529375, "kl_scores_no_entropy": 7.0950927734375, "learning_rate": 1e-06, "loss": 0.0005, "match_reward": -0.0390625, "no_entropy_reasoning_kl_max": 13.6845703125, "no_entropy_reasoning_kl_mean": 0.5062255859375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00013215257786214352, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.43082427978515625, "no_entropy_unprocessed_thought_kl/_max": 13.6845703125, "no_entropy_unprocessed_thought_kl/_mean": 0.5062255859375, "no_entropy_unprocessed_thought_kl/_median": 0.027274370193481445, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2122.125, "processed_kl_no_entropy": 0.506072998046875, "reasoning_kl_max": 12.725830078125, "reasoning_kl_mean": 0.40569305419921875, "reward": -1.847424030303955, "reward_std": 0.24988999962806702, "rewards/TeacherKLBasedReward": -1.847424030303955, "solution_log_prob_reward": -0.3155880703125149, "step": 25, "thought_kl_scores": 6.566650390625, "thought_processed_kl": 0.40564727783203125, "total_teacher_likelihood_reward": -1.9535046108067036, "total_tl_reward_no_entropy": -2.2838644245639443, "unprocessed_answer_log_prob/_first_quartile": -0.019740819931030273, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.23607635498046875, "unprocessed_answer_log_prob/_median": -9.301118552684784e-06, "unprocessed_answer_log_prob/_min": -7.951171875, "unprocessed_answer_log_prob/_sum": -79.013671875, "unprocessed_thought_kl/_first_quartile": 7.88690522313118e-06, "unprocessed_thought_kl/_last_quartile": 0.3190174102783203, "unprocessed_thought_kl/_max": 12.725830078125, "unprocessed_thought_kl/_mean": 0.40569305419921875, "unprocessed_thought_kl/_median": 0.012482106685638428, "unprocessed_thought_kl/_min": -4.45330810546875, "unprocessed_thought_kl/_sum": 1487.96875 }, { "answer_log_prob_mean": -0.10131072998046875, "answer_log_prob_min": -7.9111328125, "completion_length": 5899.0849609375, "epoch": 0.024895272292040693, "grad_norm": 0.3170153655378751, "kl": 0.005529880523681641, "kl_reward": -1.014562671072781, "kl_reward_no_entropy": -1.0781002715229988, "kl_scores_no_entropy": 6.3841552734375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0234375, "no_entropy_reasoning_kl_max": 12.544189453125, "no_entropy_reasoning_kl_mean": 0.23392486572265625, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.7939677238464355e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.026324868202209473, "no_entropy_unprocessed_thought_kl/_max": 12.544189453125, "no_entropy_unprocessed_thought_kl/_mean": 0.23392486572265625, "no_entropy_unprocessed_thought_kl/_median": 5.479808896780014e-05, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 940.796875, "processed_kl_no_entropy": 0.2337646484375, "reasoning_kl_max": 12.324462890625, "reasoning_kl_mean": 0.21494293212890625, "reward": -1.873171091079712, "reward_std": 0.2559967637062073, "rewards/TeacherKLBasedReward": -1.873171091079712, "solution_log_prob_reward": -0.18042205751407892, "step": 26, "thought_kl_scores": 6.263916015625, "thought_processed_kl": 0.21466064453125, "total_teacher_likelihood_reward": -1.2184222405776381, "total_tl_reward_no_entropy": -1.2819598414935172, "unprocessed_answer_log_prob/_first_quartile": -4.7465553507208824e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10131072998046875, "unprocessed_answer_log_prob/_median": -5.8673322200775146e-08, "unprocessed_answer_log_prob/_min": -7.9111328125, "unprocessed_answer_log_prob/_sum": -82.44140625, "unprocessed_thought_kl/_first_quartile": 4.6566128730773926e-09, "unprocessed_thought_kl/_last_quartile": 0.01809293031692505, "unprocessed_thought_kl/_max": 12.324462890625, "unprocessed_thought_kl/_mean": 0.21494293212890625, "unprocessed_thought_kl/_median": 2.188887447118759e-05, "unprocessed_thought_kl/_min": -2.04786716401577, "unprocessed_thought_kl/_sum": 825.984375 }, { "answer_log_prob_mean": -0.271240234375, "answer_log_prob_min": -8.076904296875, "completion_length": 5801.625, "epoch": 0.02585278276481149, "grad_norm": 1.5401017420586431, "kl": 0.0057413578033447266, "kl_reward": -1.6620428329333663, "kl_reward_no_entropy": -1.7990551628172398, "kl_scores_no_entropy": 7.39599609375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 14.33251953125, "no_entropy_reasoning_kl_mean": 0.45635986328125, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.597744792699814e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2769203186035156, "no_entropy_unprocessed_thought_kl/_max": 14.33251953125, "no_entropy_unprocessed_thought_kl/_mean": 0.45635986328125, "no_entropy_unprocessed_thought_kl/_median": 0.006593614816665649, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1656.53125, "processed_kl_no_entropy": 0.45623779296875, "reasoning_kl_max": 13.9814453125, "reasoning_kl_mean": 0.4141998291015625, "reward": -1.9862810373306274, "reward_std": 0.21808595955371857, "rewards/TeacherKLBasedReward": -1.9862810373306274, "solution_log_prob_reward": -0.352009276393801, "step": 27, "thought_kl_scores": 7.197021484375, "thought_processed_kl": 0.4141845703125, "total_teacher_likelihood_reward": -2.0140521125867963, "total_tl_reward_no_entropy": -2.151064437814057, "unprocessed_answer_log_prob/_first_quartile": -0.01954793930053711, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.271240234375, "unprocessed_answer_log_prob/_median": -3.2028183341026306e-06, "unprocessed_answer_log_prob/_min": -8.076904296875, "unprocessed_answer_log_prob/_sum": -114.85546875, "unprocessed_thought_kl/_first_quartile": 4.72203828394413e-06, "unprocessed_thought_kl/_last_quartile": 0.2361278533935547, "unprocessed_thought_kl/_max": 13.9814453125, "unprocessed_thought_kl/_mean": 0.4141998291015625, "unprocessed_thought_kl/_median": 0.004428401589393616, "unprocessed_thought_kl/_min": -4.129180908203125, "unprocessed_thought_kl/_sum": 1498.28125 }, { "answer_log_prob_mean": -0.15908050537109375, "answer_log_prob_min": -9.59619140625, "completion_length": 6227.755859375, "epoch": 0.026810293237582286, "grad_norm": 0.7338933933106465, "kl": 0.006051421165466309, "kl_reward": -1.2284783907234669, "kl_reward_no_entropy": -1.3433386133983731, "kl_scores_no_entropy": 5.8070068359375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0078125, "no_entropy_reasoning_kl_max": 11.27880859375, "no_entropy_reasoning_kl_mean": 0.334991455078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.2619420886039734e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.11242556571960449, "no_entropy_unprocessed_thought_kl/_max": 11.27880859375, "no_entropy_unprocessed_thought_kl/_mean": 0.334991455078125, "no_entropy_unprocessed_thought_kl/_median": 0.0008085714653134346, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 921.4453125, "processed_kl_no_entropy": 0.334197998046875, "reasoning_kl_max": 10.90625, "reasoning_kl_mean": 0.3004302978515625, "reward": -1.8966951370239258, "reward_std": 0.24390605092048645, "rewards/TeacherKLBasedReward": -1.8966951370239258, "solution_log_prob_reward": -0.2550424182554707, "step": 28, "thought_kl_scores": 5.60400390625, "thought_processed_kl": 0.2996063232421875, "total_teacher_likelihood_reward": -1.4913333235308528, "total_tl_reward_no_entropy": -1.6061935489997268, "unprocessed_answer_log_prob/_first_quartile": -0.0018934458494186401, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.15908050537109375, "unprocessed_answer_log_prob/_median": -6.416812539100647e-07, "unprocessed_answer_log_prob/_min": -9.59619140625, "unprocessed_answer_log_prob/_sum": -65.791015625, "unprocessed_thought_kl/_first_quartile": 6.798654794692993e-08, "unprocessed_thought_kl/_last_quartile": 0.0935283899307251, "unprocessed_thought_kl/_max": 10.90625, "unprocessed_thought_kl/_mean": 0.3004302978515625, "unprocessed_thought_kl/_median": 0.0006477478891611099, "unprocessed_thought_kl/_min": -3.73345947265625, "unprocessed_thought_kl/_sum": 735.7421875 }, { "answer_log_prob_mean": -0.6255645751953125, "answer_log_prob_min": -18.04150390625, "completion_length": 5369.94921875, "epoch": 0.027767803710353083, "grad_norm": 0.680201542886803, "kl": 0.006578683853149414, "kl_reward": -1.5393548547290266, "kl_reward_no_entropy": -1.9912042059004307, "kl_scores_no_entropy": 7.7449951171875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.3828125, "no_entropy_reasoning_kl_max": 14.97119140625, "no_entropy_reasoning_kl_mean": 0.5140228271484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.369221955537796e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.38315582275390625, "no_entropy_unprocessed_thought_kl/_max": 14.97119140625, "no_entropy_unprocessed_thought_kl/_mean": 0.5140228271484375, "no_entropy_unprocessed_thought_kl/_median": 0.016377925872802734, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3692.53125, "processed_kl_no_entropy": 0.513641357421875, "reasoning_kl_max": 13.590576171875, "reasoning_kl_mean": 0.3772125244140625, "reward": -1.9747979640960693, "reward_std": 0.2248312532901764, "rewards/TeacherKLBasedReward": -1.9747979640960693, "solution_log_prob_reward": -0.805979608790949, "step": 29, "thought_kl_scores": 6.9862060546875, "thought_processed_kl": 0.37712860107421875, "total_teacher_likelihood_reward": -2.7281469656154513, "total_tl_reward_no_entropy": -3.1799963135272264, "unprocessed_answer_log_prob/_first_quartile": -0.11033916473388672, "unprocessed_answer_log_prob/_last_quartile": -1.1739321053028107e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.6255645751953125, "unprocessed_answer_log_prob/_median": -0.0008077435195446014, "unprocessed_answer_log_prob/_min": -18.04150390625, "unprocessed_answer_log_prob/_sum": -276.3046875, "unprocessed_thought_kl/_first_quartile": 9.9909957498312e-06, "unprocessed_thought_kl/_last_quartile": 0.2391185760498047, "unprocessed_thought_kl/_max": 13.590576171875, "unprocessed_thought_kl/_mean": 0.3772125244140625, "unprocessed_thought_kl/_median": 0.00611644983291626, "unprocessed_thought_kl/_min": -4.84356689453125, "unprocessed_thought_kl/_sum": 2038.09375 }, { "answer_log_prob_mean": -0.24297332763671875, "answer_log_prob_min": -11.62646484375, "completion_length": 6516.8935546875, "epoch": 0.02872531418312388, "grad_norm": 1.071480251624754, "kl": 0.006301552057266235, "kl_reward": -1.243399647064507, "kl_reward_no_entropy": -1.615172409452498, "kl_scores_no_entropy": 8.106201171875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.2421875, "no_entropy_reasoning_kl_max": 15.8271484375, "no_entropy_reasoning_kl_mean": 0.38011932373046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.00963830947876e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.22579288482666016, "no_entropy_unprocessed_thought_kl/_max": 15.8271484375, "no_entropy_unprocessed_thought_kl/_mean": 0.38011932373046875, "no_entropy_unprocessed_thought_kl/_median": 0.010392609983682632, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4149.09375, "processed_kl_no_entropy": 0.380096435546875, "reasoning_kl_max": 14.78955078125, "reasoning_kl_mean": 0.266571044921875, "reward": -2.107651710510254, "reward_std": 0.24437236785888672, "rewards/TeacherKLBasedReward": -2.107651710510254, "solution_log_prob_reward": -0.35923797474242747, "step": 30, "thought_kl_scores": 7.527099609375, "thought_processed_kl": 0.2664642333984375, "total_teacher_likelihood_reward": -1.8448251336812973, "total_tl_reward_no_entropy": -2.2165979016572237, "unprocessed_answer_log_prob/_first_quartile": -0.00509204831905663, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.24297332763671875, "unprocessed_answer_log_prob/_median": -8.137896656990051e-06, "unprocessed_answer_log_prob/_min": -11.62646484375, "unprocessed_answer_log_prob/_sum": -175.734375, "unprocessed_thought_kl/_first_quartile": 4.020286723971367e-06, "unprocessed_thought_kl/_last_quartile": 0.12206530570983887, "unprocessed_thought_kl/_max": 14.78955078125, "unprocessed_thought_kl/_mean": 0.266571044921875, "unprocessed_thought_kl/_median": 0.0026796311140060425, "unprocessed_thought_kl/_min": -6.3035888671875, "unprocessed_thought_kl/_sum": 2742.4375 }, { "answer_log_prob_mean": -0.336669921875, "answer_log_prob_min": -12.052734375, "completion_length": 6151.5927734375, "epoch": 0.029682824655894675, "grad_norm": 0.8245125836838982, "kl": 0.007184267044067383, "kl_reward": -1.5548446774482727, "kl_reward_no_entropy": -1.910414439626038, "kl_scores_no_entropy": 7.519287109375, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.1015625, "no_entropy_reasoning_kl_max": 14.545654296875, "no_entropy_reasoning_kl_mean": 0.4913482666015625, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.148149259388447e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3906288146972656, "no_entropy_unprocessed_thought_kl/_max": 14.545654296875, "no_entropy_unprocessed_thought_kl/_mean": 0.4913482666015625, "no_entropy_unprocessed_thought_kl/_median": 0.019704103469848633, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3251.40625, "processed_kl_no_entropy": 0.4913177490234375, "reasoning_kl_max": 13.857421875, "reasoning_kl_mean": 0.37970733642578125, "reward": -2.0403811931610107, "reward_std": 0.26437994837760925, "rewards/TeacherKLBasedReward": -2.0403811931610107, "solution_log_prob_reward": -0.45719726430252194, "step": 31, "thought_kl_scores": 7.1180419921875, "thought_processed_kl": 0.37970733642578125, "total_teacher_likelihood_reward": -2.113604448735714, "total_tl_reward_no_entropy": -2.4691742081195116, "unprocessed_answer_log_prob/_first_quartile": -0.05645036697387695, "unprocessed_answer_log_prob/_last_quartile": -2.3748725652694702e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.336669921875, "unprocessed_answer_log_prob/_median": -0.00015547871589660645, "unprocessed_answer_log_prob/_min": -12.052734375, "unprocessed_answer_log_prob/_sum": -208.1953125, "unprocessed_thought_kl/_first_quartile": 1.113908365368843e-05, "unprocessed_thought_kl/_last_quartile": 0.2710437774658203, "unprocessed_thought_kl/_max": 13.857421875, "unprocessed_thought_kl/_mean": 0.37970733642578125, "unprocessed_thought_kl/_median": 0.008646130561828613, "unprocessed_thought_kl/_min": -5.1181488037109375, "unprocessed_thought_kl/_sum": 2269.0625 }, { "answer_log_prob_mean": -0.312896728515625, "answer_log_prob_min": -11.201416015625, "completion_length": 4966.4140625, "epoch": 0.03064033512866547, "grad_norm": 1.1610283340289145, "kl": 0.008937358856201172, "kl_reward": -1.3656518468633294, "kl_reward_no_entropy": -1.8303735191002488, "kl_scores_no_entropy": 8.694580078125, "learning_rate": 1e-06, "loss": 0.0004, "match_reward": -0.4140625, "no_entropy_reasoning_kl_max": 16.951171875, "no_entropy_reasoning_kl_mean": 0.44061279296875, "no_entropy_unprocessed_thought_kl/_first_quartile": 5.4077012464404106e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.31076717376708984, "no_entropy_unprocessed_thought_kl/_max": 16.951171875, "no_entropy_unprocessed_thought_kl/_mean": 0.44061279296875, "no_entropy_unprocessed_thought_kl/_median": 0.014629512093961239, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4408.5, "processed_kl_no_entropy": 0.4405517578125, "reasoning_kl_max": 15.4130859375, "reasoning_kl_mean": 0.30108642578125, "reward": -1.9251971244812012, "reward_std": 0.26349419355392456, "rewards/TeacherKLBasedReward": -1.9251971244812012, "solution_log_prob_reward": -0.42491088761016726, "step": 32, "thought_kl_scores": 7.8603515625, "thought_processed_kl": 0.30103302001953125, "total_teacher_likelihood_reward": -2.2046252312138677, "total_tl_reward_no_entropy": -2.669346898794174, "unprocessed_answer_log_prob/_first_quartile": -0.013798093423247337, "unprocessed_answer_log_prob/_last_quartile": -1.862645149230957e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.312896728515625, "unprocessed_answer_log_prob/_median": -2.3824162781238556e-05, "unprocessed_answer_log_prob/_min": -11.201416015625, "unprocessed_answer_log_prob/_sum": -177.98828125, "unprocessed_thought_kl/_first_quartile": 2.689659595489502e-06, "unprocessed_thought_kl/_last_quartile": 0.17926430702209473, "unprocessed_thought_kl/_max": 15.4130859375, "unprocessed_thought_kl/_mean": 0.30108642578125, "unprocessed_thought_kl/_median": 0.004705534316599369, "unprocessed_thought_kl/_min": -6.5765380859375, "unprocessed_thought_kl/_sum": 2845.875 }, { "answer_log_prob_mean": -0.20421600341796875, "answer_log_prob_min": -7.875244140625, "completion_length": 3481.3994140625, "epoch": 0.03159784560143627, "grad_norm": 0.3687183701963747, "kl": 0.0023608505725860596, "kl_reward": -1.335899951402098, "kl_reward_no_entropy": -1.4355834820307791, "kl_scores_no_entropy": 6.1512451171875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.939697265625, "no_entropy_reasoning_kl_mean": 0.359130859375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.4226104617118835e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.1448063850402832, "no_entropy_unprocessed_thought_kl/_max": 11.939697265625, "no_entropy_unprocessed_thought_kl/_mean": 0.359130859375, "no_entropy_unprocessed_thought_kl/_median": 0.0008380189538002014, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 381.5234375, "processed_kl_no_entropy": 0.3590545654296875, "reasoning_kl_max": 11.73046875, "reasoning_kl_mean": 0.32799530029296875, "reward": -1.7358273267745972, "reward_std": 0.19265222549438477, "rewards/TeacherKLBasedReward": -1.7358273267745972, "solution_log_prob_reward": -0.2829684428870678, "step": 33, "thought_kl_scores": 6.0308837890625, "thought_processed_kl": 0.3279876708984375, "total_teacher_likelihood_reward": -1.6188684022054076, "total_tl_reward_no_entropy": -1.7185519337654114, "unprocessed_answer_log_prob/_first_quartile": -0.0030419938266277313, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.20421600341796875, "unprocessed_answer_log_prob/_median": -7.040798664093018e-07, "unprocessed_answer_log_prob/_min": -7.875244140625, "unprocessed_answer_log_prob/_sum": -49.15625, "unprocessed_thought_kl/_first_quartile": 6.51925802230835e-09, "unprocessed_thought_kl/_last_quartile": 0.1216273307800293, "unprocessed_thought_kl/_max": 11.73046875, "unprocessed_thought_kl/_mean": 0.32799530029296875, "unprocessed_thought_kl/_median": 0.0004982417449355125, "unprocessed_thought_kl/_min": -2.0801219940185547, "unprocessed_thought_kl/_sum": 346.328125 }, { "answer_log_prob_mean": -0.127105712890625, "answer_log_prob_min": -9.8623046875, "completion_length": 5443.244140625, "epoch": 0.032555356074207065, "grad_norm": 0.7855045344455627, "kl": 0.0034145712852478027, "kl_reward": -1.0812945463694632, "kl_reward_no_entropy": -1.2671081447042525, "kl_scores_no_entropy": 6.723388671875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.09375, "no_entropy_reasoning_kl_max": 13.150634765625, "no_entropy_reasoning_kl_mean": 0.290863037109375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.4808028936386108e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.0812678337097168, "no_entropy_unprocessed_thought_kl/_max": 13.150634765625, "no_entropy_unprocessed_thought_kl/_mean": 0.290863037109375, "no_entropy_unprocessed_thought_kl/_median": 0.0002776114270091057, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1806.15625, "processed_kl_no_entropy": 0.290863037109375, "reasoning_kl_max": 12.6865234375, "reasoning_kl_mean": 0.2335662841796875, "reward": -1.727834939956665, "reward_std": 0.2525476813316345, "rewards/TeacherKLBasedReward": -1.727834939956665, "solution_log_prob_reward": -0.22572875762125477, "step": 34, "thought_kl_scores": 6.4619140625, "thought_processed_kl": 0.23355865478515625, "total_teacher_likelihood_reward": -1.4007733021862805, "total_tl_reward_no_entropy": -1.586586905643344, "unprocessed_answer_log_prob/_first_quartile": -0.00010834797285497189, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.127105712890625, "unprocessed_answer_log_prob/_median": -1.3783574104309082e-07, "unprocessed_answer_log_prob/_min": -9.8623046875, "unprocessed_answer_log_prob/_sum": -79.5048828125, "unprocessed_thought_kl/_first_quartile": 2.1653249859809875e-08, "unprocessed_thought_kl/_last_quartile": 0.050031304359436035, "unprocessed_thought_kl/_max": 12.6865234375, "unprocessed_thought_kl/_mean": 0.2335662841796875, "unprocessed_thought_kl/_median": 8.228607475757599e-05, "unprocessed_thought_kl/_min": -4.246218681335449, "unprocessed_thought_kl/_sum": 1297.3984375 }, { "answer_log_prob_mean": -0.2646141052246094, "answer_log_prob_min": -8.427490234375, "completion_length": 6669.291015625, "epoch": 0.033512866546977854, "grad_norm": 0.4473331660026175, "kl": 0.003653883934020996, "kl_reward": -1.383345308713615, "kl_reward_no_entropy": -1.5894341776147485, "kl_scores_no_entropy": 7.374267578125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.046875, "no_entropy_reasoning_kl_max": 14.36572265625, "no_entropy_reasoning_kl_mean": 0.3861541748046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.3940734788775444e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.17424726486206055, "no_entropy_unprocessed_thought_kl/_max": 14.36572265625, "no_entropy_unprocessed_thought_kl/_mean": 0.3861541748046875, "no_entropy_unprocessed_thought_kl/_median": 0.004666251130402088, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2115.3125, "processed_kl_no_entropy": 0.3861236572265625, "reasoning_kl_max": 13.501953125, "reasoning_kl_mean": 0.3260955810546875, "reward": -1.9000990390777588, "reward_std": 0.262474000453949, "rewards/TeacherKLBasedReward": -1.9000990390777588, "solution_log_prob_reward": -0.3488890044391155, "step": 35, "thought_kl_scores": 6.9111328125, "thought_processed_kl": 0.32619476318359375, "total_teacher_likelihood_reward": -1.7791092991828918, "total_tl_reward_no_entropy": -1.985198175534606, "unprocessed_answer_log_prob/_first_quartile": -0.029193921014666557, "unprocessed_answer_log_prob/_last_quartile": -4.0046870708465576e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2646141052246094, "unprocessed_answer_log_prob/_median": -0.00011116079986095428, "unprocessed_answer_log_prob/_min": -8.427490234375, "unprocessed_answer_log_prob/_sum": -168.8359375, "unprocessed_thought_kl/_first_quartile": 2.5224871933460236e-06, "unprocessed_thought_kl/_last_quartile": 0.11638736724853516, "unprocessed_thought_kl/_max": 13.501953125, "unprocessed_thought_kl/_mean": 0.3260955810546875, "unprocessed_thought_kl/_median": 0.0016458192840218544, "unprocessed_thought_kl/_min": -3.727365016937256, "unprocessed_thought_kl/_sum": 1617.84375 }, { "answer_log_prob_mean": -0.08678436279296875, "answer_log_prob_min": -7.114501953125, "completion_length": 4121.203125, "epoch": 0.03447037701974865, "grad_norm": 1.3715383404882309, "kl": 0.003298342227935791, "kl_reward": -1.376758108381182, "kl_reward_no_entropy": -1.4198153605684638, "kl_scores_no_entropy": 7.077880859375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 13.82421875, "no_entropy_reasoning_kl_mean": 0.33502960205078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.7159618437290192e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10207927227020264, "no_entropy_unprocessed_thought_kl/_max": 13.82421875, "no_entropy_unprocessed_thought_kl/_mean": 0.33502960205078125, "no_entropy_unprocessed_thought_kl/_median": 0.0009801508858799934, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 438.3203125, "processed_kl_no_entropy": 0.3350830078125, "reasoning_kl_max": 13.803466796875, "reasoning_kl_mean": 0.32088470458984375, "reward": -1.7795765399932861, "reward_std": 0.20026755332946777, "rewards/TeacherKLBasedReward": -1.7795765399932861, "solution_log_prob_reward": -0.15792938123922795, "step": 36, "thought_kl_scores": 7.0589599609375, "thought_processed_kl": 0.3214759826660156, "total_teacher_likelihood_reward": -1.5346874790266156, "total_tl_reward_no_entropy": -1.5777447307482362, "unprocessed_answer_log_prob/_first_quartile": -2.2926833480596542e-06, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.08678436279296875, "unprocessed_answer_log_prob/_median": -1.862645149230957e-09, "unprocessed_answer_log_prob/_min": -7.114501953125, "unprocessed_answer_log_prob/_sum": -47.654296875, "unprocessed_thought_kl/_first_quartile": 1.210719347000122e-07, "unprocessed_thought_kl/_last_quartile": 0.09506036341190338, "unprocessed_thought_kl/_max": 13.803466796875, "unprocessed_thought_kl/_mean": 0.32088470458984375, "unprocessed_thought_kl/_median": 0.0009177634492516518, "unprocessed_thought_kl/_min": -1.7002490535378456, "unprocessed_thought_kl/_sum": 399.4140625 }, { "answer_log_prob_mean": -0.21288681030273438, "answer_log_prob_min": -8.5572509765625, "completion_length": 5328.6826171875, "epoch": 0.03542788749251945, "grad_norm": 0.3962855651189272, "kl": 0.0034644007682800293, "kl_reward": -1.3930371049791574, "kl_reward_no_entropy": -1.6535888640210032, "kl_scores_no_entropy": 7.1197509765625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 13.8232421875, "no_entropy_reasoning_kl_mean": 0.4129638671875, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.5552231818437576e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.25836944580078125, "no_entropy_unprocessed_thought_kl/_max": 13.8232421875, "no_entropy_unprocessed_thought_kl/_mean": 0.4129638671875, "no_entropy_unprocessed_thought_kl/_median": 0.009146876633167267, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1274.5, "processed_kl_no_entropy": 0.4128570556640625, "reasoning_kl_max": 13.054443359375, "reasoning_kl_mean": 0.33380126953125, "reward": -1.7345645427703857, "reward_std": 0.22955447435379028, "rewards/TeacherKLBasedReward": -1.7345645427703857, "solution_log_prob_reward": -0.2984593167784624, "step": 37, "thought_kl_scores": 6.6934814453125, "thought_processed_kl": 0.33380126953125, "total_teacher_likelihood_reward": -1.6914964206516743, "total_tl_reward_no_entropy": -1.9520481815561652, "unprocessed_answer_log_prob/_first_quartile": -0.009911462664604187, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.21288681030273438, "unprocessed_answer_log_prob/_median": -4.98257577419281e-06, "unprocessed_answer_log_prob/_min": -8.5572509765625, "unprocessed_answer_log_prob/_sum": -103.1474609375, "unprocessed_thought_kl/_first_quartile": 9.947223588824272e-06, "unprocessed_thought_kl/_last_quartile": 0.1871051788330078, "unprocessed_thought_kl/_max": 13.054443359375, "unprocessed_thought_kl/_mean": 0.33380126953125, "unprocessed_thought_kl/_median": 0.005231976509094238, "unprocessed_thought_kl/_min": -4.148231506347656, "unprocessed_thought_kl/_sum": 960.9296875 }, { "answer_log_prob_mean": -0.08989810943603516, "answer_log_prob_min": -4.6712646484375, "completion_length": 4364.6884765625, "epoch": 0.03638539796529024, "grad_norm": 43.090458218617755, "kl": 0.03820580244064331, "kl_reward": -1.3131655752658844, "kl_reward_no_entropy": -1.4417962478473783, "kl_scores_no_entropy": 6.28173828125, "learning_rate": 1e-06, "loss": 0.0015, "match_reward": -0.015625, "no_entropy_reasoning_kl_max": 12.21240234375, "no_entropy_reasoning_kl_mean": 0.3584747314453125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.8137507140636444e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.11544251441955566, "no_entropy_unprocessed_thought_kl/_max": 12.21240234375, "no_entropy_unprocessed_thought_kl/_mean": 0.3584747314453125, "no_entropy_unprocessed_thought_kl/_median": 0.0009328983724117279, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 893.640625, "processed_kl_no_entropy": 0.3577423095703125, "reasoning_kl_max": 11.30224609375, "reasoning_kl_mean": 0.32469940185546875, "reward": -1.6468842029571533, "reward_std": 0.2006833702325821, "rewards/TeacherKLBasedReward": -1.6468842029571533, "solution_log_prob_reward": -0.13661075467825867, "step": 38, "thought_kl_scores": 5.8106689453125, "thought_processed_kl": 0.32464599609375, "total_teacher_likelihood_reward": -1.4654013235121965, "total_tl_reward_no_entropy": -1.594031997025013, "unprocessed_answer_log_prob/_first_quartile": -1.790374517440796e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.08989810943603516, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -4.6712646484375, "unprocessed_answer_log_prob/_sum": -34.5068359375, "unprocessed_thought_kl/_first_quartile": 6.51925802230835e-08, "unprocessed_thought_kl/_last_quartile": 0.09389686584472656, "unprocessed_thought_kl/_max": 11.30224609375, "unprocessed_thought_kl/_mean": 0.32469940185546875, "unprocessed_thought_kl/_median": 0.000593181699514389, "unprocessed_thought_kl/_min": -2.4994990825653076, "unprocessed_thought_kl/_sum": 710.53125 }, { "answer_log_prob_mean": -0.4458160400390625, "answer_log_prob_min": -15.914306640625, "completion_length": 7742.08984375, "epoch": 0.03734290843806104, "grad_norm": 0.30597219223303557, "kl": 0.00324249267578125, "kl_reward": -1.2250524796545506, "kl_reward_no_entropy": -1.9386346321552992, "kl_scores_no_entropy": 8.67529296875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.65625, "no_entropy_reasoning_kl_max": 16.8779296875, "no_entropy_reasoning_kl_mean": 0.4774322509765625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0004329504445195198, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.39646434783935547, "no_entropy_unprocessed_thought_kl/_max": 16.8779296875, "no_entropy_unprocessed_thought_kl/_mean": 0.4774322509765625, "no_entropy_unprocessed_thought_kl/_median": 0.031424134969711304, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 7231.0625, "processed_kl_no_entropy": 0.477447509765625, "reasoning_kl_max": 14.4892578125, "reasoning_kl_mean": 0.263458251953125, "reward": -2.0216965675354004, "reward_std": 0.315352201461792, "rewards/TeacherKLBasedReward": -2.0216965675354004, "solution_log_prob_reward": -0.6049591033952311, "step": 39, "thought_kl_scores": 7.373779296875, "thought_processed_kl": 0.2634429931640625, "total_teacher_likelihood_reward": -2.486261587589979, "total_tl_reward_no_entropy": -3.199843741953373, "unprocessed_answer_log_prob/_first_quartile": -0.0708247721195221, "unprocessed_answer_log_prob/_last_quartile": -4.5634806156158447e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4458160400390625, "unprocessed_answer_log_prob/_median": -0.00047805625945329666, "unprocessed_answer_log_prob/_min": -15.914306640625, "unprocessed_answer_log_prob/_sum": -359.21875, "unprocessed_thought_kl/_first_quartile": 5.958136171102524e-07, "unprocessed_thought_kl/_last_quartile": 0.16521358489990234, "unprocessed_thought_kl/_max": 14.4892578125, "unprocessed_thought_kl/_mean": 0.263458251953125, "unprocessed_thought_kl/_median": 0.0031337812542915344, "unprocessed_thought_kl/_min": -6.6990966796875, "unprocessed_thought_kl/_sum": 3890.125 }, { "answer_log_prob_mean": -0.24734878540039062, "answer_log_prob_min": -9.557861328125, "completion_length": 5497.0302734375, "epoch": 0.038300418910831836, "grad_norm": 0.32799540149482914, "kl": 0.0036551356315612793, "kl_reward": -1.515451346989721, "kl_reward_no_entropy": -1.7724206484854221, "kl_scores_no_entropy": 7.251708984375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.03125, "no_entropy_reasoning_kl_max": 14.0458984375, "no_entropy_reasoning_kl_mean": 0.450347900390625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00022165034897625446, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.30166006088256836, "no_entropy_unprocessed_thought_kl/_max": 14.0458984375, "no_entropy_unprocessed_thought_kl/_mean": 0.450347900390625, "no_entropy_unprocessed_thought_kl/_median": 0.017883948981761932, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1977.5, "processed_kl_no_entropy": 0.4506683349609375, "reasoning_kl_max": 13.3408203125, "reasoning_kl_mean": 0.37174224853515625, "reward": -1.9314584732055664, "reward_std": 0.2311333417892456, "rewards/TeacherKLBasedReward": -1.9314584732055664, "solution_log_prob_reward": -0.34292739629745483, "step": 40, "thought_kl_scores": 6.8603515625, "thought_processed_kl": 0.37203216552734375, "total_teacher_likelihood_reward": -1.8896287390962243, "total_tl_reward_no_entropy": -2.1465980345383286, "unprocessed_answer_log_prob/_first_quartile": -0.02500108629465103, "unprocessed_answer_log_prob/_last_quartile": -8.847564458847046e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.24734878540039062, "unprocessed_answer_log_prob/_median": -5.9145502746105194e-05, "unprocessed_answer_log_prob/_min": -9.557861328125, "unprocessed_answer_log_prob/_sum": -129.73046875, "unprocessed_thought_kl/_first_quartile": 4.557706415653229e-05, "unprocessed_thought_kl/_last_quartile": 0.22100520133972168, "unprocessed_thought_kl/_max": 13.3408203125, "unprocessed_thought_kl/_mean": 0.37174224853515625, "unprocessed_thought_kl/_median": 0.00830429419875145, "unprocessed_thought_kl/_min": -4.01781439781189, "unprocessed_thought_kl/_sum": 1520.90625 }, { "answer_log_prob_mean": -0.38683319091796875, "answer_log_prob_min": -16.069091796875, "completion_length": 5902.9482421875, "epoch": 0.03925792938360263, "grad_norm": 1.2922041931903694, "kl": 0.004740595817565918, "kl_reward": -1.3125732280313969, "kl_reward_no_entropy": -1.8289324836805463, "kl_scores_no_entropy": 7.9345703125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.3046875, "no_entropy_reasoning_kl_max": 15.41845703125, "no_entropy_reasoning_kl_mean": 0.4554595947265625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.000244886614382267, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3500051498413086, "no_entropy_unprocessed_thought_kl/_max": 15.41845703125, "no_entropy_unprocessed_thought_kl/_mean": 0.4554595947265625, "no_entropy_unprocessed_thought_kl/_median": 0.02295115776360035, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4710.125, "processed_kl_no_entropy": 0.4553375244140625, "reasoning_kl_max": 14.156494140625, "reasoning_kl_mean": 0.29595947265625, "reward": -2.097391366958618, "reward_std": 0.2091302126646042, "rewards/TeacherKLBasedReward": -2.097391366958618, "solution_log_prob_reward": -0.5475241052918136, "step": 41, "thought_kl_scores": 7.22607421875, "thought_processed_kl": 0.29584503173828125, "total_teacher_likelihood_reward": -2.1647848272696137, "total_tl_reward_no_entropy": -2.6811440866440535, "unprocessed_answer_log_prob/_first_quartile": -0.06183910369873047, "unprocessed_answer_log_prob/_last_quartile": -8.312053978443146e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.38683319091796875, "unprocessed_answer_log_prob/_median": -0.00010887067764997482, "unprocessed_answer_log_prob/_min": -16.069091796875, "unprocessed_answer_log_prob/_sum": -167.32421875, "unprocessed_thought_kl/_first_quartile": 1.7996178939938545e-05, "unprocessed_thought_kl/_last_quartile": 0.19469952583312988, "unprocessed_thought_kl/_max": 14.156494140625, "unprocessed_thought_kl/_mean": 0.29595947265625, "unprocessed_thought_kl/_median": 0.007742021232843399, "unprocessed_thought_kl/_min": -5.7047119140625, "unprocessed_thought_kl/_sum": 2878.0625 }, { "answer_log_prob_mean": -0.37237548828125, "answer_log_prob_min": -14.798583984375, "completion_length": 6041.2080078125, "epoch": 0.04021543985637343, "grad_norm": 39.879668779865334, "kl": 0.021791577339172363, "kl_reward": -1.4739944166503847, "kl_reward_no_entropy": -1.7936553712934256, "kl_scores_no_entropy": 7.570556640625, "learning_rate": 1e-06, "loss": 0.0009, "match_reward": -0.0234375, "no_entropy_reasoning_kl_max": 14.68505859375, "no_entropy_reasoning_kl_mean": 0.4510345458984375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.0380521416664124e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2787590026855469, "no_entropy_unprocessed_thought_kl/_max": 14.68505859375, "no_entropy_unprocessed_thought_kl/_mean": 0.4510345458984375, "no_entropy_unprocessed_thought_kl/_median": 0.00788375735282898, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2304.5, "processed_kl_no_entropy": 0.4510345458984375, "reasoning_kl_max": 13.974609375, "reasoning_kl_mean": 0.35158538818359375, "reward": -1.9199696779251099, "reward_std": 0.23330266773700714, "rewards/TeacherKLBasedReward": -1.9199696779251099, "solution_log_prob_reward": -0.5203613252379, "step": 42, "thought_kl_scores": 7.165283203125, "thought_processed_kl": 0.3515167236328125, "total_teacher_likelihood_reward": -2.0177932493388653, "total_tl_reward_no_entropy": -2.3374541960656643, "unprocessed_answer_log_prob/_first_quartile": -0.017995357513427734, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.37237548828125, "unprocessed_answer_log_prob/_median": -3.1739473342895508e-06, "unprocessed_answer_log_prob/_min": -14.798583984375, "unprocessed_answer_log_prob/_sum": -196.9921875, "unprocessed_thought_kl/_first_quartile": 5.268491804599762e-06, "unprocessed_thought_kl/_last_quartile": 0.18343067169189453, "unprocessed_thought_kl/_max": 13.974609375, "unprocessed_thought_kl/_mean": 0.35158538818359375, "unprocessed_thought_kl/_median": 0.003775753080844879, "unprocessed_thought_kl/_min": -4.0025177001953125, "unprocessed_thought_kl/_sum": 1584.1875 }, { "answer_log_prob_mean": -0.444793701171875, "answer_log_prob_min": -12.536865234375, "completion_length": 6973.1982421875, "epoch": 0.041172950329144226, "grad_norm": 1.9432888148439018, "kl": 0.004826903343200684, "kl_reward": -1.239494004752487, "kl_reward_no_entropy": -1.9071331657469273, "kl_scores_no_entropy": 8.26220703125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.2734375, "no_entropy_reasoning_kl_max": 16.0537109375, "no_entropy_reasoning_kl_mean": 0.4751739501953125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00013489136472344398, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3968772888183594, "no_entropy_unprocessed_thought_kl/_max": 16.0537109375, "no_entropy_unprocessed_thought_kl/_mean": 0.4751739501953125, "no_entropy_unprocessed_thought_kl/_median": 0.024457931518554688, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 5041.875, "processed_kl_no_entropy": 0.4749298095703125, "reasoning_kl_max": 14.70361328125, "reasoning_kl_mean": 0.2661285400390625, "reward": -1.8218159675598145, "reward_std": 0.27830398082733154, "rewards/TeacherKLBasedReward": -1.8218159675598145, "solution_log_prob_reward": -0.5701623521745205, "step": 43, "thought_kl_scores": 7.4873046875, "thought_processed_kl": 0.2662353515625, "total_teacher_likelihood_reward": -2.083093848079443, "total_tl_reward_no_entropy": -2.7507330123335123, "unprocessed_answer_log_prob/_first_quartile": -0.13948345184326172, "unprocessed_answer_log_prob/_last_quartile": -6.782356649637222e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.444793701171875, "unprocessed_answer_log_prob/_median": -0.0012442022562026978, "unprocessed_answer_log_prob/_min": -12.536865234375, "unprocessed_answer_log_prob/_sum": -253.1875, "unprocessed_thought_kl/_first_quartile": 9.469222277402878e-07, "unprocessed_thought_kl/_last_quartile": 0.17471694946289062, "unprocessed_thought_kl/_max": 14.70361328125, "unprocessed_thought_kl/_mean": 0.2661285400390625, "unprocessed_thought_kl/_median": 0.003325343132019043, "unprocessed_thought_kl/_min": -5.6971435546875, "unprocessed_thought_kl/_sum": 2726.1875 }, { "answer_log_prob_mean": -0.4260406494140625, "answer_log_prob_min": -14.2216796875, "completion_length": 4608.513671875, "epoch": 0.04213046080191502, "grad_norm": 8.906458150175885, "kl": 0.012569189071655273, "kl_reward": -1.7565792759414762, "kl_reward_no_entropy": -2.2135368273593485, "kl_scores_no_entropy": 8.80029296875, "learning_rate": 1e-06, "loss": 0.0005, "match_reward": -0.375, "no_entropy_reasoning_kl_max": 17.0458984375, "no_entropy_reasoning_kl_mean": 0.5673866271972656, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00017193169333040714, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.46305465791374445, "no_entropy_unprocessed_thought_kl/_max": 17.0458984375, "no_entropy_unprocessed_thought_kl/_mean": 0.5673866271972656, "no_entropy_unprocessed_thought_kl/_median": 0.02946114633232355, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4569.71875, "processed_kl_no_entropy": 0.5676002502441406, "reasoning_kl_max": 15.580078125, "reasoning_kl_mean": 0.42972564697265625, "reward": -1.941602349281311, "reward_std": 0.21179929375648499, "rewards/TeacherKLBasedReward": -1.941602349281311, "solution_log_prob_reward": -0.5682574443053454, "step": 44, "thought_kl_scores": 8.000732421875, "thought_processed_kl": 0.42971038818359375, "total_teacher_likelihood_reward": -2.6998367169871926, "total_tl_reward_no_entropy": -3.156794272363186, "unprocessed_answer_log_prob/_first_quartile": -0.04910683631896973, "unprocessed_answer_log_prob/_last_quartile": -1.1548399925231934e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4260406494140625, "unprocessed_answer_log_prob/_median": -0.00016983691602945328, "unprocessed_answer_log_prob/_min": -14.2216796875, "unprocessed_answer_log_prob/_sum": -270.65234375, "unprocessed_thought_kl/_first_quartile": 2.196943387389183e-05, "unprocessed_thought_kl/_last_quartile": 0.3125991765409708, "unprocessed_thought_kl/_max": 15.580078125, "unprocessed_thought_kl/_mean": 0.42972564697265625, "unprocessed_thought_kl/_median": 0.01281573437154293, "unprocessed_thought_kl/_min": -4.882791042327881, "unprocessed_thought_kl/_sum": 2590.71875 }, { "answer_log_prob_mean": -0.326324462890625, "answer_log_prob_min": -9.5772705078125, "completion_length": 5591.828125, "epoch": 0.04308797127468582, "grad_norm": 1.5120410909748079, "kl": 0.0034488439559936523, "kl_reward": -1.7703552371822298, "kl_reward_no_entropy": -1.9608526742085814, "kl_scores_no_entropy": 7.72216796875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0234375, "no_entropy_reasoning_kl_max": 14.93603515625, "no_entropy_reasoning_kl_mean": 0.5042572021484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.29064117372036e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3045644760131836, "no_entropy_unprocessed_thought_kl/_max": 14.93603515625, "no_entropy_unprocessed_thought_kl/_mean": 0.5042572021484375, "no_entropy_unprocessed_thought_kl/_median": 0.009938426315784454, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1722.296875, "processed_kl_no_entropy": 0.5054779052734375, "reasoning_kl_max": 14.1357421875, "reasoning_kl_mean": 0.448760986328125, "reward": -1.956151008605957, "reward_std": 0.2553611397743225, "rewards/TeacherKLBasedReward": -1.956151008605957, "solution_log_prob_reward": -0.42209716816432774, "step": 45, "thought_kl_scores": 7.29248046875, "thought_processed_kl": 0.44890594482421875, "total_teacher_likelihood_reward": -2.2158899353817105, "total_tl_reward_no_entropy": -2.406387365423143, "unprocessed_answer_log_prob/_first_quartile": -0.05310964584350586, "unprocessed_answer_log_prob/_last_quartile": -4.0978193283081055e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.326324462890625, "unprocessed_answer_log_prob/_median": -3.176834434270859e-05, "unprocessed_answer_log_prob/_min": -9.5772705078125, "unprocessed_answer_log_prob/_sum": -96.9375, "unprocessed_thought_kl/_first_quartile": 4.579778760671616e-06, "unprocessed_thought_kl/_last_quartile": 0.2619309425354004, "unprocessed_thought_kl/_max": 14.1357421875, "unprocessed_thought_kl/_mean": 0.448760986328125, "unprocessed_thought_kl/_median": 0.008336457423865795, "unprocessed_thought_kl/_min": -3.6654052734375, "unprocessed_thought_kl/_sum": 1304.125 }, { "answer_log_prob_mean": -0.26973724365234375, "answer_log_prob_min": -9.9404296875, "completion_length": 5318.50390625, "epoch": 0.044045481747456615, "grad_norm": 0.6897069137880592, "kl": 0.004994750022888184, "kl_reward": -1.4103863527998328, "kl_reward_no_entropy": -1.948952621780336, "kl_scores_no_entropy": 7.447998046875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 14.3974609375, "no_entropy_reasoning_kl_mean": 0.50567626953125, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.522680193185806e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4316139221191406, "no_entropy_unprocessed_thought_kl/_max": 14.3974609375, "no_entropy_unprocessed_thought_kl/_mean": 0.50567626953125, "no_entropy_unprocessed_thought_kl/_median": 0.025480031967163086, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2696.5625, "processed_kl_no_entropy": 0.5055084228515625, "reasoning_kl_max": 13.46337890625, "reasoning_kl_mean": 0.3354949951171875, "reward": -1.767851710319519, "reward_std": 0.17136019468307495, "rewards/TeacherKLBasedReward": -1.767851710319519, "solution_log_prob_reward": -0.369141539093107, "step": 46, "thought_kl_scores": 6.900146484375, "thought_processed_kl": 0.3353118896484375, "total_teacher_likelihood_reward": -1.7795278951525688, "total_tl_reward_no_entropy": -2.3180941715836525, "unprocessed_answer_log_prob/_first_quartile": -0.023833036422729492, "unprocessed_answer_log_prob/_last_quartile": -1.3969838619232178e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.26973724365234375, "unprocessed_answer_log_prob/_median": -2.049468457698822e-05, "unprocessed_answer_log_prob/_min": -9.9404296875, "unprocessed_answer_log_prob/_sum": -177.45703125, "unprocessed_thought_kl/_first_quartile": 3.600027412176132e-06, "unprocessed_thought_kl/_last_quartile": 0.24074363708496094, "unprocessed_thought_kl/_max": 13.46337890625, "unprocessed_thought_kl/_mean": 0.3354949951171875, "unprocessed_thought_kl/_median": 0.005900293588638306, "unprocessed_thought_kl/_min": -5.367431640625, "unprocessed_thought_kl/_sum": 1728.25 }, { "answer_log_prob_mean": -0.16436386108398438, "answer_log_prob_min": -9.42822265625, "completion_length": 7116.349609375, "epoch": 0.04500299222022741, "grad_norm": 0.8010051706682997, "kl": 0.0054912567138671875, "kl_reward": -1.3126986473798752, "kl_reward_no_entropy": -1.6569250235334039, "kl_scores_no_entropy": 7.718994140625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.2109375, "no_entropy_reasoning_kl_max": 15.02392578125, "no_entropy_reasoning_kl_mean": 0.402069091796875, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.399117849767208e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.22354793548583984, "no_entropy_unprocessed_thought_kl/_max": 15.02392578125, "no_entropy_unprocessed_thought_kl/_mean": 0.402069091796875, "no_entropy_unprocessed_thought_kl/_median": 0.009190816432237625, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3503.4375, "processed_kl_no_entropy": 0.401947021484375, "reasoning_kl_max": 13.921875, "reasoning_kl_mean": 0.29834747314453125, "reward": -1.8180336952209473, "reward_std": 0.2656768560409546, "rewards/TeacherKLBasedReward": -1.8180336952209473, "solution_log_prob_reward": -0.25864608702249825, "step": 47, "thought_kl_scores": 7.107177734375, "thought_processed_kl": 0.2983856201171875, "total_teacher_likelihood_reward": -1.782282237894833, "total_tl_reward_no_entropy": -2.126508615911007, "unprocessed_answer_log_prob/_first_quartile": -0.008475000970065594, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.16436386108398438, "unprocessed_answer_log_prob/_median": -4.24310564994812e-06, "unprocessed_answer_log_prob/_min": -9.42822265625, "unprocessed_answer_log_prob/_sum": -114.388671875, "unprocessed_thought_kl/_first_quartile": 1.6086269170045853e-06, "unprocessed_thought_kl/_last_quartile": 0.12265205383300781, "unprocessed_thought_kl/_max": 13.921875, "unprocessed_thought_kl/_mean": 0.29834747314453125, "unprocessed_thought_kl/_median": 0.001890174113214016, "unprocessed_thought_kl/_min": -4.248046875, "unprocessed_thought_kl/_sum": 2252.03125 }, { "answer_log_prob_mean": -0.343292236328125, "answer_log_prob_min": -12.148681640625, "completion_length": 9240.0205078125, "epoch": 0.04596050269299821, "grad_norm": 2.871190502551693, "kl": 0.010389089584350586, "kl_reward": -1.2165380790829659, "kl_reward_no_entropy": -1.5559606878086925, "kl_scores_no_entropy": 6.962646484375, "learning_rate": 1e-06, "loss": 0.0004, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 13.538330078125, "no_entropy_reasoning_kl_mean": 0.383270263671875, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.937211871147156e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.20276260375976562, "no_entropy_unprocessed_thought_kl/_max": 13.538330078125, "no_entropy_unprocessed_thought_kl/_mean": 0.383270263671875, "no_entropy_unprocessed_thought_kl/_median": 0.002657979726791382, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1871.59375, "processed_kl_no_entropy": 0.3833160400390625, "reasoning_kl_max": 12.591064453125, "reasoning_kl_mean": 0.27960205078125, "reward": -1.8826024532318115, "reward_std": 0.32833802700042725, "rewards/TeacherKLBasedReward": -1.8826024532318115, "solution_log_prob_reward": -0.4647790470626205, "step": 48, "thought_kl_scores": 6.43505859375, "thought_processed_kl": 0.279632568359375, "total_teacher_likelihood_reward": -1.6813171217218041, "total_tl_reward_no_entropy": -2.0207397332414985, "unprocessed_answer_log_prob/_first_quartile": -0.021289825439453125, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.343292236328125, "unprocessed_answer_log_prob/_median": -3.3350661396980286e-06, "unprocessed_answer_log_prob/_min": -12.148681640625, "unprocessed_answer_log_prob/_sum": -159.7421875, "unprocessed_thought_kl/_first_quartile": 6.426125764846802e-08, "unprocessed_thought_kl/_last_quartile": 0.1186361312866211, "unprocessed_thought_kl/_max": 12.591064453125, "unprocessed_thought_kl/_mean": 0.27960205078125, "unprocessed_thought_kl/_median": 0.0006442703306674957, "unprocessed_thought_kl/_min": -4.6224212646484375, "unprocessed_thought_kl/_sum": 1292.828125 }, { "answer_log_prob_mean": -0.10714340209960938, "answer_log_prob_min": -6.88330078125, "completion_length": 4438.0576171875, "epoch": 0.046918013165769, "grad_norm": 0.40114333373778904, "kl": 0.003406703472137451, "kl_reward": -1.9060941082425416, "kl_reward_no_entropy": -1.9899060586467385, "kl_scores_no_entropy": 9.5411376953125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 18.6220703125, "no_entropy_reasoning_kl_mean": 0.477081298828125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.4469726011157036e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.27919435501098633, "no_entropy_unprocessed_thought_kl/_max": 18.6220703125, "no_entropy_unprocessed_thought_kl/_mean": 0.477081298828125, "no_entropy_unprocessed_thought_kl/_median": 0.009118372574448586, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 737.1875, "processed_kl_no_entropy": 0.47719573974609375, "reasoning_kl_max": 18.313232421875, "reasoning_kl_mean": 0.45223236083984375, "reward": -1.8642020225524902, "reward_std": 0.20896290242671967, "rewards/TeacherKLBasedReward": -1.8642020225524902, "solution_log_prob_reward": -0.1759764093440026, "step": 49, "thought_kl_scores": 9.375244140625, "thought_processed_kl": 0.45218658447265625, "total_teacher_likelihood_reward": -2.0820705350488424, "total_tl_reward_no_entropy": -2.165882483124733, "unprocessed_answer_log_prob/_first_quartile": -0.0005495704244822264, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10714340209960938, "unprocessed_answer_log_prob/_median": -1.9827857613563538e-06, "unprocessed_answer_log_prob/_min": -6.88330078125, "unprocessed_answer_log_prob/_sum": -54.931640625, "unprocessed_thought_kl/_first_quartile": 1.368415541946888e-05, "unprocessed_thought_kl/_last_quartile": 0.26599979400634766, "unprocessed_thought_kl/_max": 18.313232421875, "unprocessed_thought_kl/_mean": 0.45223236083984375, "unprocessed_thought_kl/_median": 0.008997421711683273, "unprocessed_thought_kl/_min": -1.6595714092254639, "unprocessed_thought_kl/_sum": 685.1796875 }, { "answer_log_prob_mean": -0.3388519287109375, "answer_log_prob_min": -11.5205078125, "completion_length": 4838.1279296875, "epoch": 0.047875523638539794, "grad_norm": 0.5032308374273473, "kl": 0.005939960479736328, "kl_reward": -1.134564506355673, "kl_reward_no_entropy": -1.5872799586504698, "kl_scores_no_entropy": 8.306640625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.359375, "no_entropy_reasoning_kl_max": 16.25390625, "no_entropy_reasoning_kl_mean": 0.36655426025390625, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.1890969946980476e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19569528102874756, "no_entropy_unprocessed_thought_kl/_max": 16.25390625, "no_entropy_unprocessed_thought_kl/_mean": 0.36655426025390625, "no_entropy_unprocessed_thought_kl/_median": 0.006942844949662685, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3541.625, "processed_kl_no_entropy": 0.36653900146484375, "reasoning_kl_max": 14.82763671875, "reasoning_kl_mean": 0.22991180419921875, "reward": -1.830962896347046, "reward_std": 0.21839211881160736, "rewards/TeacherKLBasedReward": -1.830962896347046, "solution_log_prob_reward": -0.4540570038370788, "step": 50, "thought_kl_scores": 7.5302734375, "thought_processed_kl": 0.229949951171875, "total_teacher_likelihood_reward": -1.9479965036734939, "total_tl_reward_no_entropy": -2.400711958296597, "unprocessed_answer_log_prob/_first_quartile": -0.02890777587890625, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3388519287109375, "unprocessed_answer_log_prob/_median": -1.0338611900806427e-05, "unprocessed_answer_log_prob/_min": -11.5205078125, "unprocessed_answer_log_prob/_sum": -165.2109375, "unprocessed_thought_kl/_first_quartile": 8.114147931337357e-07, "unprocessed_thought_kl/_last_quartile": 0.09155833721160889, "unprocessed_thought_kl/_max": 14.82763671875, "unprocessed_thought_kl/_mean": 0.22991180419921875, "unprocessed_thought_kl/_median": 0.0012770844623446465, "unprocessed_thought_kl/_min": -5.5738525390625, "unprocessed_thought_kl/_sum": 2110.6875 }, { "answer_log_prob_mean": -0.33034515380859375, "answer_log_prob_min": -15.24462890625, "completion_length": 5936.0419921875, "epoch": 0.04883303411131059, "grad_norm": 0.8817287643998369, "kl": 0.006775379180908203, "kl_reward": -1.582055062521249, "kl_reward_no_entropy": -1.974157110787928, "kl_scores_no_entropy": 7.500732421875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.125, "no_entropy_reasoning_kl_max": 14.49755859375, "no_entropy_reasoning_kl_mean": 0.5130767822265625, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.641952596604824e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3725719451904297, "no_entropy_unprocessed_thought_kl/_max": 14.49755859375, "no_entropy_unprocessed_thought_kl/_mean": 0.5130767822265625, "no_entropy_unprocessed_thought_kl/_median": 0.014168024063110352, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3096.1875, "processed_kl_no_entropy": 0.5129852294921875, "reasoning_kl_max": 13.6162109375, "reasoning_kl_mean": 0.3911895751953125, "reward": -1.8044838905334473, "reward_std": 0.2402721494436264, "rewards/TeacherKLBasedReward": -1.8044838905334473, "solution_log_prob_reward": -0.48279144102707505, "step": 51, "thought_kl_scores": 7.00341796875, "thought_processed_kl": 0.392425537109375, "total_teacher_likelihood_reward": -2.189846492372453, "total_tl_reward_no_entropy": -2.581948542036116, "unprocessed_answer_log_prob/_first_quartile": -0.029352664947509766, "unprocessed_answer_log_prob/_last_quartile": -1.862645149230957e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.33034515380859375, "unprocessed_answer_log_prob/_median": -7.290393114089966e-05, "unprocessed_answer_log_prob/_min": -15.24462890625, "unprocessed_answer_log_prob/_sum": -233.2890625, "unprocessed_thought_kl/_first_quartile": 1.1035706847906113e-05, "unprocessed_thought_kl/_last_quartile": 0.25664615631103516, "unprocessed_thought_kl/_max": 13.6162109375, "unprocessed_thought_kl/_mean": 0.3911895751953125, "unprocessed_thought_kl/_median": 0.007967024110257626, "unprocessed_thought_kl/_min": -3.3543328791856766, "unprocessed_thought_kl/_sum": 1755.375 }, { "answer_log_prob_mean": -0.10418033599853516, "answer_log_prob_min": -8.86328125, "completion_length": 4011.2099609375, "epoch": 0.049790544584081387, "grad_norm": 0.36063002183697407, "kl": 0.0027614235877990723, "kl_reward": -1.3617380103096366, "kl_reward_no_entropy": -1.403182100970298, "kl_scores_no_entropy": 6.2525634765625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0078125, "no_entropy_reasoning_kl_max": 12.15283203125, "no_entropy_reasoning_kl_mean": 0.34619903564453125, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.9802322387695312e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.07033562660217285, "no_entropy_unprocessed_thought_kl/_max": 12.15283203125, "no_entropy_unprocessed_thought_kl/_mean": 0.34619903564453125, "no_entropy_unprocessed_thought_kl/_median": 0.0001365160569548607, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 671.3203125, "processed_kl_no_entropy": 0.3470458984375, "reasoning_kl_max": 12.068359375, "reasoning_kl_mean": 0.33322906494140625, "reward": -1.8238111734390259, "reward_std": 0.19831283390522003, "rewards/TeacherKLBasedReward": -1.8238111734390259, "solution_log_prob_reward": -0.1928131408058107, "step": 52, "thought_kl_scores": 6.2030029296875, "thought_processed_kl": 0.3340301513671875, "total_teacher_likelihood_reward": -1.5623636385425925, "total_tl_reward_no_entropy": -1.6038077306002378, "unprocessed_answer_log_prob/_first_quartile": -2.3510539904236794e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10418033599853516, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -8.86328125, "unprocessed_answer_log_prob/_sum": -51.08203125, "unprocessed_thought_kl/_first_quartile": 2.7939677238464355e-09, "unprocessed_thought_kl/_last_quartile": 0.06292283535003662, "unprocessed_thought_kl/_max": 12.068359375, "unprocessed_thought_kl/_mean": 0.33322906494140625, "unprocessed_thought_kl/_median": 9.106285870075226e-05, "unprocessed_thought_kl/_min": -0.9671789184212685, "unprocessed_thought_kl/_sum": 570.4609375 }, { "answer_log_prob_mean": -0.23358917236328125, "answer_log_prob_min": -10.552001953125, "completion_length": 5533.4609375, "epoch": 0.05074805505685218, "grad_norm": 0.32354645439626906, "kl": 0.0068531036376953125, "kl_reward": -1.2315792720764875, "kl_reward_no_entropy": -1.6933447169139981, "kl_scores_no_entropy": 6.9007568359375, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.1484375, "no_entropy_reasoning_kl_max": 13.3662109375, "no_entropy_reasoning_kl_mean": 0.4307861328125, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.4204451367259026e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.30019187927246094, "no_entropy_unprocessed_thought_kl/_max": 13.3662109375, "no_entropy_unprocessed_thought_kl/_mean": 0.4307861328125, "no_entropy_unprocessed_thought_kl/_median": 0.010560780763626099, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3026.875, "processed_kl_no_entropy": 0.4307708740234375, "reasoning_kl_max": 12.441650390625, "reasoning_kl_mean": 0.28610992431640625, "reward": -1.895449161529541, "reward_std": 0.27086466550827026, "rewards/TeacherKLBasedReward": -1.895449161529541, "solution_log_prob_reward": -0.3391091898083687, "step": 53, "thought_kl_scores": 6.364990234375, "thought_processed_kl": 0.28607940673828125, "total_teacher_likelihood_reward": -1.7191259562969208, "total_tl_reward_no_entropy": -2.180891408585012, "unprocessed_answer_log_prob/_first_quartile": -0.01691159512847662, "unprocessed_answer_log_prob/_last_quartile": -9.778887033462524e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.23358917236328125, "unprocessed_answer_log_prob/_median": -1.3012439012527466e-05, "unprocessed_answer_log_prob/_min": -10.552001953125, "unprocessed_answer_log_prob/_sum": -123.81640625, "unprocessed_thought_kl/_first_quartile": 4.954636096954346e-07, "unprocessed_thought_kl/_last_quartile": 0.15347957611083984, "unprocessed_thought_kl/_max": 12.441650390625, "unprocessed_thought_kl/_mean": 0.28610992431640625, "unprocessed_thought_kl/_median": 0.0016584359109401703, "unprocessed_thought_kl/_min": -4.6768951416015625, "unprocessed_thought_kl/_sum": 1668.9296875 }, { "answer_log_prob_mean": -0.11983108520507812, "answer_log_prob_min": -5.54931640625, "completion_length": 5490.0908203125, "epoch": 0.05170556552962298, "grad_norm": 17.636296465555493, "kl": 0.025933265686035156, "kl_reward": -1.4329303307458758, "kl_reward_no_entropy": -1.5039093345403671, "kl_scores_no_entropy": 5.47705078125, "learning_rate": 1e-06, "loss": 0.001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 10.562744140625, "no_entropy_reasoning_kl_mean": 0.3956756591796875, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.9029324650764465e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19800376892089844, "no_entropy_unprocessed_thought_kl/_max": 10.562744140625, "no_entropy_unprocessed_thought_kl/_mean": 0.3956756591796875, "no_entropy_unprocessed_thought_kl/_median": 0.004504382610321045, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 435.9296875, "processed_kl_no_entropy": 0.3965911865234375, "reasoning_kl_max": 10.107177734375, "reasoning_kl_mean": 0.3765716552734375, "reward": -1.6963753700256348, "reward_std": 0.23049044609069824, "rewards/TeacherKLBasedReward": -1.6963753700256348, "solution_log_prob_reward": -0.1753242480335757, "step": 54, "thought_kl_scores": 5.2391357421875, "thought_processed_kl": 0.376861572265625, "total_teacher_likelihood_reward": -1.608254567719996, "total_tl_reward_no_entropy": -1.6792335724458098, "unprocessed_answer_log_prob/_first_quartile": -0.0047151315957307816, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.11983108520507812, "unprocessed_answer_log_prob/_median": -1.909211277961731e-07, "unprocessed_answer_log_prob/_min": -5.54931640625, "unprocessed_answer_log_prob/_sum": -31.9326171875, "unprocessed_thought_kl/_first_quartile": 8.784700185060501e-07, "unprocessed_thought_kl/_last_quartile": 0.17815399169921875, "unprocessed_thought_kl/_max": 10.107177734375, "unprocessed_thought_kl/_mean": 0.3765716552734375, "unprocessed_thought_kl/_median": 0.0030138641595840454, "unprocessed_thought_kl/_min": -1.0801731813699007, "unprocessed_thought_kl/_sum": 418.7265625 }, { "answer_log_prob_mean": -0.22735595703125, "answer_log_prob_min": -9.79296875, "completion_length": 6062.5751953125, "epoch": 0.052663076002393776, "grad_norm": 0.34225309033607126, "kl": 0.006734013557434082, "kl_reward": -1.5932171600870788, "kl_reward_no_entropy": -1.7107690423727036, "kl_scores_no_entropy": 6.5390625, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.640869140625, "no_entropy_reasoning_kl_mean": 0.44384765625, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.1820346117019653e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2915194034576416, "no_entropy_unprocessed_thought_kl/_max": 12.640869140625, "no_entropy_unprocessed_thought_kl/_mean": 0.44384765625, "no_entropy_unprocessed_thought_kl/_median": 0.011685210280120373, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1069.03125, "processed_kl_no_entropy": 0.443115234375, "reasoning_kl_max": 11.7177734375, "reasoning_kl_mean": 0.4138946533203125, "reward": -1.959690809249878, "reward_std": 0.28602170944213867, "rewards/TeacherKLBasedReward": -1.959690809249878, "solution_log_prob_reward": -0.32528564473614097, "step": 55, "thought_kl_scores": 6.0621337890625, "thought_processed_kl": 0.413330078125, "total_teacher_likelihood_reward": -1.9185027964413166, "total_tl_reward_no_entropy": -2.0360546745359898, "unprocessed_answer_log_prob/_first_quartile": -0.03030577814206481, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.22735595703125, "unprocessed_answer_log_prob/_median": -5.161762237548828e-05, "unprocessed_answer_log_prob/_min": -9.79296875, "unprocessed_answer_log_prob/_sum": -113.650390625, "unprocessed_thought_kl/_first_quartile": 5.800742655992508e-06, "unprocessed_thought_kl/_last_quartile": 0.2683829069137573, "unprocessed_thought_kl/_max": 11.7177734375, "unprocessed_thought_kl/_mean": 0.4138946533203125, "unprocessed_thought_kl/_median": 0.010021468624472618, "unprocessed_thought_kl/_min": -2.863410954363644, "unprocessed_thought_kl/_sum": 975.59375 }, { "answer_log_prob_mean": -0.07788848876953125, "answer_log_prob_min": -5.3052978515625, "completion_length": 4959.2783203125, "epoch": 0.05362058647516457, "grad_norm": 0.30348591426839483, "kl": 0.005731344223022461, "kl_reward": -1.2556585762649775, "kl_reward_no_entropy": -1.3876684638671577, "kl_scores_no_entropy": 5.7672119140625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.184814453125, "no_entropy_reasoning_kl_mean": 0.3507080078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.25030055642128e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.15720224380493164, "no_entropy_unprocessed_thought_kl/_max": 11.184814453125, "no_entropy_unprocessed_thought_kl/_mean": 0.3507080078125, "no_entropy_unprocessed_thought_kl/_median": 0.0014498848468065262, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 672.984375, "processed_kl_no_entropy": 0.3505401611328125, "reasoning_kl_max": 10.70751953125, "reasoning_kl_mean": 0.3114776611328125, "reward": -1.869783878326416, "reward_std": 0.222633495926857, "rewards/TeacherKLBasedReward": -1.869783878326416, "solution_log_prob_reward": -0.1309414657880552, "step": 56, "thought_kl_scores": 5.506591796875, "thought_processed_kl": 0.31168365478515625, "total_teacher_likelihood_reward": -1.3866000343114138, "total_tl_reward_no_entropy": -1.5186099214479327, "unprocessed_answer_log_prob/_first_quartile": -0.0001947060227394104, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.07788848876953125, "unprocessed_answer_log_prob/_median": -9.313225746154785e-10, "unprocessed_answer_log_prob/_min": -5.3052978515625, "unprocessed_answer_log_prob/_sum": -26.3310546875, "unprocessed_thought_kl/_first_quartile": 2.9802322387695312e-08, "unprocessed_thought_kl/_last_quartile": 0.12647294998168945, "unprocessed_thought_kl/_max": 10.70751953125, "unprocessed_thought_kl/_mean": 0.3114776611328125, "unprocessed_thought_kl/_median": 0.0007433071732521057, "unprocessed_thought_kl/_min": -2.492523193359375, "unprocessed_thought_kl/_sum": 574.296875 }, { "answer_log_prob_mean": -0.123504638671875, "answer_log_prob_min": -6.4150390625, "completion_length": 6807.8603515625, "epoch": 0.05457809694793537, "grad_norm": 0.40008738613398726, "kl": 0.006499528884887695, "kl_reward": -1.3690566942095757, "kl_reward_no_entropy": -1.4730889797210693, "kl_scores_no_entropy": 5.75048828125, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.1162109375, "no_entropy_reasoning_kl_mean": 0.3798675537109375, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.353948265314102e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.16289806365966797, "no_entropy_unprocessed_thought_kl/_max": 11.1162109375, "no_entropy_unprocessed_thought_kl/_mean": 0.3798675537109375, "no_entropy_unprocessed_thought_kl/_median": 0.001433052122592926, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 798.484375, "processed_kl_no_entropy": 0.37982177734375, "reasoning_kl_max": 10.797119140625, "reasoning_kl_mean": 0.34838104248046875, "reward": -1.849998116493225, "reward_std": 0.2500193119049072, "rewards/TeacherKLBasedReward": -1.849998116493225, "solution_log_prob_reward": -0.1876550290035084, "step": 57, "thought_kl_scores": 5.5765380859375, "thought_processed_kl": 0.34848785400390625, "total_teacher_likelihood_reward": -1.5567117258906364, "total_tl_reward_no_entropy": -1.6607440104708076, "unprocessed_answer_log_prob/_first_quartile": -0.0008315723389387131, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.123504638671875, "unprocessed_answer_log_prob/_median": -1.2014061212539673e-07, "unprocessed_answer_log_prob/_min": -6.4150390625, "unprocessed_answer_log_prob/_sum": -46.66796875, "unprocessed_thought_kl/_first_quartile": 3.152526915073395e-07, "unprocessed_thought_kl/_last_quartile": 0.13893795013427734, "unprocessed_thought_kl/_max": 10.797119140625, "unprocessed_thought_kl/_mean": 0.34838104248046875, "unprocessed_thought_kl/_median": 0.000894591212272644, "unprocessed_thought_kl/_min": -2.41589548997581, "unprocessed_thought_kl/_sum": 720.6953125 }, { "answer_log_prob_mean": -0.3458518981933594, "answer_log_prob_min": -12.7197265625, "completion_length": 7313.5546875, "epoch": 0.055535607420706165, "grad_norm": 0.3674733043972024, "kl": 0.00832056999206543, "kl_reward": -1.27445799857378, "kl_reward_no_entropy": -1.7226013066247106, "kl_scores_no_entropy": 6.931640625, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.4921875, "no_entropy_reasoning_kl_max": 13.4228515625, "no_entropy_reasoning_kl_mean": 0.439971923828125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00033875182271003723, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3027362823486328, "no_entropy_unprocessed_thought_kl/_max": 13.4228515625, "no_entropy_unprocessed_thought_kl/_mean": 0.439971923828125, "no_entropy_unprocessed_thought_kl/_median": 0.022662702947854996, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4475.390625, "processed_kl_no_entropy": 0.4397735595703125, "reasoning_kl_max": 12.6357421875, "reasoning_kl_mean": 0.2984619140625, "reward": -1.9876900911331177, "reward_std": 0.24720607697963715, "rewards/TeacherKLBasedReward": -1.9876900911331177, "solution_log_prob_reward": -0.4730491630034521, "step": 58, "thought_kl_scores": 6.470458984375, "thought_processed_kl": 0.29845428466796875, "total_teacher_likelihood_reward": -2.239694674499333, "total_tl_reward_no_entropy": -2.6878380002453923, "unprocessed_answer_log_prob/_first_quartile": -0.06604685168713331, "unprocessed_answer_log_prob/_last_quartile": -6.658956408500671e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3458518981933594, "unprocessed_answer_log_prob/_median": -0.0003652647137641907, "unprocessed_answer_log_prob/_min": -12.7197265625, "unprocessed_answer_log_prob/_sum": -286.732421875, "unprocessed_thought_kl/_first_quartile": 5.452893674373627e-07, "unprocessed_thought_kl/_last_quartile": 0.14434480667114258, "unprocessed_thought_kl/_max": 12.6357421875, "unprocessed_thought_kl/_mean": 0.2984619140625, "unprocessed_thought_kl/_median": 0.0021210089325904846, "unprocessed_thought_kl/_min": -3.228027385659516, "unprocessed_thought_kl/_sum": 2197.921875 }, { "answer_log_prob_mean": -0.26912689208984375, "answer_log_prob_min": -13.691650390625, "completion_length": 6652.833984375, "epoch": 0.05649311789347696, "grad_norm": 0.32054140375079626, "kl": 0.007784128189086914, "kl_reward": -1.2091140551492572, "kl_reward_no_entropy": -1.686381214298308, "kl_scores_no_entropy": 7.521728515625, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.125, "no_entropy_reasoning_kl_max": 14.6279296875, "no_entropy_reasoning_kl_mean": 0.4158477783203125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.905602402985096e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2606973648071289, "no_entropy_unprocessed_thought_kl/_max": 14.6279296875, "no_entropy_unprocessed_thought_kl/_mean": 0.4158477783203125, "no_entropy_unprocessed_thought_kl/_median": 0.007005274295806885, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3336.75, "processed_kl_no_entropy": 0.4158477783203125, "reasoning_kl_max": 13.6474609375, "reasoning_kl_mean": 0.26656341552734375, "reward": -1.876607894897461, "reward_std": 0.27012139558792114, "rewards/TeacherKLBasedReward": -1.876607894897461, "solution_log_prob_reward": -0.40604339237324893, "step": 59, "thought_kl_scores": 6.958984375, "thought_processed_kl": 0.26647186279296875, "total_teacher_likelihood_reward": -1.7401574458926916, "total_tl_reward_no_entropy": -2.2174246050417423, "unprocessed_answer_log_prob/_first_quartile": -0.01878916658461094, "unprocessed_answer_log_prob/_last_quartile": -1.4901161193847656e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.26912689208984375, "unprocessed_answer_log_prob/_median": -4.354119300842285e-05, "unprocessed_answer_log_prob/_min": -13.691650390625, "unprocessed_answer_log_prob/_sum": -196.98046875, "unprocessed_thought_kl/_first_quartile": 1.6111880540847778e-07, "unprocessed_thought_kl/_last_quartile": 0.12236785888671875, "unprocessed_thought_kl/_max": 13.6474609375, "unprocessed_thought_kl/_mean": 0.26656341552734375, "unprocessed_thought_kl/_median": 0.0007909797132015228, "unprocessed_thought_kl/_min": -5.049072265625, "unprocessed_thought_kl/_sum": 1852.375 }, { "answer_log_prob_mean": -0.2582855224609375, "answer_log_prob_min": -9.1591796875, "completion_length": 6517.3486328125, "epoch": 0.05745062836624776, "grad_norm": 5383106132335.241, "kl": 14025752576.006418, "kl_reward": -1.6309350654482841, "kl_reward_no_entropy": -2.0200433460995555, "kl_scores_no_entropy": 7.67041015625, "learning_rate": 1e-06, "loss": 562036736.0, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 14.8125, "no_entropy_reasoning_kl_mean": 0.5252227783203125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0001732800155878067, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.44769287109375, "no_entropy_unprocessed_thought_kl/_max": 14.8125, "no_entropy_unprocessed_thought_kl/_mean": 0.5252227783203125, "no_entropy_unprocessed_thought_kl/_median": 0.02756643295288086, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1525.78125, "processed_kl_no_entropy": 0.5251617431640625, "reasoning_kl_max": 14.1728515625, "reasoning_kl_mean": 0.40191650390625, "reward": -2.0513033866882324, "reward_std": 0.25622084736824036, "rewards/TeacherKLBasedReward": -2.0513033866882324, "solution_log_prob_reward": -0.34987731580622494, "step": 60, "thought_kl_scores": 7.2860107421875, "thought_processed_kl": 0.4018707275390625, "total_teacher_likelihood_reward": -1.9808123828843236, "total_tl_reward_no_entropy": -2.369920660741627, "unprocessed_answer_log_prob/_first_quartile": -0.01777428388595581, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2582855224609375, "unprocessed_answer_log_prob/_median": -9.213574230670929e-06, "unprocessed_answer_log_prob/_min": -9.1591796875, "unprocessed_answer_log_prob/_sum": -136.69140625, "unprocessed_thought_kl/_first_quartile": 2.3283297196030617e-05, "unprocessed_thought_kl/_last_quartile": 0.3078174591064453, "unprocessed_thought_kl/_max": 14.1728515625, "unprocessed_thought_kl/_mean": 0.40191650390625, "unprocessed_thought_kl/_median": 0.011466026306152344, "unprocessed_thought_kl/_min": -4.47900390625, "unprocessed_thought_kl/_sum": 1146.90625 }, { "answer_log_prob_mean": -0.466522216796875, "answer_log_prob_min": -11.4970703125, "completion_length": 5092.099609375, "epoch": 0.058408138839018554, "grad_norm": 0.3132043419227832, "kl": 0.007602691650390625, "kl_reward": -1.5723110809922218, "kl_reward_no_entropy": -1.9808038175106049, "kl_scores_no_entropy": 7.174560546875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": -0.109375, "no_entropy_reasoning_kl_max": 13.837158203125, "no_entropy_reasoning_kl_mean": 0.5218963623046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0002637808211147785, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4482917785644531, "no_entropy_unprocessed_thought_kl/_max": 13.837158203125, "no_entropy_unprocessed_thought_kl/_mean": 0.5218963623046875, "no_entropy_unprocessed_thought_kl/_median": 0.03166675567626953, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2813.765625, "processed_kl_no_entropy": 0.5218658447265625, "reasoning_kl_max": 12.591796875, "reasoning_kl_mean": 0.39818572998046875, "reward": -1.8844857215881348, "reward_std": 0.22472116351127625, "rewards/TeacherKLBasedReward": -1.8844857215881348, "solution_log_prob_reward": -0.5814929213374853, "step": 61, "thought_kl_scores": 6.4898681640625, "thought_processed_kl": 0.39813232421875, "total_teacher_likelihood_reward": -2.263178987428546, "total_tl_reward_no_entropy": -2.671671723946929, "unprocessed_answer_log_prob/_first_quartile": -0.12910175323486328, "unprocessed_answer_log_prob/_last_quartile": -2.0149163901805878e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.466522216796875, "unprocessed_answer_log_prob/_median": -0.001496179960668087, "unprocessed_answer_log_prob/_min": -11.4970703125, "unprocessed_answer_log_prob/_sum": -151.96875, "unprocessed_thought_kl/_first_quartile": 2.072635106742382e-05, "unprocessed_thought_kl/_last_quartile": 0.30461883544921875, "unprocessed_thought_kl/_max": 12.591796875, "unprocessed_thought_kl/_mean": 0.39818572998046875, "unprocessed_thought_kl/_median": 0.01229807734489441, "unprocessed_thought_kl/_min": -3.92767333984375, "unprocessed_thought_kl/_sum": 1771.3125 }, { "answer_log_prob_mean": -0.10787010192871094, "answer_log_prob_min": -5.24884033203125, "completion_length": 7056.0126953125, "epoch": 0.05936564931178935, "grad_norm": 0.25969148639931133, "kl": 0.006255388259887695, "kl_reward": -1.2810937436297536, "kl_reward_no_entropy": -1.3762014685198665, "kl_scores_no_entropy": 5.9847412109375, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.63037109375, "no_entropy_reasoning_kl_mean": 0.34243011474609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 8.109724149107933e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.20155203342437744, "no_entropy_unprocessed_thought_kl/_max": 11.63037109375, "no_entropy_unprocessed_thought_kl/_mean": 0.34243011474609375, "no_entropy_unprocessed_thought_kl/_median": 0.007661169394850731, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 721.125, "processed_kl_no_entropy": 0.34243011474609375, "reasoning_kl_max": 11.34326171875, "reasoning_kl_mean": 0.3135986328125, "reward": -1.8546392917633057, "reward_std": 0.2872159481048584, "rewards/TeacherKLBasedReward": -1.8546392917633057, "solution_log_prob_reward": -0.16035850410116836, "step": 62, "thought_kl_scores": 5.824951171875, "thought_processed_kl": 0.3137969970703125, "total_teacher_likelihood_reward": -1.4414522415027022, "total_tl_reward_no_entropy": -1.5365599696524441, "unprocessed_answer_log_prob/_first_quartile": -6.456580013036728e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10787010192871094, "unprocessed_answer_log_prob/_median": -1.862645149230957e-09, "unprocessed_answer_log_prob/_min": -5.24884033203125, "unprocessed_answer_log_prob/_sum": -28.89892578125, "unprocessed_thought_kl/_first_quartile": 3.386056050658226e-06, "unprocessed_thought_kl/_last_quartile": 0.18251540511846542, "unprocessed_thought_kl/_max": 11.34326171875, "unprocessed_thought_kl/_mean": 0.3135986328125, "unprocessed_thought_kl/_median": 0.006005355156958103, "unprocessed_thought_kl/_min": -2.799743101000786, "unprocessed_thought_kl/_sum": 633.59375 }, { "answer_log_prob_mean": -0.2474212646484375, "answer_log_prob_min": -7.9866943359375, "completion_length": 5883.8759765625, "epoch": 0.06032315978456014, "grad_norm": 12.460718193055673, "kl": 0.032608866691589355, "kl_reward": -1.491140441969037, "kl_reward_no_entropy": -1.8516119346022606, "kl_scores_no_entropy": 6.9544677734375, "learning_rate": 1e-06, "loss": 0.0013, "match_reward": -0.046875, "no_entropy_reasoning_kl_max": 13.430908203125, "no_entropy_reasoning_kl_mean": 0.4828948974609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00012231804430484772, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3708038330078125, "no_entropy_unprocessed_thought_kl/_max": 13.430908203125, "no_entropy_unprocessed_thought_kl/_mean": 0.4828948974609375, "no_entropy_unprocessed_thought_kl/_median": 0.02114027738571167, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2305.03125, "processed_kl_no_entropy": 0.4826812744140625, "reasoning_kl_max": 12.446533203125, "reasoning_kl_mean": 0.37258148193359375, "reward": -1.9250439405441284, "reward_std": 0.2341998666524887, "rewards/TeacherKLBasedReward": -1.9250439405441284, "solution_log_prob_reward": -0.32728820538613945, "step": 63, "thought_kl_scores": 6.40771484375, "thought_processed_kl": 0.37287139892578125, "total_teacher_likelihood_reward": -1.8653036477044225, "total_tl_reward_no_entropy": -2.2257751375436783, "unprocessed_answer_log_prob/_first_quartile": -0.03185295965522528, "unprocessed_answer_log_prob/_last_quartile": -1.862645149230957e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2474212646484375, "unprocessed_answer_log_prob/_median": -0.00011618062853813171, "unprocessed_answer_log_prob/_min": -7.9866943359375, "unprocessed_answer_log_prob/_sum": -115.607421875, "unprocessed_thought_kl/_first_quartile": 1.880573108792305e-06, "unprocessed_thought_kl/_last_quartile": 0.24380874633789062, "unprocessed_thought_kl/_max": 12.446533203125, "unprocessed_thought_kl/_mean": 0.37258148193359375, "unprocessed_thought_kl/_median": 0.00599902868270874, "unprocessed_thought_kl/_min": -3.539093017578125, "unprocessed_thought_kl/_sum": 1483.9140625 }, { "answer_log_prob_mean": -0.28692626953125, "answer_log_prob_min": -11.203125, "completion_length": 8569.3837890625, "epoch": 0.06128067025733094, "grad_norm": 0.6746619604012825, "kl": 0.009011983871459961, "kl_reward": -1.12101469328627, "kl_reward_no_entropy": -1.4353344570845366, "kl_scores_no_entropy": 6.5941162109375, "learning_rate": 1e-06, "loss": 0.0004, "match_reward": -0.2109375, "no_entropy_reasoning_kl_max": 12.834716796875, "no_entropy_reasoning_kl_mean": 0.35009765625, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.432412192225456e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.14519357681274414, "no_entropy_unprocessed_thought_kl/_max": 12.834716796875, "no_entropy_unprocessed_thought_kl/_mean": 0.35009765625, "no_entropy_unprocessed_thought_kl/_median": 0.002880527637898922, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2435.54296875, "processed_kl_no_entropy": 0.350067138671875, "reasoning_kl_max": 12.10546875, "reasoning_kl_mean": 0.25261688232421875, "reward": -2.0308949947357178, "reward_std": 0.3050272464752197, "rewards/TeacherKLBasedReward": -2.0308949947357178, "solution_log_prob_reward": -0.39895752049051225, "step": 64, "thought_kl_scores": 6.1826171875, "thought_processed_kl": 0.25264739990234375, "total_teacher_likelihood_reward": -1.7309097135439515, "total_tl_reward_no_entropy": -2.045229472219944, "unprocessed_answer_log_prob/_first_quartile": -0.032661616802215576, "unprocessed_answer_log_prob/_last_quartile": -2.062879502773285e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.28692626953125, "unprocessed_answer_log_prob/_median": -0.00025630369782447815, "unprocessed_answer_log_prob/_min": -11.203125, "unprocessed_answer_log_prob/_sum": -131.787109375, "unprocessed_thought_kl/_first_quartile": 1.043081283569336e-07, "unprocessed_thought_kl/_last_quartile": 0.06447672843933105, "unprocessed_thought_kl/_max": 12.10546875, "unprocessed_thought_kl/_mean": 0.25261688232421875, "unprocessed_thought_kl/_median": 0.0002777073532342911, "unprocessed_thought_kl/_min": -3.16087943688035, "unprocessed_thought_kl/_sum": 1318.76953125 }, { "answer_log_prob_mean": -0.21307373046875, "answer_log_prob_min": -13.4716796875, "completion_length": 6304.6357421875, "epoch": 0.06223818073010173, "grad_norm": 0.29552307916866394, "kl": 0.0021495819091796875, "kl_reward": -1.1347659155726433, "kl_reward_no_entropy": -1.4465872077271342, "kl_scores_no_entropy": 7.1070556640625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.078125, "no_entropy_reasoning_kl_max": 13.872802734375, "no_entropy_reasoning_kl_mean": 0.34346771240234375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.848690539598465e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.13466548919677734, "no_entropy_unprocessed_thought_kl/_max": 13.872802734375, "no_entropy_unprocessed_thought_kl/_mean": 0.34346771240234375, "no_entropy_unprocessed_thought_kl/_median": 0.0008579380810260773, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2212.625, "processed_kl_no_entropy": 0.3435211181640625, "reasoning_kl_max": 12.8720703125, "reasoning_kl_mean": 0.24953460693359375, "reward": -1.8242573738098145, "reward_std": 0.2653343081474304, "rewards/TeacherKLBasedReward": -1.8242573738098145, "solution_log_prob_reward": -0.3477905245963484, "step": 65, "thought_kl_scores": 6.56103515625, "thought_processed_kl": 0.24958038330078125, "total_teacher_likelihood_reward": -1.5606814390048385, "total_tl_reward_no_entropy": -1.8725027311593294, "unprocessed_answer_log_prob/_first_quartile": -0.00018470804207026958, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.21307373046875, "unprocessed_answer_log_prob/_median": -1.4528632164001465e-07, "unprocessed_answer_log_prob/_min": -13.4716796875, "unprocessed_answer_log_prob/_sum": -140.9453125, "unprocessed_thought_kl/_first_quartile": 4.6566128730773926e-08, "unprocessed_thought_kl/_last_quartile": 0.0734940767288208, "unprocessed_thought_kl/_max": 12.8720703125, "unprocessed_thought_kl/_mean": 0.24953460693359375, "unprocessed_thought_kl/_median": 0.00022165384143590927, "unprocessed_thought_kl/_min": -4.7498779296875, "unprocessed_thought_kl/_sum": 1535.09375 }, { "answer_log_prob_mean": -0.37050533294677734, "answer_log_prob_min": -8.89764404296875, "completion_length": 6354.720703125, "epoch": 0.06319569120287254, "grad_norm": 2512.940195406842, "kl": 1.5774415731430054, "kl_reward": -1.2665322865359485, "kl_reward_no_entropy": -1.8325460748746991, "kl_scores_no_entropy": 7.96044921875, "learning_rate": 1e-06, "loss": 0.0631, "match_reward": -0.4765625, "no_entropy_reasoning_kl_max": 15.46337890625, "no_entropy_reasoning_kl_mean": 0.45621490478515625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.000615609809756279, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3432600498199463, "no_entropy_unprocessed_thought_kl/_max": 15.46337890625, "no_entropy_unprocessed_thought_kl/_mean": 0.45621490478515625, "no_entropy_unprocessed_thought_kl/_median": 0.033322921954095364, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4959.578125, "processed_kl_no_entropy": 0.456298828125, "reasoning_kl_max": 13.593017578125, "reasoning_kl_mean": 0.28624725341796875, "reward": -1.867666244506836, "reward_std": 0.22969815135002136, "rewards/TeacherKLBasedReward": -1.867666244506836, "solution_log_prob_reward": -0.4594817706674803, "step": 66, "thought_kl_scores": 6.935546875, "thought_processed_kl": 0.28607177734375, "total_teacher_likelihood_reward": -2.2025765581056476, "total_tl_reward_no_entropy": -2.768590346444398, "unprocessed_answer_log_prob/_first_quartile": -0.14104468189179897, "unprocessed_answer_log_prob/_last_quartile": -2.3264437913894653e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.37050533294677734, "unprocessed_answer_log_prob/_median": -0.0020038485527038574, "unprocessed_answer_log_prob/_min": -8.89764404296875, "unprocessed_answer_log_prob/_sum": -191.321044921875, "unprocessed_thought_kl/_first_quartile": 3.587920218706131e-07, "unprocessed_thought_kl/_last_quartile": 0.14734411239624023, "unprocessed_thought_kl/_max": 13.593017578125, "unprocessed_thought_kl/_mean": 0.28624725341796875, "unprocessed_thought_kl/_median": 0.002885008230805397, "unprocessed_thought_kl/_min": -5.06622314453125, "unprocessed_thought_kl/_sum": 2375.4453125 }, { "answer_log_prob_mean": -0.26125335693359375, "answer_log_prob_min": -10.663330078125, "completion_length": 6057.466796875, "epoch": 0.06415320167564333, "grad_norm": 0.2572508771952572, "kl": 0.0025144219398498535, "kl_reward": -1.2906619110144675, "kl_reward_no_entropy": -1.7917822180315852, "kl_scores_no_entropy": 7.3037109375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0703125, "no_entropy_reasoning_kl_max": 14.1572265625, "no_entropy_reasoning_kl_mean": 0.4556884765625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00016932259313762188, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.332763671875, "no_entropy_unprocessed_thought_kl/_max": 14.1572265625, "no_entropy_unprocessed_thought_kl/_mean": 0.4556884765625, "no_entropy_unprocessed_thought_kl/_median": 0.019749224185943604, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3007.59375, "processed_kl_no_entropy": 0.4553985595703125, "reasoning_kl_max": 12.905029296875, "reasoning_kl_mean": 0.30117034912109375, "reward": -1.6952446699142456, "reward_std": 0.25144603848457336, "rewards/TeacherKLBasedReward": -1.6952446699142456, "solution_log_prob_reward": -0.3678866550908424, "step": 67, "thought_kl_scores": 6.60205078125, "thought_processed_kl": 0.30123138427734375, "total_teacher_likelihood_reward": -1.7288610599935055, "total_tl_reward_no_entropy": -2.2299813767895103, "unprocessed_answer_log_prob/_first_quartile": -0.03989343252032995, "unprocessed_answer_log_prob/_last_quartile": -9.266659617424011e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.26125335693359375, "unprocessed_answer_log_prob/_median": -0.00022920500487089157, "unprocessed_answer_log_prob/_min": -10.663330078125, "unprocessed_answer_log_prob/_sum": -173.552734375, "unprocessed_thought_kl/_first_quartile": 6.649643182754517e-07, "unprocessed_thought_kl/_last_quartile": 0.16721534729003906, "unprocessed_thought_kl/_max": 12.905029296875, "unprocessed_thought_kl/_mean": 0.30117034912109375, "unprocessed_thought_kl/_median": 0.002602584660053253, "unprocessed_thought_kl/_min": -4.399847030639648, "unprocessed_thought_kl/_sum": 1708.5625 }, { "answer_log_prob_mean": -0.21295928955078125, "answer_log_prob_min": -7.529296875, "completion_length": 4179.4892578125, "epoch": 0.06511071214841413, "grad_norm": 4.443750612699715, "kl": 0.005201160907745361, "kl_reward": -1.520296290051192, "kl_reward_no_entropy": -1.5974834859371185, "kl_scores_no_entropy": 6.1702880859375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.91796875, "no_entropy_reasoning_kl_mean": 0.4133148193359375, "no_entropy_unprocessed_thought_kl/_first_quartile": 4.961621016263962e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.1856517791748047, "no_entropy_unprocessed_thought_kl/_max": 11.91796875, "no_entropy_unprocessed_thought_kl/_mean": 0.4133148193359375, "no_entropy_unprocessed_thought_kl/_median": 0.001484692096710205, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 858.8125, "processed_kl_no_entropy": 0.4139251708984375, "reasoning_kl_max": 11.680419921875, "reasoning_kl_mean": 0.38996124267578125, "reward": -1.7392557859420776, "reward_std": 0.18361955881118774, "rewards/TeacherKLBasedReward": -1.7392557859420776, "solution_log_prob_reward": -0.2882522555300966, "step": 68, "thought_kl_scores": 6.0408935546875, "thought_processed_kl": 0.39117431640625, "total_teacher_likelihood_reward": -1.8085485324263573, "total_tl_reward_no_entropy": -1.885735728777945, "unprocessed_answer_log_prob/_first_quartile": -0.0019721202552318573, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.21295928955078125, "unprocessed_answer_log_prob/_median": -2.3283064365386963e-07, "unprocessed_answer_log_prob/_min": -7.529296875, "unprocessed_answer_log_prob/_sum": -67.54296875, "unprocessed_thought_kl/_first_quartile": 3.264285624027252e-07, "unprocessed_thought_kl/_last_quartile": 0.1661520004272461, "unprocessed_thought_kl/_max": 11.680419921875, "unprocessed_thought_kl/_mean": 0.38996124267578125, "unprocessed_thought_kl/_median": 0.0010055601596832275, "unprocessed_thought_kl/_min": -1.4849557876586914, "unprocessed_thought_kl/_sum": 787.515625 }, { "answer_log_prob_mean": -0.34741973876953125, "answer_log_prob_min": -12.337890625, "completion_length": 7181.88671875, "epoch": 0.06606822262118492, "grad_norm": 0.3934776654975871, "kl": 0.0026544928550720215, "kl_reward": -1.248578782659024, "kl_reward_no_entropy": -1.794375915080309, "kl_scores_no_entropy": 7.951904296875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.234375, "no_entropy_reasoning_kl_max": 15.45751953125, "no_entropy_reasoning_kl_mean": 0.44355010986328125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0001122539397329092, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.32822561264038086, "no_entropy_unprocessed_thought_kl/_max": 15.45751953125, "no_entropy_unprocessed_thought_kl/_mean": 0.44355010986328125, "no_entropy_unprocessed_thought_kl/_median": 0.022017566487193108, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4292.25, "processed_kl_no_entropy": 0.44345855712890625, "reasoning_kl_max": 14.119140625, "reasoning_kl_mean": 0.27500152587890625, "reward": -1.8296278715133667, "reward_std": 0.2627447247505188, "rewards/TeacherKLBasedReward": -1.8296278715133667, "solution_log_prob_reward": -0.4707986426074058, "step": 69, "thought_kl_scores": 7.19873046875, "thought_processed_kl": 0.27516937255859375, "total_teacher_likelihood_reward": -1.9537524180486798, "total_tl_reward_no_entropy": -2.4995495453476906, "unprocessed_answer_log_prob/_first_quartile": -0.04275782685726881, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.34741973876953125, "unprocessed_answer_log_prob/_median": -3.190338611602783e-05, "unprocessed_answer_log_prob/_min": -12.337890625, "unprocessed_answer_log_prob/_sum": -140.5546875, "unprocessed_thought_kl/_first_quartile": 6.288755685091019e-07, "unprocessed_thought_kl/_last_quartile": 0.1646801233291626, "unprocessed_thought_kl/_max": 14.119140625, "unprocessed_thought_kl/_mean": 0.27500152587890625, "unprocessed_thought_kl/_median": 0.00421930942684412, "unprocessed_thought_kl/_min": -5.875, "unprocessed_thought_kl/_sum": 2636.5 }, { "answer_log_prob_mean": -0.28264617919921875, "answer_log_prob_min": -10.21533203125, "completion_length": 6788.587890625, "epoch": 0.06702573309395571, "grad_norm": 0.3655904426918564, "kl": 0.002532660961151123, "kl_reward": -1.5170040801167488, "kl_reward_no_entropy": -2.0026336647570133, "kl_scores_no_entropy": 8.150390625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.140625, "no_entropy_reasoning_kl_max": 15.80078125, "no_entropy_reasoning_kl_mean": 0.5095367431640625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00026230746880173683, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4504814147949219, "no_entropy_unprocessed_thought_kl/_max": 15.80078125, "no_entropy_unprocessed_thought_kl/_mean": 0.5095367431640625, "no_entropy_unprocessed_thought_kl/_median": 0.030608654022216797, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3636.9375, "processed_kl_no_entropy": 0.5095062255859375, "reasoning_kl_max": 15.1748046875, "reasoning_kl_mean": 0.35391998291015625, "reward": -1.9631524085998535, "reward_std": 0.25682127475738525, "rewards/TeacherKLBasedReward": -1.9631524085998535, "solution_log_prob_reward": -0.384799498366192, "step": 70, "thought_kl_scores": 7.761962890625, "thought_processed_kl": 0.35416412353515625, "total_teacher_likelihood_reward": -2.042428582906723, "total_tl_reward_no_entropy": -2.5280581638216972, "unprocessed_answer_log_prob/_first_quartile": -0.032655954360961914, "unprocessed_answer_log_prob/_last_quartile": -2.0954757928848267e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.28264617919921875, "unprocessed_answer_log_prob/_median": -7.356517016887665e-05, "unprocessed_answer_log_prob/_min": -10.21533203125, "unprocessed_answer_log_prob/_sum": -157.27734375, "unprocessed_thought_kl/_first_quartile": 5.060015246272087e-05, "unprocessed_thought_kl/_last_quartile": 0.2719135284423828, "unprocessed_thought_kl/_max": 15.1748046875, "unprocessed_thought_kl/_mean": 0.35391998291015625, "unprocessed_thought_kl/_median": 0.009906888008117676, "unprocessed_thought_kl/_min": -4.345947265625, "unprocessed_thought_kl/_sum": 2088.125 }, { "answer_log_prob_mean": -0.3038482666015625, "answer_log_prob_min": -15.01123046875, "completion_length": 8034.87109375, "epoch": 0.06798324356672651, "grad_norm": 0.3807220372596107, "kl": 0.0026268362998962402, "kl_reward": -1.6050320407375693, "kl_reward_no_entropy": -1.9520727479830384, "kl_scores_no_entropy": 6.6668701171875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.810791015625, "no_entropy_reasoning_kl_mean": 0.5225830078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00011896481737494469, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.45747947692871094, "no_entropy_unprocessed_thought_kl/_max": 12.810791015625, "no_entropy_unprocessed_thought_kl/_mean": 0.5225830078125, "no_entropy_unprocessed_thought_kl/_median": 0.028111398220062256, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2383.171875, "processed_kl_no_entropy": 0.52264404296875, "reasoning_kl_max": 11.750732421875, "reasoning_kl_mean": 0.41750335693359375, "reward": -2.0256497859954834, "reward_std": 0.3024482727050781, "rewards/TeacherKLBasedReward": -2.0256497859954834, "solution_log_prob_reward": -0.4539605671307072, "step": 71, "thought_kl_scores": 6.083984375, "thought_processed_kl": 0.4172515869140625, "total_teacher_likelihood_reward": -2.0589925963431597, "total_tl_reward_no_entropy": -2.4060333045199513, "unprocessed_answer_log_prob/_first_quartile": -0.015707921236753464, "unprocessed_answer_log_prob/_last_quartile": -1.3969838619232178e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3038482666015625, "unprocessed_answer_log_prob/_median": -8.43685120344162e-06, "unprocessed_answer_log_prob/_min": -15.01123046875, "unprocessed_answer_log_prob/_sum": -112.814453125, "unprocessed_thought_kl/_first_quartile": 3.638723865151405e-05, "unprocessed_thought_kl/_last_quartile": 0.3366413116455078, "unprocessed_thought_kl/_max": 11.750732421875, "unprocessed_thought_kl/_mean": 0.41750335693359375, "unprocessed_thought_kl/_median": 0.014127731323242188, "unprocessed_thought_kl/_min": -4.347816467285156, "unprocessed_thought_kl/_sum": 1761.4375 }, { "answer_log_prob_mean": -0.10723114013671875, "answer_log_prob_min": -6.8343505859375, "completion_length": 5688.640625, "epoch": 0.0689407540394973, "grad_norm": 1.286134645798262, "kl": 0.0032358169555664062, "kl_reward": -1.0389651386067271, "kl_reward_no_entropy": -1.221608271356672, "kl_scores_no_entropy": 6.1190185546875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.046875, "no_entropy_reasoning_kl_max": 11.940673828125, "no_entropy_reasoning_kl_mean": 0.2877960205078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.51925802230835e-09, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.05274534225463867, "no_entropy_unprocessed_thought_kl/_max": 11.940673828125, "no_entropy_unprocessed_thought_kl/_mean": 0.2877960205078125, "no_entropy_unprocessed_thought_kl/_median": 3.5585835576057434e-05, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1414.0234375, "processed_kl_no_entropy": 0.2870635986328125, "reasoning_kl_max": 11.49755859375, "reasoning_kl_mean": 0.23134613037109375, "reward": -1.6832743883132935, "reward_std": 0.2378900796175003, "rewards/TeacherKLBasedReward": -1.6832743883132935, "solution_log_prob_reward": -0.17557464231504127, "step": 72, "thought_kl_scores": 5.870849609375, "thought_processed_kl": 0.23146820068359375, "total_teacher_likelihood_reward": -1.261414765380323, "total_tl_reward_no_entropy": -1.444057896733284, "unprocessed_answer_log_prob/_first_quartile": -2.2551044821739197e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10723114013671875, "unprocessed_answer_log_prob/_median": -7.450580596923828e-09, "unprocessed_answer_log_prob/_min": -6.8343505859375, "unprocessed_answer_log_prob/_sum": -77.13818359375, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.030379831790924072, "unprocessed_thought_kl/_max": 11.49755859375, "unprocessed_thought_kl/_mean": 0.23134613037109375, "unprocessed_thought_kl/_median": 8.393079042434692e-06, "unprocessed_thought_kl/_min": -2.837125778198242, "unprocessed_thought_kl/_sum": 895.0625 }, { "answer_log_prob_mean": -0.11773681640625, "answer_log_prob_min": -4.51318359375, "completion_length": 6083.8369140625, "epoch": 0.0698982645122681, "grad_norm": 0.30561457009970766, "kl": 0.002385556697845459, "kl_reward": -1.7825006302446127, "kl_reward_no_entropy": -1.8211047546938062, "kl_scores_no_entropy": 7.808837890625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 15.143798828125, "no_entropy_reasoning_kl_mean": 0.455596923828125, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.2032839953899384e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.20584487915039062, "no_entropy_unprocessed_thought_kl/_max": 15.143798828125, "no_entropy_unprocessed_thought_kl/_mean": 0.455596923828125, "no_entropy_unprocessed_thought_kl/_median": 0.0032502934336662292, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 622.234375, "processed_kl_no_entropy": 0.45513916015625, "reasoning_kl_max": 15.01513671875, "reasoning_kl_mean": 0.4440155029296875, "reward": -1.932433843612671, "reward_std": 0.2811543941497803, "rewards/TeacherKLBasedReward": -1.932433843612671, "solution_log_prob_reward": -0.16286865167785436, "step": 73, "thought_kl_scores": 7.7388916015625, "thought_processed_kl": 0.44493865966796875, "total_teacher_likelihood_reward": -1.9453692641109228, "total_tl_reward_no_entropy": -1.9839733866974711, "unprocessed_answer_log_prob/_first_quartile": -0.00016749277710914612, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.11773681640625, "unprocessed_answer_log_prob/_median": -2.7939677238464355e-09, "unprocessed_answer_log_prob/_min": -4.51318359375, "unprocessed_answer_log_prob/_sum": -36.3271484375, "unprocessed_thought_kl/_first_quartile": 2.821441739797592e-06, "unprocessed_thought_kl/_last_quartile": 0.19593191146850586, "unprocessed_thought_kl/_max": 15.01513671875, "unprocessed_thought_kl/_mean": 0.4440155029296875, "unprocessed_thought_kl/_median": 0.003011384978890419, "unprocessed_thought_kl/_min": -1.4219552464783192, "unprocessed_thought_kl/_sum": 605.484375 }, { "answer_log_prob_mean": -0.3094139099121094, "answer_log_prob_min": -14.09130859375, "completion_length": 7098.3125, "epoch": 0.0708557749850389, "grad_norm": 178.50793672497542, "kl": 0.12848186492919922, "kl_reward": -1.1762709189206362, "kl_reward_no_entropy": -1.6185905607417226, "kl_scores_no_entropy": 6.74365234375, "learning_rate": 1e-06, "loss": 0.0051, "match_reward": -0.46875, "no_entropy_reasoning_kl_max": 13.08349609375, "no_entropy_reasoning_kl_mean": 0.4086952209472656, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00014720717445015907, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.27188205905258656, "no_entropy_unprocessed_thought_kl/_max": 13.08349609375, "no_entropy_unprocessed_thought_kl/_mean": 0.4086952209472656, "no_entropy_unprocessed_thought_kl/_median": 0.017789803445339203, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4278.328125, "processed_kl_no_entropy": 0.4079780578613281, "reasoning_kl_max": 12.19677734375, "reasoning_kl_mean": 0.2701225280761719, "reward": -2.328500986099243, "reward_std": 0.22643806040287018, "rewards/TeacherKLBasedReward": -2.328500986099243, "solution_log_prob_reward": -0.45032699359580874, "step": 74, "thought_kl_scores": 6.2333984375, "thought_processed_kl": 0.2693977355957031, "total_teacher_likelihood_reward": -2.0953479316085577, "total_tl_reward_no_entropy": -2.5376675743609667, "unprocessed_answer_log_prob/_first_quartile": -0.03475697711110115, "unprocessed_answer_log_prob/_last_quartile": -7.450580596923828e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3094139099121094, "unprocessed_answer_log_prob/_median": -0.00034987926483154297, "unprocessed_answer_log_prob/_min": -14.09130859375, "unprocessed_answer_log_prob/_sum": -419.087890625, "unprocessed_thought_kl/_first_quartile": 2.766028046607971e-07, "unprocessed_thought_kl/_last_quartile": 0.12137746904045343, "unprocessed_thought_kl/_max": 12.19677734375, "unprocessed_thought_kl/_mean": 0.2701225280761719, "unprocessed_thought_kl/_median": 0.0014148382470011711, "unprocessed_thought_kl/_min": -3.2954559326171875, "unprocessed_thought_kl/_sum": 2120.296875 }, { "answer_log_prob_mean": -0.39764404296875, "answer_log_prob_min": -14.72412109375, "completion_length": 5659.5400390625, "epoch": 0.0718132854578097, "grad_norm": 5.304536708213684, "kl": 0.005004763603210449, "kl_reward": -1.2737237429246306, "kl_reward_no_entropy": -1.9098165770992637, "kl_scores_no_entropy": 8.1253662109375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0546875, "no_entropy_reasoning_kl_max": 15.774658203125, "no_entropy_reasoning_kl_mean": 0.47885894775390625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00015693739987909794, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3904848098754883, "no_entropy_unprocessed_thought_kl/_max": 15.774658203125, "no_entropy_unprocessed_thought_kl/_mean": 0.47885894775390625, "no_entropy_unprocessed_thought_kl/_median": 0.023367879912257195, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4031.234375, "processed_kl_no_entropy": 0.47864532470703125, "reasoning_kl_max": 14.425537109375, "reasoning_kl_mean": 0.2803192138671875, "reward": -1.8041210174560547, "reward_std": 0.23888634145259857, "rewards/TeacherKLBasedReward": -1.8041210174560547, "solution_log_prob_reward": -0.5448852509725839, "step": 75, "thought_kl_scores": 7.3531494140625, "thought_processed_kl": 0.28020477294921875, "total_teacher_likelihood_reward": -1.8732964918017387, "total_tl_reward_no_entropy": -2.509389329701662, "unprocessed_answer_log_prob/_first_quartile": -0.07367623597383499, "unprocessed_answer_log_prob/_last_quartile": -2.1187588572502136e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.39764404296875, "unprocessed_answer_log_prob/_median": -0.00011727958917617798, "unprocessed_answer_log_prob/_min": -14.72412109375, "unprocessed_answer_log_prob/_sum": -199.84765625, "unprocessed_thought_kl/_first_quartile": 1.3676472008228302e-06, "unprocessed_thought_kl/_last_quartile": 0.1810319423675537, "unprocessed_thought_kl/_max": 14.425537109375, "unprocessed_thought_kl/_mean": 0.2803192138671875, "unprocessed_thought_kl/_median": 0.003540473058819771, "unprocessed_thought_kl/_min": -5.56103515625, "unprocessed_thought_kl/_sum": 2299.734375 }, { "answer_log_prob_mean": -0.3465423583984375, "answer_log_prob_min": -12.6474609375, "completion_length": 5828.6982421875, "epoch": 0.07277079593058049, "grad_norm": 0.2957763851409548, "kl": 0.0026491284370422363, "kl_reward": -1.4012237461283803, "kl_reward_no_entropy": -2.1221813894808292, "kl_scores_no_entropy": 7.751953125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.03125, "no_entropy_reasoning_kl_max": 14.9501953125, "no_entropy_reasoning_kl_mean": 0.557891845703125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0003382486756891012, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.5347137451171875, "no_entropy_unprocessed_thought_kl/_max": 14.9501953125, "no_entropy_unprocessed_thought_kl/_mean": 0.557891845703125, "no_entropy_unprocessed_thought_kl/_median": 0.0445156991481781, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4092.1875, "processed_kl_no_entropy": 0.5580596923828125, "reasoning_kl_max": 13.10302734375, "reasoning_kl_mean": 0.3360443115234375, "reward": -1.8341878652572632, "reward_std": 0.2117013931274414, "rewards/TeacherKLBasedReward": -1.8341878652572632, "solution_log_prob_reward": -0.4730169659014791, "step": 76, "thought_kl_scores": 6.719970703125, "thought_processed_kl": 0.33611297607421875, "total_teacher_likelihood_reward": -1.9054907085373998, "total_tl_reward_no_entropy": -2.6264483649283648, "unprocessed_answer_log_prob/_first_quartile": -0.030602693557739258, "unprocessed_answer_log_prob/_last_quartile": -7.450580596923828e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3465423583984375, "unprocessed_answer_log_prob/_median": -4.738662391901016e-05, "unprocessed_answer_log_prob/_min": -12.6474609375, "unprocessed_answer_log_prob/_sum": -202.8515625, "unprocessed_thought_kl/_first_quartile": 1.5615951269865036e-06, "unprocessed_thought_kl/_last_quartile": 0.27353858947753906, "unprocessed_thought_kl/_max": 13.10302734375, "unprocessed_thought_kl/_mean": 0.3360443115234375, "unprocessed_thought_kl/_median": 0.0085370484739542, "unprocessed_thought_kl/_min": -5.133544921875, "unprocessed_thought_kl/_sum": 2330.40625 }, { "answer_log_prob_mean": -0.08110427856445312, "answer_log_prob_min": -4.16693115234375, "completion_length": 3928.8671875, "epoch": 0.07372830640335129, "grad_norm": 0.3018569324378454, "kl": 0.0020518898963928223, "kl_reward": -1.4782498008571565, "kl_reward_no_entropy": -1.5142437578178942, "kl_scores_no_entropy": 5.013671875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 9.607177734375, "no_entropy_reasoning_kl_mean": 0.4086761474609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.4808028936386108e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.15525054931640625, "no_entropy_unprocessed_thought_kl/_max": 9.607177734375, "no_entropy_unprocessed_thought_kl/_mean": 0.4086761474609375, "no_entropy_unprocessed_thought_kl/_median": 0.001150183379650116, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 547.375, "processed_kl_no_entropy": 0.4080352783203125, "reasoning_kl_max": 9.551025390625, "reasoning_kl_mean": 0.39723968505859375, "reward": -1.9067004919052124, "reward_std": 0.20047932863235474, "rewards/TeacherKLBasedReward": -1.9067004919052124, "solution_log_prob_reward": -0.1227735917782411, "step": 77, "thought_kl_scores": 4.9793701171875, "thought_processed_kl": 0.39702606201171875, "total_teacher_likelihood_reward": -1.6010234020650387, "total_tl_reward_no_entropy": -1.6370173571631312, "unprocessed_answer_log_prob/_first_quartile": -9.399536065757275e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.08110427856445312, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -4.16693115234375, "unprocessed_answer_log_prob/_sum": -22.6279296875, "unprocessed_thought_kl/_first_quartile": 6.82193785905838e-08, "unprocessed_thought_kl/_last_quartile": 0.14721012115478516, "unprocessed_thought_kl/_max": 9.551025390625, "unprocessed_thought_kl/_mean": 0.39723968505859375, "unprocessed_thought_kl/_median": 0.0010227393358945847, "unprocessed_thought_kl/_min": -1.416168600320816, "unprocessed_thought_kl/_sum": 530.0625 }, { "answer_log_prob_mean": -0.14611053466796875, "answer_log_prob_min": -6.1292724609375, "completion_length": 4046.3349609375, "epoch": 0.07468581687612208, "grad_norm": 0.3237210933425242, "kl": 0.002109527587890625, "kl_reward": -1.2169198305346072, "kl_reward_no_entropy": -1.4069164684042335, "kl_scores_no_entropy": 6.547119140625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.748046875, "no_entropy_reasoning_kl_mean": 0.34149169921875, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.919696271419525e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.13012123107910156, "no_entropy_unprocessed_thought_kl/_max": 12.748046875, "no_entropy_unprocessed_thought_kl/_mean": 0.34149169921875, "no_entropy_unprocessed_thought_kl/_median": 0.0005972208455204964, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1049.375, "processed_kl_no_entropy": 0.340911865234375, "reasoning_kl_max": 12.28564453125, "reasoning_kl_mean": 0.28278350830078125, "reward": -1.7005836963653564, "reward_std": 0.19062848389148712, "rewards/TeacherKLBasedReward": -1.7005836963653564, "solution_log_prob_reward": -0.20740325655788183, "step": 78, "thought_kl_scores": 6.28662109375, "thought_processed_kl": 0.2826690673828125, "total_teacher_likelihood_reward": -1.4243230763822794, "total_tl_reward_no_entropy": -1.6143197175115347, "unprocessed_answer_log_prob/_first_quartile": -0.010965828318148851, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.14611053466796875, "unprocessed_answer_log_prob/_median": -4.515983164310455e-06, "unprocessed_answer_log_prob/_min": -6.1292724609375, "unprocessed_answer_log_prob/_sum": -48.4189453125, "unprocessed_thought_kl/_first_quartile": 2.60770320892334e-08, "unprocessed_thought_kl/_last_quartile": 0.09534883499145508, "unprocessed_thought_kl/_max": 12.28564453125, "unprocessed_thought_kl/_mean": 0.28278350830078125, "unprocessed_thought_kl/_median": 0.0002824198454618454, "unprocessed_thought_kl/_min": -3.0170211791992188, "unprocessed_thought_kl/_sum": 733.640625 }, { "answer_log_prob_mean": -0.2882881164550781, "answer_log_prob_min": -9.620361328125, "completion_length": 6381.4140625, "epoch": 0.07564332734889288, "grad_norm": 0.2735509313965725, "kl": 0.0023647546768188477, "kl_reward": -1.548229071777314, "kl_reward_no_entropy": -1.770465093664825, "kl_scores_no_entropy": 6.057861328125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0078125, "no_entropy_reasoning_kl_max": 11.64306640625, "no_entropy_reasoning_kl_mean": 0.473724365234375, "no_entropy_unprocessed_thought_kl/_first_quartile": 5.561113357543945e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.28088974952697754, "no_entropy_unprocessed_thought_kl/_max": 11.64306640625, "no_entropy_unprocessed_thought_kl/_mean": 0.473724365234375, "no_entropy_unprocessed_thought_kl/_median": 0.011099250987172127, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1522.703125, "processed_kl_no_entropy": 0.473785400390625, "reasoning_kl_max": 10.80029296875, "reasoning_kl_mean": 0.40807342529296875, "reward": -1.8543866872787476, "reward_std": 0.2087193876504898, "rewards/TeacherKLBasedReward": -1.8543866872787476, "solution_log_prob_reward": -0.38449172652326524, "step": 79, "thought_kl_scores": 5.603759765625, "thought_processed_kl": 0.4097747802734375, "total_teacher_likelihood_reward": -1.940533253364265, "total_tl_reward_no_entropy": -2.16276928037405, "unprocessed_answer_log_prob/_first_quartile": -0.031277868431061506, "unprocessed_answer_log_prob/_last_quartile": -3.052409738302231e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2882881164550781, "unprocessed_answer_log_prob/_median": -0.00021146982908248901, "unprocessed_answer_log_prob/_min": -9.620361328125, "unprocessed_answer_log_prob/_sum": -92.4375, "unprocessed_thought_kl/_first_quartile": 4.8902351409196854e-05, "unprocessed_thought_kl/_last_quartile": 0.24635940790176392, "unprocessed_thought_kl/_max": 10.80029296875, "unprocessed_thought_kl/_mean": 0.40807342529296875, "unprocessed_thought_kl/_median": 0.010743970982730389, "unprocessed_thought_kl/_min": -2.7431775853037834, "unprocessed_thought_kl/_sum": 997.265625 }, { "answer_log_prob_mean": -0.19268417358398438, "answer_log_prob_min": -8.65625, "completion_length": 5965.259765625, "epoch": 0.07660083782166367, "grad_norm": 0.253485960158329, "kl": 0.0032181143760681152, "kl_reward": -1.1525024310685694, "kl_reward_no_entropy": -1.4088564980775118, "kl_scores_no_entropy": 5.8468017578125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.33642578125, "no_entropy_reasoning_kl_mean": 0.35625457763671875, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.6402994990348816e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.14527273178100586, "no_entropy_unprocessed_thought_kl/_max": 11.33642578125, "no_entropy_unprocessed_thought_kl/_mean": 0.35625457763671875, "no_entropy_unprocessed_thought_kl/_median": 0.0011567845940589905, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 711.1328125, "processed_kl_no_entropy": 0.3562469482421875, "reasoning_kl_max": 10.4443359375, "reasoning_kl_mean": 0.27972412109375, "reward": -1.7004259824752808, "reward_std": 0.2676074206829071, "rewards/TeacherKLBasedReward": -1.7004259824752808, "solution_log_prob_reward": -0.2792466724058613, "step": 80, "thought_kl_scores": 5.3616943359375, "thought_processed_kl": 0.27970123291015625, "total_teacher_likelihood_reward": -1.4317491063848138, "total_tl_reward_no_entropy": -1.6881031692028046, "unprocessed_answer_log_prob/_first_quartile": -0.0009224507957696915, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.19268417358398438, "unprocessed_answer_log_prob/_median": -9.592622518539429e-08, "unprocessed_answer_log_prob/_min": -8.65625, "unprocessed_answer_log_prob/_sum": -78.04296875, "unprocessed_thought_kl/_first_quartile": 4.563480615615845e-08, "unprocessed_thought_kl/_last_quartile": 0.09166336059570312, "unprocessed_thought_kl/_max": 10.4443359375, "unprocessed_thought_kl/_mean": 0.27972412109375, "unprocessed_thought_kl/_median": 0.00044204387813806534, "unprocessed_thought_kl/_min": -3.614044189453125, "unprocessed_thought_kl/_sum": 539.6015625 }, { "answer_log_prob_mean": -0.3775177001953125, "answer_log_prob_min": -11.416015625, "completion_length": 4996.2890625, "epoch": 0.07755834829443448, "grad_norm": 0.34606856999308255, "kl": 0.002635866403579712, "kl_reward": -1.2752004880458117, "kl_reward_no_entropy": -1.6844531074166298, "kl_scores_no_entropy": 6.5919189453125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.046875, "no_entropy_reasoning_kl_max": 12.75244140625, "no_entropy_reasoning_kl_mean": 0.4339599609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.550853580236435e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2951784133911133, "no_entropy_unprocessed_thought_kl/_max": 12.75244140625, "no_entropy_unprocessed_thought_kl/_mean": 0.4339599609375, "no_entropy_unprocessed_thought_kl/_median": 0.01170414686203003, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3153.78125, "processed_kl_no_entropy": 0.4341278076171875, "reasoning_kl_max": 11.004150390625, "reasoning_kl_mean": 0.31502532958984375, "reward": -1.6824357509613037, "reward_std": 0.1957823634147644, "rewards/TeacherKLBasedReward": -1.6824357509613037, "solution_log_prob_reward": -0.4916778551414609, "step": 81, "thought_kl_scores": 5.657958984375, "thought_processed_kl": 0.31465911865234375, "total_teacher_likelihood_reward": -1.8137533506378531, "total_tl_reward_no_entropy": -2.2230059755966067, "unprocessed_answer_log_prob/_first_quartile": -0.08282792195677757, "unprocessed_answer_log_prob/_last_quartile": -1.0989606380462646e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3775177001953125, "unprocessed_answer_log_prob/_median": -0.0005417615175247192, "unprocessed_answer_log_prob/_min": -11.416015625, "unprocessed_answer_log_prob/_sum": -121.23828125, "unprocessed_thought_kl/_first_quartile": 2.952292561531067e-07, "unprocessed_thought_kl/_last_quartile": 0.16666555404663086, "unprocessed_thought_kl/_max": 11.004150390625, "unprocessed_thought_kl/_mean": 0.31502532958984375, "unprocessed_thought_kl/_median": 0.0019467398524284363, "unprocessed_thought_kl/_min": -3.3169748038053513, "unprocessed_thought_kl/_sum": 1809.0390625 }, { "answer_log_prob_mean": -0.24287033081054688, "answer_log_prob_min": -10.5263671875, "completion_length": 4885.3994140625, "epoch": 0.07851585876720527, "grad_norm": 0.29488423087459953, "kl": 0.0028171539306640625, "kl_reward": -1.183600148651749, "kl_reward_no_entropy": -1.6202032482251525, "kl_scores_no_entropy": 6.9798583984375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.328125, "no_entropy_reasoning_kl_max": 13.55419921875, "no_entropy_reasoning_kl_mean": 0.4045257568359375, "no_entropy_unprocessed_thought_kl/_first_quartile": 4.223780706524849e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.225341796875, "no_entropy_unprocessed_thought_kl/_max": 13.55419921875, "no_entropy_unprocessed_thought_kl/_mean": 0.4045257568359375, "no_entropy_unprocessed_thought_kl/_median": 0.0036453083157539368, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3378.453125, "processed_kl_no_entropy": 0.4045867919921875, "reasoning_kl_max": 12.250732421875, "reasoning_kl_mean": 0.27202606201171875, "reward": -1.80086350440979, "reward_std": 0.22165364027023315, "rewards/TeacherKLBasedReward": -1.80086350440979, "solution_log_prob_reward": -0.3481340006692335, "step": 82, "thought_kl_scores": 6.26318359375, "thought_processed_kl": 0.2720489501953125, "total_teacher_likelihood_reward": -1.8598591471090913, "total_tl_reward_no_entropy": -2.296462249942124, "unprocessed_answer_log_prob/_first_quartile": -0.00819836650043726, "unprocessed_answer_log_prob/_last_quartile": -8.614733815193176e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.24287033081054688, "unprocessed_answer_log_prob/_median": -7.188878953456879e-06, "unprocessed_answer_log_prob/_min": -10.5263671875, "unprocessed_answer_log_prob/_sum": -113.087890625, "unprocessed_thought_kl/_first_quartile": 2.3283064365386963e-08, "unprocessed_thought_kl/_last_quartile": 0.11253118515014648, "unprocessed_thought_kl/_max": 12.250732421875, "unprocessed_thought_kl/_mean": 0.27202606201171875, "unprocessed_thought_kl/_median": 0.0005807047709822655, "unprocessed_thought_kl/_min": -4.3671875, "unprocessed_thought_kl/_sum": 1784.859375 }, { "answer_log_prob_mean": -0.28736114501953125, "answer_log_prob_min": -7.9169921875, "completion_length": 5791.5576171875, "epoch": 0.07947336923997607, "grad_norm": 3.4047215627944807, "kl": 0.01745140552520752, "kl_reward": -1.336912504862994, "kl_reward_no_entropy": -1.715866668149829, "kl_scores_no_entropy": 7.217529296875, "learning_rate": 1e-06, "loss": 0.0007, "match_reward": -0.3125, "no_entropy_reasoning_kl_max": 13.98876953125, "no_entropy_reasoning_kl_mean": 0.43206787109375, "no_entropy_unprocessed_thought_kl/_first_quartile": 5.774572491645813e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.28925514221191406, "no_entropy_unprocessed_thought_kl/_max": 13.98876953125, "no_entropy_unprocessed_thought_kl/_mean": 0.43206787109375, "no_entropy_unprocessed_thought_kl/_median": 0.01148109883069992, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3544.28125, "processed_kl_no_entropy": 0.43133544921875, "reasoning_kl_max": 13.16650390625, "reasoning_kl_mean": 0.31397247314453125, "reward": -1.9730294942855835, "reward_std": 0.25891056656837463, "rewards/TeacherKLBasedReward": -1.9730294942855835, "solution_log_prob_reward": -0.3665310712531209, "step": 83, "thought_kl_scores": 6.747802734375, "thought_processed_kl": 0.31394195556640625, "total_teacher_likelihood_reward": -2.015943550504744, "total_tl_reward_no_entropy": -2.3948977096006274, "unprocessed_answer_log_prob/_first_quartile": -0.03590047359466553, "unprocessed_answer_log_prob/_last_quartile": -1.1129304766654968e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.28736114501953125, "unprocessed_answer_log_prob/_median": -0.00014277920126914978, "unprocessed_answer_log_prob/_min": -7.9169921875, "unprocessed_answer_log_prob/_sum": -110.619140625, "unprocessed_thought_kl/_first_quartile": 5.399342626333237e-07, "unprocessed_thought_kl/_last_quartile": 0.16654396057128906, "unprocessed_thought_kl/_max": 13.16650390625, "unprocessed_thought_kl/_mean": 0.31397247314453125, "unprocessed_thought_kl/_median": 0.0016708187758922577, "unprocessed_thought_kl/_min": -3.034928321838379, "unprocessed_thought_kl/_sum": 1801.453125 }, { "answer_log_prob_mean": -0.25531005859375, "answer_log_prob_min": -13.376220703125, "completion_length": 6721.5654296875, "epoch": 0.08043087971274686, "grad_norm": 0.2879676114580942, "kl": 0.0035854578018188477, "kl_reward": -1.4215658609755337, "kl_reward_no_entropy": -1.7120489459484816, "kl_scores_no_entropy": 6.842529296875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0625, "no_entropy_reasoning_kl_max": 13.240478515625, "no_entropy_reasoning_kl_mean": 0.4382781982421875, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.163515642285347e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.21844053268432617, "no_entropy_unprocessed_thought_kl/_max": 13.240478515625, "no_entropy_unprocessed_thought_kl/_mean": 0.4382781982421875, "no_entropy_unprocessed_thought_kl/_median": 0.006232475861907005, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1796.09375, "processed_kl_no_entropy": 0.4383087158203125, "reasoning_kl_max": 12.088134765625, "reasoning_kl_mean": 0.35297393798828125, "reward": -1.8602304458618164, "reward_std": 0.2674080729484558, "rewards/TeacherKLBasedReward": -1.8602304458618164, "solution_log_prob_reward": -0.38907226163428277, "step": 84, "thought_kl_scores": 6.2236328125, "thought_processed_kl": 0.35357666015625, "total_teacher_likelihood_reward": -1.8731381352990866, "total_tl_reward_no_entropy": -2.1636212151497602, "unprocessed_answer_log_prob/_first_quartile": -0.012360347900539637, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.25531005859375, "unprocessed_answer_log_prob/_median": -1.4817342162132263e-06, "unprocessed_answer_log_prob/_min": -13.376220703125, "unprocessed_answer_log_prob/_sum": -103.548828125, "unprocessed_thought_kl/_first_quartile": 4.897592589259148e-06, "unprocessed_thought_kl/_last_quartile": 0.16042232513427734, "unprocessed_thought_kl/_max": 12.088134765625, "unprocessed_thought_kl/_mean": 0.35297393798828125, "unprocessed_thought_kl/_median": 0.004798226989805698, "unprocessed_thought_kl/_min": -3.4828948974609375, "unprocessed_thought_kl/_sum": 1084.7265625 }, { "answer_log_prob_mean": -0.36254119873046875, "answer_log_prob_min": -13.56396484375, "completion_length": 6849.8232421875, "epoch": 0.08138839018551765, "grad_norm": 0.22304247261750126, "kl": 0.003109574317932129, "kl_reward": -1.1536990371532738, "kl_reward_no_entropy": -1.678874815814197, "kl_scores_no_entropy": 7.66943359375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.1875, "no_entropy_reasoning_kl_max": 14.92626953125, "no_entropy_reasoning_kl_mean": 0.41036224365234375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.8088845536112785e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2649729251861572, "no_entropy_unprocessed_thought_kl/_max": 14.92626953125, "no_entropy_unprocessed_thought_kl/_mean": 0.41036224365234375, "no_entropy_unprocessed_thought_kl/_median": 0.00893627293407917, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3685.34375, "processed_kl_no_entropy": 0.4103240966796875, "reasoning_kl_max": 13.765625, "reasoning_kl_mean": 0.24691009521484375, "reward": -1.8498295545578003, "reward_std": 0.2882581353187561, "rewards/TeacherKLBasedReward": -1.8498295545578003, "solution_log_prob_reward": -0.4981808424927294, "step": 85, "thought_kl_scores": 7.0093994140625, "thought_processed_kl": 0.24683380126953125, "total_teacher_likelihood_reward": -1.8393798768520355, "total_tl_reward_no_entropy": -2.3645556615665555, "unprocessed_answer_log_prob/_first_quartile": -0.04217783804051578, "unprocessed_answer_log_prob/_last_quartile": -2.337619662284851e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.36254119873046875, "unprocessed_answer_log_prob/_median": -0.00019700080156326294, "unprocessed_answer_log_prob/_min": -13.56396484375, "unprocessed_answer_log_prob/_sum": -238.01171875, "unprocessed_thought_kl/_first_quartile": 8.66129994392395e-08, "unprocessed_thought_kl/_last_quartile": 0.11148953437805176, "unprocessed_thought_kl/_max": 13.765625, "unprocessed_thought_kl/_mean": 0.24691009521484375, "unprocessed_thought_kl/_median": 0.0007159281522035599, "unprocessed_thought_kl/_min": -4.8096923828125, "unprocessed_thought_kl/_sum": 1934.828125 }, { "answer_log_prob_mean": -0.5243377685546875, "answer_log_prob_min": -17.25341796875, "completion_length": 7843.9619140625, "epoch": 0.08234590065828845, "grad_norm": 0.24778739569254224, "kl": 0.003593742847442627, "kl_reward": -1.1794555597007275, "kl_reward_no_entropy": -1.9963586330413818, "kl_scores_no_entropy": 8.137939453125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.4921875, "no_entropy_reasoning_kl_max": 15.76708984375, "no_entropy_reasoning_kl_mean": 0.507781982421875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0002780044451355934, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4610404968261719, "no_entropy_unprocessed_thought_kl/_max": 15.76708984375, "no_entropy_unprocessed_thought_kl/_mean": 0.507781982421875, "no_entropy_unprocessed_thought_kl/_median": 0.03421831130981445, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 6612.0625, "processed_kl_no_entropy": 0.5076751708984375, "reasoning_kl_max": 14.06494140625, "reasoning_kl_mean": 0.25250244140625, "reward": -1.852046251296997, "reward_std": 0.2674240171909332, "rewards/TeacherKLBasedReward": -1.852046251296997, "solution_log_prob_reward": -0.6968719458673149, "step": 86, "thought_kl_scores": 7.15966796875, "thought_processed_kl": 0.25243377685546875, "total_teacher_likelihood_reward": -2.368515002541244, "total_tl_reward_no_entropy": -3.1854180824011564, "unprocessed_answer_log_prob/_first_quartile": -0.17858004570007324, "unprocessed_answer_log_prob/_last_quartile": -4.068948328495026e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.5243377685546875, "unprocessed_answer_log_prob/_median": -0.002795932814478874, "unprocessed_answer_log_prob/_min": -17.25341796875, "unprocessed_answer_log_prob/_sum": -267.75, "unprocessed_thought_kl/_first_quartile": 3.164168447256088e-07, "unprocessed_thought_kl/_last_quartile": 0.17686939239501953, "unprocessed_thought_kl/_max": 14.06494140625, "unprocessed_thought_kl/_mean": 0.25250244140625, "unprocessed_thought_kl/_median": 0.002841651439666748, "unprocessed_thought_kl/_min": -5.6129150390625, "unprocessed_thought_kl/_sum": 3184.3125 }, { "answer_log_prob_mean": -0.23639678955078125, "answer_log_prob_min": -11.139404296875, "completion_length": 4990.1103515625, "epoch": 0.08330341113105924, "grad_norm": 0.27300780581286466, "kl": 0.003909111022949219, "kl_reward": -1.1793264648877084, "kl_reward_no_entropy": -1.8856100398115814, "kl_scores_no_entropy": 8.3740234375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.15625, "no_entropy_reasoning_kl_max": 16.2861328125, "no_entropy_reasoning_kl_mean": 0.46567535400390625, "no_entropy_unprocessed_thought_kl/_first_quartile": 8.462858386337757e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3680345034226775, "no_entropy_unprocessed_thought_kl/_max": 16.2861328125, "no_entropy_unprocessed_thought_kl/_mean": 0.46567535400390625, "no_entropy_unprocessed_thought_kl/_median": 0.018449366092681885, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4700.6875, "processed_kl_no_entropy": 0.46552276611328125, "reasoning_kl_max": 14.9189453125, "reasoning_kl_mean": 0.24391937255859375, "reward": -1.895204782485962, "reward_std": 0.274730920791626, "rewards/TeacherKLBasedReward": -1.895204782485962, "solution_log_prob_reward": -0.3477908297209069, "step": 87, "thought_kl_scores": 7.58203125, "thought_processed_kl": 0.243804931640625, "total_teacher_likelihood_reward": -1.6833672923967242, "total_tl_reward_no_entropy": -2.389650870114565, "unprocessed_answer_log_prob/_first_quartile": -0.009359300136566162, "unprocessed_answer_log_prob/_last_quartile": -8.381903171539307e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.23639678955078125, "unprocessed_answer_log_prob/_median": -1.139659434556961e-05, "unprocessed_answer_log_prob/_min": -11.139404296875, "unprocessed_answer_log_prob/_sum": -159.9453125, "unprocessed_thought_kl/_first_quartile": 2.4493783712387085e-07, "unprocessed_thought_kl/_last_quartile": 0.13800478260964155, "unprocessed_thought_kl/_max": 14.9189453125, "unprocessed_thought_kl/_mean": 0.24391937255859375, "unprocessed_thought_kl/_median": 0.0013163462281227112, "unprocessed_thought_kl/_min": -5.3431396484375, "unprocessed_thought_kl/_sum": 2406.75 }, { "answer_log_prob_mean": -0.40886688232421875, "answer_log_prob_min": -11.6688232421875, "completion_length": 7541.826171875, "epoch": 0.08426092160383004, "grad_norm": 0.2471765728226046, "kl": 0.004180431365966797, "kl_reward": -1.0891891359351575, "kl_reward_no_entropy": -1.6098989816382527, "kl_scores_no_entropy": 7.569091796875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.5, "no_entropy_reasoning_kl_max": 14.748046875, "no_entropy_reasoning_kl_mean": 0.38915252685546875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00010650837793946266, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2479569911956787, "no_entropy_unprocessed_thought_kl/_max": 14.748046875, "no_entropy_unprocessed_thought_kl/_mean": 0.38915252685546875, "no_entropy_unprocessed_thought_kl/_median": 0.01642279140651226, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4571.15625, "processed_kl_no_entropy": 0.3890380859375, "reasoning_kl_max": 13.438720703125, "reasoning_kl_mean": 0.22867584228515625, "reward": -1.871209740638733, "reward_std": 0.25296729803085327, "rewards/TeacherKLBasedReward": -1.871209740638733, "solution_log_prob_reward": -0.5255551129812375, "step": 88, "thought_kl_scores": 6.835205078125, "thought_processed_kl": 0.22864532470703125, "total_teacher_likelihood_reward": -2.11474425252527, "total_tl_reward_no_entropy": -2.6354540968313813, "unprocessed_answer_log_prob/_first_quartile": -0.14676999673247337, "unprocessed_answer_log_prob/_last_quartile": -5.404697731137276e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.40886688232421875, "unprocessed_answer_log_prob/_median": -0.002447132021188736, "unprocessed_answer_log_prob/_min": -11.6688232421875, "unprocessed_answer_log_prob/_sum": -249.7822265625, "unprocessed_thought_kl/_first_quartile": 4.307366907596588e-08, "unprocessed_thought_kl/_last_quartile": 0.08729821443557739, "unprocessed_thought_kl/_max": 13.438720703125, "unprocessed_thought_kl/_mean": 0.22867584228515625, "unprocessed_thought_kl/_median": 0.0007869889959692955, "unprocessed_thought_kl/_min": -4.86187744140625, "unprocessed_thought_kl/_sum": 2211.40625 }, { "answer_log_prob_mean": -0.3504180908203125, "answer_log_prob_min": -13.22412109375, "completion_length": 7705.392578125, "epoch": 0.08521843207660083, "grad_norm": 0.2226748082825164, "kl": 0.0040045976638793945, "kl_reward": -1.136942128650844, "kl_reward_no_entropy": -1.6710974061861634, "kl_scores_no_entropy": 7.565673828125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0390625, "no_entropy_reasoning_kl_max": 14.72119140625, "no_entropy_reasoning_kl_mean": 0.409820556640625, "no_entropy_unprocessed_thought_kl/_first_quartile": 5.4694246500730515e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.22734832763671875, "no_entropy_unprocessed_thought_kl/_max": 14.72119140625, "no_entropy_unprocessed_thought_kl/_mean": 0.409820556640625, "no_entropy_unprocessed_thought_kl/_median": 0.0038117729127407074, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3142.3125, "processed_kl_no_entropy": 0.4095916748046875, "reasoning_kl_max": 13.2490234375, "reasoning_kl_mean": 0.246490478515625, "reward": -1.6878085136413574, "reward_std": 0.2577773332595825, "rewards/TeacherKLBasedReward": -1.6878085136413574, "solution_log_prob_reward": -0.48265930043999106, "step": 89, "thought_kl_scores": 6.74951171875, "thought_processed_kl": 0.246368408203125, "total_teacher_likelihood_reward": -1.6586639201268554, "total_tl_reward_no_entropy": -2.192819212563336, "unprocessed_answer_log_prob/_first_quartile": -0.061917065642774105, "unprocessed_answer_log_prob/_last_quartile": -2.3283064365386963e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3504180908203125, "unprocessed_answer_log_prob/_median": -6.815232336521149e-05, "unprocessed_answer_log_prob/_min": -13.22412109375, "unprocessed_answer_log_prob/_sum": -143.2265625, "unprocessed_thought_kl/_first_quartile": 8.381903171539307e-08, "unprocessed_thought_kl/_last_quartile": 0.08827757835388184, "unprocessed_thought_kl/_max": 13.2490234375, "unprocessed_thought_kl/_mean": 0.246490478515625, "unprocessed_thought_kl/_median": 0.0002745548263192177, "unprocessed_thought_kl/_min": -5.2039794921875, "unprocessed_thought_kl/_sum": 1851.15625 }, { "answer_log_prob_mean": -0.20010757446289062, "answer_log_prob_min": -10.6015625, "completion_length": 7502.5615234375, "epoch": 0.08617594254937164, "grad_norm": 87.94153489047, "kl": 0.050749897956848145, "kl_reward": -1.2975292694754899, "kl_reward_no_entropy": -1.8725042482838035, "kl_scores_no_entropy": 8.273681640625, "learning_rate": 1e-06, "loss": 0.002, "match_reward": -0.40625, "no_entropy_reasoning_kl_max": 16.095703125, "no_entropy_reasoning_kl_mean": 0.4632110595703125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0001857355237007141, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3388633728027344, "no_entropy_unprocessed_thought_kl/_max": 16.095703125, "no_entropy_unprocessed_thought_kl/_mean": 0.4632110595703125, "no_entropy_unprocessed_thought_kl/_median": 0.020913437008857727, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4765.375, "processed_kl_no_entropy": 0.463134765625, "reasoning_kl_max": 15.0771484375, "reasoning_kl_mean": 0.28173828125, "reward": -1.8382683992385864, "reward_std": 0.25067082047462463, "rewards/TeacherKLBasedReward": -1.8382683992385864, "solution_log_prob_reward": -0.30612319766078144, "step": 90, "thought_kl_scores": 7.680908203125, "thought_processed_kl": 0.28173065185546875, "total_teacher_likelihood_reward": -2.0099024679511786, "total_tl_reward_no_entropy": -2.5848774537444115, "unprocessed_answer_log_prob/_first_quartile": -0.0007590828463435173, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.20010757446289062, "unprocessed_answer_log_prob/_median": -1.9185245037078857e-06, "unprocessed_answer_log_prob/_min": -10.6015625, "unprocessed_answer_log_prob/_sum": -249.775390625, "unprocessed_thought_kl/_first_quartile": 4.470348358154297e-08, "unprocessed_thought_kl/_last_quartile": 0.14407730102539062, "unprocessed_thought_kl/_max": 15.0771484375, "unprocessed_thought_kl/_mean": 0.28173828125, "unprocessed_thought_kl/_median": 0.001225341111421585, "unprocessed_thought_kl/_min": -4.495122909545898, "unprocessed_thought_kl/_sum": 2235.71875 }, { "answer_log_prob_mean": -0.21294784545898438, "answer_log_prob_min": -7.9019775390625, "completion_length": 6073.244140625, "epoch": 0.08713345302214243, "grad_norm": 0.511155081311373, "kl": 0.004072368144989014, "kl_reward": -1.2279771333560348, "kl_reward_no_entropy": -1.6270628217607737, "kl_scores_no_entropy": 7.679443359375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.0625, "no_entropy_reasoning_kl_max": 14.96044921875, "no_entropy_reasoning_kl_mean": 0.3927497863769531, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.3869797587394714e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19328244449570775, "no_entropy_unprocessed_thought_kl/_max": 14.96044921875, "no_entropy_unprocessed_thought_kl/_mean": 0.3927497863769531, "no_entropy_unprocessed_thought_kl/_median": 0.0037011094391345978, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2108.65625, "processed_kl_no_entropy": 0.3925514221191406, "reasoning_kl_max": 14.032470703125, "reasoning_kl_mean": 0.2690010070800781, "reward": -1.9360074996948242, "reward_std": 0.23080134391784668, "rewards/TeacherKLBasedReward": -1.9360074996948242, "solution_log_prob_reward": -0.2919676183955744, "step": 91, "thought_kl_scores": 7.1495361328125, "thought_processed_kl": 0.2688407897949219, "total_teacher_likelihood_reward": -1.5824447581544518, "total_tl_reward_no_entropy": -1.9815304400399327, "unprocessed_answer_log_prob/_first_quartile": -0.008148878812789917, "unprocessed_answer_log_prob/_last_quartile": -1.6298145055770874e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.21294784545898438, "unprocessed_answer_log_prob/_median": -1.298263669013977e-06, "unprocessed_answer_log_prob/_min": -7.9019775390625, "unprocessed_answer_log_prob/_sum": -89.7607421875, "unprocessed_thought_kl/_first_quartile": 3.534369170665741e-07, "unprocessed_thought_kl/_last_quartile": 0.09964234568178654, "unprocessed_thought_kl/_max": 14.032470703125, "unprocessed_thought_kl/_mean": 0.2690010070800781, "unprocessed_thought_kl/_median": 0.0009682383388280869, "unprocessed_thought_kl/_min": -4.522918701171875, "unprocessed_thought_kl/_sum": 1356.6875 }, { "answer_log_prob_mean": -0.3280982971191406, "answer_log_prob_min": -8.7392578125, "completion_length": 7748.9453125, "epoch": 0.08809096349491323, "grad_norm": 0.7169966295928093, "kl": 0.0054454803466796875, "kl_reward": -1.298070352524519, "kl_reward_no_entropy": -1.8032015864737332, "kl_scores_no_entropy": 7.1322021484375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.171875, "no_entropy_reasoning_kl_max": 13.8046875, "no_entropy_reasoning_kl_mean": 0.46302032470703125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00022331904619932175, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3323221206665039, "no_entropy_unprocessed_thought_kl/_max": 13.8046875, "no_entropy_unprocessed_thought_kl/_mean": 0.46302032470703125, "no_entropy_unprocessed_thought_kl/_median": 0.026368528604507446, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3590.7890625, "processed_kl_no_entropy": 0.46292877197265625, "reasoning_kl_max": 12.7216796875, "reasoning_kl_mean": 0.30547332763671875, "reward": -1.8369691371917725, "reward_std": 0.31943684816360474, "rewards/TeacherKLBasedReward": -1.8369691371917725, "solution_log_prob_reward": -0.4154908722266555, "step": 92, "thought_kl_scores": 6.5123291015625, "thought_processed_kl": 0.3050537109375, "total_teacher_likelihood_reward": -1.8854362377896905, "total_tl_reward_no_entropy": -2.3905674712732434, "unprocessed_answer_log_prob/_first_quartile": -0.08928312361240387, "unprocessed_answer_log_prob/_last_quartile": -2.9569491744041443e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3280982971191406, "unprocessed_answer_log_prob/_median": -0.000467127189040184, "unprocessed_answer_log_prob/_min": -8.7392578125, "unprocessed_answer_log_prob/_sum": -150.6328125, "unprocessed_thought_kl/_first_quartile": 2.8870999813079834e-08, "unprocessed_thought_kl/_last_quartile": 0.14889240264892578, "unprocessed_thought_kl/_max": 12.7216796875, "unprocessed_thought_kl/_mean": 0.30547332763671875, "unprocessed_thought_kl/_median": 0.001787399873137474, "unprocessed_thought_kl/_min": -3.6865081787109375, "unprocessed_thought_kl/_sum": 1716.67578125 }, { "answer_log_prob_mean": -0.137908935546875, "answer_log_prob_min": -9.8048095703125, "completion_length": 6976.2880859375, "epoch": 0.08904847396768402, "grad_norm": 0.6664292657763075, "kl": 0.006201386451721191, "kl_reward": -1.0020528994500637, "kl_reward_no_entropy": -1.2343762093223631, "kl_scores_no_entropy": 5.65185546875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.003662109375, "no_entropy_reasoning_kl_mean": 0.301422119140625, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.5599805414676666e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10168051719665527, "no_entropy_unprocessed_thought_kl/_max": 11.003662109375, "no_entropy_unprocessed_thought_kl/_mean": 0.301422119140625, "no_entropy_unprocessed_thought_kl/_median": 0.0007828855887055397, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 535.015625, "processed_kl_no_entropy": 0.30145263671875, "reasoning_kl_max": 10.44873046875, "reasoning_kl_mean": 0.22953033447265625, "reward": -1.7157937288284302, "reward_std": 0.25127851963043213, "rewards/TeacherKLBasedReward": -1.7157937288284302, "solution_log_prob_reward": -0.23595702811144292, "step": 93, "thought_kl_scores": 5.3394775390625, "thought_processed_kl": 0.2294769287109375, "total_teacher_likelihood_reward": -1.238009927328676, "total_tl_reward_no_entropy": -1.4703332297503948, "unprocessed_answer_log_prob/_first_quartile": -5.469750612974167e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.137908935546875, "unprocessed_answer_log_prob/_median": -9.313225746154785e-10, "unprocessed_answer_log_prob/_min": -9.8048095703125, "unprocessed_answer_log_prob/_sum": -74.869140625, "unprocessed_thought_kl/_first_quartile": 6.193295121192932e-08, "unprocessed_thought_kl/_last_quartile": 0.05778396129608154, "unprocessed_thought_kl/_max": 10.44873046875, "unprocessed_thought_kl/_mean": 0.22953033447265625, "unprocessed_thought_kl/_median": 0.00024539418518543243, "unprocessed_thought_kl/_min": -3.129180908203125, "unprocessed_thought_kl/_sum": 397.4140625 }, { "answer_log_prob_mean": -0.5115203857421875, "answer_log_prob_min": -15.67236328125, "completion_length": 6559.359375, "epoch": 0.09000598444045482, "grad_norm": 0.2639574622741154, "kl": 0.0040940046310424805, "kl_reward": -1.2325973403640091, "kl_reward_no_entropy": -2.1506927320733666, "kl_scores_no_entropy": 9.010009765625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.3046875, "no_entropy_reasoning_kl_max": 17.470703125, "no_entropy_reasoning_kl_mean": 0.5421905517578125, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0007861359044909477, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4831218719482422, "no_entropy_unprocessed_thought_kl/_max": 17.470703125, "no_entropy_unprocessed_thought_kl/_mean": 0.5421905517578125, "no_entropy_unprocessed_thought_kl/_median": 0.0421622134745121, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 7155.3125, "processed_kl_no_entropy": 0.5425262451171875, "reasoning_kl_max": 14.4287109375, "reasoning_kl_mean": 0.26657867431640625, "reward": -1.7899671792984009, "reward_std": 0.24448524415493011, "rewards/TeacherKLBasedReward": -1.7899671792984009, "solution_log_prob_reward": -0.668244015192613, "step": 94, "thought_kl_scores": 7.34619140625, "thought_processed_kl": 0.266571044921875, "total_teacher_likelihood_reward": -2.205528852529824, "total_tl_reward_no_entropy": -3.1236242465674877, "unprocessed_answer_log_prob/_first_quartile": -0.04597488045692444, "unprocessed_answer_log_prob/_last_quartile": -3.1851232051849365e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.5115203857421875, "unprocessed_answer_log_prob/_median": -0.00031276140362024307, "unprocessed_answer_log_prob/_min": -15.67236328125, "unprocessed_answer_log_prob/_sum": -257.5234375, "unprocessed_thought_kl/_first_quartile": -1.2386590242385864e-07, "unprocessed_thought_kl/_last_quartile": 0.17635869979858398, "unprocessed_thought_kl/_max": 14.4287109375, "unprocessed_thought_kl/_mean": 0.26657867431640625, "unprocessed_thought_kl/_median": 0.0028645629063248634, "unprocessed_thought_kl/_min": -5.5723876953125, "unprocessed_thought_kl/_sum": 3434.5625 }, { "answer_log_prob_mean": -0.20281600952148438, "answer_log_prob_min": -9.1171875, "completion_length": 8930.3203125, "epoch": 0.09096349491322561, "grad_norm": 0.2216656900097957, "kl": 0.005129098892211914, "kl_reward": -1.3280044491402805, "kl_reward_no_entropy": -1.650657344609499, "kl_scores_no_entropy": 6.5557861328125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.695556640625, "no_entropy_reasoning_kl_mean": 0.4232635498046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00011092657223343849, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2626070976257324, "no_entropy_unprocessed_thought_kl/_max": 12.695556640625, "no_entropy_unprocessed_thought_kl/_mean": 0.4232635498046875, "no_entropy_unprocessed_thought_kl/_median": 0.016583358868956566, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1170.375, "processed_kl_no_entropy": 0.4234619140625, "reasoning_kl_max": 12.05322265625, "reasoning_kl_mean": 0.32213592529296875, "reward": -1.9518935680389404, "reward_std": 0.31954649090766907, "rewards/TeacherKLBasedReward": -1.9518935680389404, "solution_log_prob_reward": -0.29398788284743205, "step": 95, "thought_kl_scores": 6.1846923828125, "thought_processed_kl": 0.322723388671875, "total_teacher_likelihood_reward": -1.6219923188909888, "total_tl_reward_no_entropy": -1.9446452176198363, "unprocessed_answer_log_prob/_first_quartile": -0.02391798747703433, "unprocessed_answer_log_prob/_last_quartile": -5.587935447692871e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.20281600952148438, "unprocessed_answer_log_prob/_median": -7.530674338340759e-05, "unprocessed_answer_log_prob/_min": -9.1171875, "unprocessed_answer_log_prob/_sum": -93.4453125, "unprocessed_thought_kl/_first_quartile": 3.914814442396164e-06, "unprocessed_thought_kl/_last_quartile": 0.15549898147583008, "unprocessed_thought_kl/_max": 12.05322265625, "unprocessed_thought_kl/_mean": 0.32213592529296875, "unprocessed_thought_kl/_median": 0.004055105149745941, "unprocessed_thought_kl/_min": -3.0344772338867188, "unprocessed_thought_kl/_sum": 813.359375 }, { "answer_log_prob_mean": -0.20633316040039062, "answer_log_prob_min": -8.7158203125, "completion_length": 7762.4453125, "epoch": 0.09192100538599642, "grad_norm": 1.0602501621939449, "kl": 0.0062721967697143555, "kl_reward": -1.1763821397908032, "kl_reward_no_entropy": -1.4339089882560074, "kl_scores_no_entropy": 5.3104248046875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 10.245849609375, "no_entropy_reasoning_kl_mean": 0.37551116943359375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.4225406125187874e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2293863296508789, "no_entropy_unprocessed_thought_kl/_max": 10.245849609375, "no_entropy_unprocessed_thought_kl/_mean": 0.37551116943359375, "no_entropy_unprocessed_thought_kl/_median": 0.011277128010988235, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 329.703125, "processed_kl_no_entropy": 0.37546539306640625, "reasoning_kl_max": 9.309326171875, "reasoning_kl_mean": 0.29903411865234375, "reward": -1.8174201250076294, "reward_std": 0.28991371393203735, "rewards/TeacherKLBasedReward": -1.8174201250076294, "solution_log_prob_reward": -0.2934913592762314, "step": 96, "thought_kl_scores": 4.804931640625, "thought_processed_kl": 0.29912567138671875, "total_teacher_likelihood_reward": -1.469873498659581, "total_tl_reward_no_entropy": -1.7274003475904465, "unprocessed_answer_log_prob/_first_quartile": -0.009472152451053262, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.20633316040039062, "unprocessed_answer_log_prob/_median": -1.2576580047607422e-05, "unprocessed_answer_log_prob/_min": -8.7158203125, "unprocessed_answer_log_prob/_sum": -67.50341796875, "unprocessed_thought_kl/_first_quartile": 5.105976015329361e-06, "unprocessed_thought_kl/_last_quartile": 0.16108942031860352, "unprocessed_thought_kl/_max": 9.309326171875, "unprocessed_thought_kl/_mean": 0.29903411865234375, "unprocessed_thought_kl/_median": 0.00505401473492384, "unprocessed_thought_kl/_min": -2.9017181396484375, "unprocessed_thought_kl/_sum": 260.796875 }, { "answer_log_prob_mean": -0.23789215087890625, "answer_log_prob_min": -11.887451171875, "completion_length": 7468.3955078125, "epoch": 0.0928785158587672, "grad_norm": 0.24364712304849398, "kl": 0.0019916892051696777, "kl_reward": -1.0717190455179662, "kl_reward_no_entropy": -1.4934846335090697, "kl_scores_no_entropy": 7.12841796875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0078125, "no_entropy_reasoning_kl_max": 13.89453125, "no_entropy_reasoning_kl_mean": 0.3588829040527344, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.266403943300247e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.20410623867064714, "no_entropy_unprocessed_thought_kl/_max": 13.89453125, "no_entropy_unprocessed_thought_kl/_mean": 0.3588829040527344, "no_entropy_unprocessed_thought_kl/_median": 0.007529047317802906, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1612.8125, "processed_kl_no_entropy": 0.3590049743652344, "reasoning_kl_max": 12.253662109375, "reasoning_kl_mean": 0.23470306396484375, "reward": -1.8210334777832031, "reward_std": 0.24422413110733032, "rewards/TeacherKLBasedReward": -1.8210334777832031, "solution_log_prob_reward": -0.3567666590679437, "step": 97, "thought_kl_scores": 6.242919921875, "thought_processed_kl": 0.23462677001953125, "total_teacher_likelihood_reward": -1.4362982003949583, "total_tl_reward_no_entropy": -1.8580637956038117, "unprocessed_answer_log_prob/_first_quartile": -0.0022887028753757477, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.23789215087890625, "unprocessed_answer_log_prob/_median": -2.016313374042511e-06, "unprocessed_answer_log_prob/_min": -11.887451171875, "unprocessed_answer_log_prob/_sum": -71.2421875, "unprocessed_thought_kl/_first_quartile": 4.1443854570388794e-08, "unprocessed_thought_kl/_last_quartile": 0.09256799705326557, "unprocessed_thought_kl/_max": 12.253662109375, "unprocessed_thought_kl/_mean": 0.23470306396484375, "unprocessed_thought_kl/_median": 0.0008019208908081055, "unprocessed_thought_kl/_min": -3.95025634765625, "unprocessed_thought_kl/_sum": 1021.515625 }, { "answer_log_prob_mean": -0.4741973876953125, "answer_log_prob_min": -13.055419921875, "completion_length": 7145.287109375, "epoch": 0.093836026331538, "grad_norm": 0.8350530765164518, "kl": 0.003049790859222412, "kl_reward": -1.116918169427663, "kl_reward_no_entropy": -1.947355025447905, "kl_scores_no_entropy": 8.3232421875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.515625, "no_entropy_reasoning_kl_max": 16.15771484375, "no_entropy_reasoning_kl_mean": 0.48754119873046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0003238564822822809, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.42248964309692383, "no_entropy_unprocessed_thought_kl/_max": 16.15771484375, "no_entropy_unprocessed_thought_kl/_mean": 0.48754119873046875, "no_entropy_unprocessed_thought_kl/_median": 0.030604492872953415, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 6143.21875, "processed_kl_no_entropy": 0.48748016357421875, "reasoning_kl_max": 14.012451171875, "reasoning_kl_mean": 0.23218154907226562, "reward": -1.8484134674072266, "reward_std": 0.28681135177612305, "rewards/TeacherKLBasedReward": -1.8484134674072266, "solution_log_prob_reward": -0.6047515838872641, "step": 98, "thought_kl_scores": 7.1221923828125, "thought_processed_kl": 0.23211288452148438, "total_teacher_likelihood_reward": -2.237294743768871, "total_tl_reward_no_entropy": -3.0677316039800644, "unprocessed_answer_log_prob/_first_quartile": -0.14719271659851074, "unprocessed_answer_log_prob/_last_quartile": -9.988434612751007e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4741973876953125, "unprocessed_answer_log_prob/_median": -0.0007010810077190399, "unprocessed_answer_log_prob/_min": -13.055419921875, "unprocessed_answer_log_prob/_sum": -251.4140625, "unprocessed_thought_kl/_first_quartile": 2.1862797439098358e-07, "unprocessed_thought_kl/_last_quartile": 0.14885377883911133, "unprocessed_thought_kl/_max": 14.012451171875, "unprocessed_thought_kl/_mean": 0.23218154907226562, "unprocessed_thought_kl/_median": 0.002172885462641716, "unprocessed_thought_kl/_min": -5.27392578125, "unprocessed_thought_kl/_sum": 2843.5 }, { "answer_log_prob_mean": -0.2362518310546875, "answer_log_prob_min": -11.62841796875, "completion_length": 5091.1826171875, "epoch": 0.0947935368043088, "grad_norm": 0.2830582219996727, "kl": 0.0021591782569885254, "kl_reward": -1.5791509482078254, "kl_reward_no_entropy": -1.6537380432710052, "kl_scores_no_entropy": 6.216796875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.994873046875, "no_entropy_reasoning_kl_mean": 0.43129730224609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.0021263733506203e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.24173355102539062, "no_entropy_unprocessed_thought_kl/_max": 11.994873046875, "no_entropy_unprocessed_thought_kl/_mean": 0.43129730224609375, "no_entropy_unprocessed_thought_kl/_median": 0.003956664353609085, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 350.296875, "processed_kl_no_entropy": 0.43331146240234375, "reasoning_kl_max": 11.87451171875, "reasoning_kl_mean": 0.4076385498046875, "reward": -1.730265736579895, "reward_std": 0.23911260068416595, "rewards/TeacherKLBasedReward": -1.730265736579895, "solution_log_prob_reward": -0.35253601567819715, "step": 99, "thought_kl_scores": 6.145751953125, "thought_processed_kl": 0.409423828125, "total_teacher_likelihood_reward": -1.9316869392059743, "total_tl_reward_no_entropy": -2.006274034269154, "unprocessed_answer_log_prob/_first_quartile": -0.019685920560732484, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2362518310546875, "unprocessed_answer_log_prob/_median": -2.275407314300537e-05, "unprocessed_answer_log_prob/_min": -11.62841796875, "unprocessed_answer_log_prob/_sum": -79.1962890625, "unprocessed_thought_kl/_first_quartile": 9.863870218396187e-06, "unprocessed_thought_kl/_last_quartile": 0.22743749618530273, "unprocessed_thought_kl/_max": 11.87451171875, "unprocessed_thought_kl/_mean": 0.4076385498046875, "unprocessed_thought_kl/_median": 0.0037841256707906723, "unprocessed_thought_kl/_min": -1.43273401632905, "unprocessed_thought_kl/_sum": 327.90625 }, { "answer_log_prob_mean": -0.16897964477539062, "answer_log_prob_min": -11.089111328125, "completion_length": 4800.17578125, "epoch": 0.09575104727707959, "grad_norm": 0.26975816015445125, "kl": 0.0014168322086334229, "kl_reward": -1.0023115426301956, "kl_reward_no_entropy": -1.4268997055478394, "kl_scores_no_entropy": 7.893798828125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.1171875, "no_entropy_reasoning_kl_max": 15.46875, "no_entropy_reasoning_kl_mean": 0.32094573974609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.029126673936844e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10783594846725464, "no_entropy_unprocessed_thought_kl/_max": 15.46875, "no_entropy_unprocessed_thought_kl/_mean": 0.32094573974609375, "no_entropy_unprocessed_thought_kl/_median": 0.0005873963236808777, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2231.78125, "processed_kl_no_entropy": 0.321075439453125, "reasoning_kl_max": 13.6689453125, "reasoning_kl_mean": 0.19741439819335938, "reward": -1.7339184284210205, "reward_std": 0.23996591567993164, "rewards/TeacherKLBasedReward": -1.7339184284210205, "solution_log_prob_reward": -0.2798707551555708, "step": 100, "thought_kl_scores": 6.9329833984375, "thought_processed_kl": 0.19734573364257812, "total_teacher_likelihood_reward": -1.3993697948753834, "total_tl_reward_no_entropy": -1.8239579480141401, "unprocessed_answer_log_prob/_first_quartile": -7.665110751986504e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.16897964477539062, "unprocessed_answer_log_prob/_median": -2.3283064365386963e-08, "unprocessed_answer_log_prob/_min": -11.089111328125, "unprocessed_answer_log_prob/_sum": -101.236328125, "unprocessed_thought_kl/_first_quartile": 6.51925802230835e-09, "unprocessed_thought_kl/_last_quartile": 0.03966079652309418, "unprocessed_thought_kl/_max": 13.6689453125, "unprocessed_thought_kl/_mean": 0.19741439819335938, "unprocessed_thought_kl/_median": 7.401872426271439e-05, "unprocessed_thought_kl/_min": -4.732177734375, "unprocessed_thought_kl/_sum": 1280.921875 }, { "answer_log_prob_mean": -0.1330718994140625, "answer_log_prob_min": -6.851806640625, "completion_length": 3963.861328125, "epoch": 0.09670855774985039, "grad_norm": 0.3027723112914477, "kl": 0.0016226768493652344, "kl_reward": -1.183957205619663, "kl_reward_no_entropy": -1.4567907666787505, "kl_scores_no_entropy": 6.0391845703125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.709716796875, "no_entropy_reasoning_kl_mean": 0.368499755859375, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.132759153842926e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.17977619171142578, "no_entropy_unprocessed_thought_kl/_max": 11.709716796875, "no_entropy_unprocessed_thought_kl/_mean": 0.368499755859375, "no_entropy_unprocessed_thought_kl/_median": 0.002645060420036316, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 688.36328125, "processed_kl_no_entropy": 0.3684844970703125, "reasoning_kl_max": 11.0556640625, "reasoning_kl_mean": 0.28409576416015625, "reward": -1.7828785181045532, "reward_std": 0.18598753213882446, "rewards/TeacherKLBasedReward": -1.7828785181045532, "solution_log_prob_reward": -0.20158996479585767, "step": 101, "thought_kl_scores": 5.66943359375, "thought_processed_kl": 0.284088134765625, "total_teacher_likelihood_reward": -1.3855471704155207, "total_tl_reward_no_entropy": -1.6583807319402695, "unprocessed_answer_log_prob/_first_quartile": -0.001757921651005745, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.1330718994140625, "unprocessed_answer_log_prob/_median": -2.2640451788902283e-06, "unprocessed_answer_log_prob/_min": -6.851806640625, "unprocessed_answer_log_prob/_sum": -39.8173828125, "unprocessed_thought_kl/_first_quartile": 9.476207196712494e-08, "unprocessed_thought_kl/_last_quartile": 0.11354541778564453, "unprocessed_thought_kl/_max": 11.0556640625, "unprocessed_thought_kl/_mean": 0.28409576416015625, "unprocessed_thought_kl/_median": 0.0011468417942523956, "unprocessed_thought_kl/_min": -3.071624755859375, "unprocessed_thought_kl/_sum": 495.57421875 }, { "answer_log_prob_mean": -0.2682533264160156, "answer_log_prob_min": -17.66650390625, "completion_length": 5008.173828125, "epoch": 0.09766606822262118, "grad_norm": 0.24624426472356675, "kl": 0.0017092227935791016, "kl_reward": -1.2619280871003866, "kl_reward_no_entropy": -1.7134057553485036, "kl_scores_no_entropy": 9.68798828125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.046875, "no_entropy_reasoning_kl_max": 18.9970703125, "no_entropy_reasoning_kl_mean": 0.38116455078125, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.199511259794235e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.22219718247652054, "no_entropy_unprocessed_thought_kl/_max": 18.9970703125, "no_entropy_unprocessed_thought_kl/_mean": 0.38116455078125, "no_entropy_unprocessed_thought_kl/_median": 0.014134888537228107, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2962.3125, "processed_kl_no_entropy": 0.38120269775390625, "reasoning_kl_max": 18.04541015625, "reasoning_kl_mean": 0.2401885986328125, "reward": -1.629777431488037, "reward_std": 0.19675558805465698, "rewards/TeacherKLBasedReward": -1.629777431488037, "solution_log_prob_reward": -0.44491836009547114, "step": 102, "thought_kl_scores": 9.144775390625, "thought_processed_kl": 0.2401123046875, "total_teacher_likelihood_reward": -1.7537214453332126, "total_tl_reward_no_entropy": -2.2051991084590554, "unprocessed_answer_log_prob/_first_quartile": -0.02468837983906269, "unprocessed_answer_log_prob/_last_quartile": -4.1443854570388794e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2682533264160156, "unprocessed_answer_log_prob/_median": -4.3344683945178986e-05, "unprocessed_answer_log_prob/_min": -17.66650390625, "unprocessed_answer_log_prob/_sum": -151.6015625, "unprocessed_thought_kl/_first_quartile": 3.005843609571457e-07, "unprocessed_thought_kl/_last_quartile": 0.08935848623514175, "unprocessed_thought_kl/_max": 18.04541015625, "unprocessed_thought_kl/_mean": 0.2401885986328125, "unprocessed_thought_kl/_median": 0.001324295997619629, "unprocessed_thought_kl/_min": -4.62109375, "unprocessed_thought_kl/_sum": 1651.21875 }, { "answer_log_prob_mean": -0.10097122192382812, "answer_log_prob_min": -9.239013671875, "completion_length": 6223.0703125, "epoch": 0.09862357869539198, "grad_norm": 1.5765653928898677, "kl": 0.0035607218742370605, "kl_reward": -1.1822057920508087, "kl_reward_no_entropy": -1.2573293880559504, "kl_scores_no_entropy": 7.0111083984375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 13.753173828125, "no_entropy_reasoning_kl_mean": 0.28157806396484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.725290298461914e-09, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.03378117084503174, "no_entropy_unprocessed_thought_kl/_max": 13.753173828125, "no_entropy_unprocessed_thought_kl/_mean": 0.28157806396484375, "no_entropy_unprocessed_thought_kl/_median": 4.069507122039795e-05, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 389.03125, "processed_kl_no_entropy": 0.2814483642578125, "reasoning_kl_max": 13.579833984375, "reasoning_kl_mean": 0.258270263671875, "reward": -1.8427104949951172, "reward_std": 0.26327720284461975, "rewards/TeacherKLBasedReward": -1.8427104949951172, "solution_log_prob_reward": -0.1933613553410396, "step": 103, "thought_kl_scores": 6.913330078125, "thought_processed_kl": 0.25786590576171875, "total_teacher_likelihood_reward": -1.375567140057683, "total_tl_reward_no_entropy": -1.4506907351315022, "unprocessed_answer_log_prob/_first_quartile": -2.405373379588127e-06, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.10097122192382812, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -9.239013671875, "unprocessed_answer_log_prob/_sum": -54.796875, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.02388930320739746, "unprocessed_thought_kl/_max": 13.579833984375, "unprocessed_thought_kl/_mean": 0.258270263671875, "unprocessed_thought_kl/_median": 1.2291595339775085e-05, "unprocessed_thought_kl/_min": -1.513146162033081, "unprocessed_thought_kl/_sum": 346.75 }, { "answer_log_prob_mean": -0.4114532470703125, "answer_log_prob_min": -12.5810546875, "completion_length": 8867.2861328125, "epoch": 0.09958108916816277, "grad_norm": 0.2036901844877722, "kl": 0.0021796226501464844, "kl_reward": -1.179736622609198, "kl_reward_no_entropy": -1.8913622964173555, "kl_scores_no_entropy": 7.072998046875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.015625, "no_entropy_reasoning_kl_max": 13.65576171875, "no_entropy_reasoning_kl_mean": 0.493896484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00018128077499568462, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4226837158203125, "no_entropy_unprocessed_thought_kl/_max": 13.65576171875, "no_entropy_unprocessed_thought_kl/_mean": 0.493896484375, "no_entropy_unprocessed_thought_kl/_median": 0.027715206146240234, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3462.03125, "processed_kl_no_entropy": 0.4940185546875, "reasoning_kl_max": 12.375244140625, "reasoning_kl_mean": 0.26949310302734375, "reward": -1.8171875476837158, "reward_std": 0.2745671272277832, "rewards/TeacherKLBasedReward": -1.8171875476837158, "solution_log_prob_reward": -0.5372637913096696, "step": 104, "thought_kl_scores": 6.32177734375, "thought_processed_kl": 0.2694549560546875, "total_teacher_likelihood_reward": -1.7326254146173596, "total_tl_reward_no_entropy": -2.4442510791122913, "unprocessed_answer_log_prob/_first_quartile": -0.08148694038391113, "unprocessed_answer_log_prob/_last_quartile": -3.632158041000366e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4114532470703125, "unprocessed_answer_log_prob/_median": -0.00021228846162557602, "unprocessed_answer_log_prob/_min": -12.5810546875, "unprocessed_answer_log_prob/_sum": -197.4609375, "unprocessed_thought_kl/_first_quartile": 2.1813903003931046e-06, "unprocessed_thought_kl/_last_quartile": 0.1785430908203125, "unprocessed_thought_kl/_max": 12.375244140625, "unprocessed_thought_kl/_mean": 0.26949310302734375, "unprocessed_thought_kl/_median": 0.0034769102931022644, "unprocessed_thought_kl/_min": -4.8634033203125, "unprocessed_thought_kl/_sum": 1761.140625 }, { "answer_log_prob_mean": -0.2069091796875, "answer_log_prob_min": -11.08935546875, "completion_length": 5416.7890625, "epoch": 0.10053859964093358, "grad_norm": 0.8515045794631846, "kl": 0.0029563307762145996, "kl_reward": -1.0528243910521269, "kl_reward_no_entropy": -1.4548516757786274, "kl_scores_no_entropy": 7.217529296875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.390625, "no_entropy_reasoning_kl_max": 14.091064453125, "no_entropy_reasoning_kl_mean": 0.3440399169921875, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.9744038581848145e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.14350648410618305, "no_entropy_unprocessed_thought_kl/_max": 14.091064453125, "no_entropy_unprocessed_thought_kl/_mean": 0.3440399169921875, "no_entropy_unprocessed_thought_kl/_median": 0.0011483384296298027, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2791.890625, "processed_kl_no_entropy": 0.3441314697265625, "reasoning_kl_max": 12.891845703125, "reasoning_kl_mean": 0.22202301025390625, "reward": -1.6683796644210815, "reward_std": 0.23964843153953552, "rewards/TeacherKLBasedReward": -1.6683796644210815, "solution_log_prob_reward": -0.3178027317626402, "step": 105, "thought_kl_scores": 6.5555419921875, "thought_processed_kl": 0.2220458984375, "total_teacher_likelihood_reward": -1.7612521229311824, "total_tl_reward_no_entropy": -2.1632794057950377, "unprocessed_answer_log_prob/_first_quartile": -0.004653967916965485, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2069091796875, "unprocessed_answer_log_prob/_median": -8.298084139823914e-07, "unprocessed_answer_log_prob/_min": -11.08935546875, "unprocessed_answer_log_prob/_sum": -123.185546875, "unprocessed_thought_kl/_first_quartile": 4.889443516731262e-09, "unprocessed_thought_kl/_last_quartile": 0.06467620749026537, "unprocessed_thought_kl/_max": 12.891845703125, "unprocessed_thought_kl/_mean": 0.22202301025390625, "unprocessed_thought_kl/_median": 0.00018803030252456665, "unprocessed_thought_kl/_min": -4.29443359375, "unprocessed_thought_kl/_sum": 1558.96875 }, { "answer_log_prob_mean": -0.39111328125, "answer_log_prob_min": -11.368408203125, "completion_length": 8715.2333984375, "epoch": 0.10149611011370437, "grad_norm": 170.66538511366434, "kl": 0.10807919502258301, "kl_reward": -1.234948415774852, "kl_reward_no_entropy": -2.0446014404296875, "kl_scores_no_entropy": 7.8916015625, "learning_rate": 1e-06, "loss": 0.0043, "match_reward": -0.4140625, "no_entropy_reasoning_kl_max": 15.2587890625, "no_entropy_reasoning_kl_mean": 0.5289459228515625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.0003262599930167198, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.4952239990234375, "no_entropy_unprocessed_thought_kl/_max": 15.2587890625, "no_entropy_unprocessed_thought_kl/_mean": 0.5289459228515625, "no_entropy_unprocessed_thought_kl/_median": 0.03809070587158203, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 5608.78125, "processed_kl_no_entropy": 0.5289764404296875, "reasoning_kl_max": 13.59765625, "reasoning_kl_mean": 0.27567291259765625, "reward": -2.0517051219940186, "reward_std": 0.303366482257843, "rewards/TeacherKLBasedReward": -2.0517051219940186, "solution_log_prob_reward": -0.5047973596956581, "step": 106, "thought_kl_scores": 6.9384765625, "thought_processed_kl": 0.2757568359375, "total_teacher_likelihood_reward": -2.1538082705810666, "total_tl_reward_no_entropy": -2.9634612910449505, "unprocessed_answer_log_prob/_first_quartile": -0.0635218620300293, "unprocessed_answer_log_prob/_last_quartile": -3.1013041734695435e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.39111328125, "unprocessed_answer_log_prob/_median": -0.00023283809423446655, "unprocessed_answer_log_prob/_min": -11.368408203125, "unprocessed_answer_log_prob/_sum": -209.44140625, "unprocessed_thought_kl/_first_quartile": 1.4759134501218796e-06, "unprocessed_thought_kl/_last_quartile": 0.20596694946289062, "unprocessed_thought_kl/_max": 13.59765625, "unprocessed_thought_kl/_mean": 0.27567291259765625, "unprocessed_thought_kl/_median": 0.004224210977554321, "unprocessed_thought_kl/_min": -5.8934326171875, "unprocessed_thought_kl/_sum": 2723.53125 }, { "answer_log_prob_mean": -0.18038177490234375, "answer_log_prob_min": -7.754638671875, "completion_length": 5455.4541015625, "epoch": 0.10245362058647517, "grad_norm": 0.7527214954222508, "kl": 0.0026355385780334473, "kl_reward": -1.2261446942575276, "kl_reward_no_entropy": -1.5643816916272044, "kl_scores_no_entropy": 6.581787109375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0390625, "no_entropy_reasoning_kl_max": 12.77685546875, "no_entropy_reasoning_kl_mean": 0.3936920166015625, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.773811765015125e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.24620985984802246, "no_entropy_unprocessed_thought_kl/_max": 12.77685546875, "no_entropy_unprocessed_thought_kl/_mean": 0.3936920166015625, "no_entropy_unprocessed_thought_kl/_median": 0.013580095022916794, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1783.0546875, "processed_kl_no_entropy": 0.394500732421875, "reasoning_kl_max": 12.248291015625, "reasoning_kl_mean": 0.28623199462890625, "reward": -1.6808451414108276, "reward_std": 0.22713851928710938, "rewards/TeacherKLBasedReward": -1.6808451414108276, "solution_log_prob_reward": -0.257928159320727, "step": 107, "thought_kl_scores": 6.264404296875, "thought_processed_kl": 0.28614044189453125, "total_teacher_likelihood_reward": -1.5231353463605046, "total_tl_reward_no_entropy": -1.8613723423331976, "unprocessed_answer_log_prob/_first_quartile": -0.005495788063853979, "unprocessed_answer_log_prob/_last_quartile": -1.862645149230957e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.18038177490234375, "unprocessed_answer_log_prob/_median": -3.4440308809280396e-06, "unprocessed_answer_log_prob/_min": -7.754638671875, "unprocessed_answer_log_prob/_sum": -87.61572265625, "unprocessed_thought_kl/_first_quartile": 3.4319236874580383e-07, "unprocessed_thought_kl/_last_quartile": 0.12211227416992188, "unprocessed_thought_kl/_max": 12.248291015625, "unprocessed_thought_kl/_mean": 0.28623199462890625, "unprocessed_thought_kl/_median": 0.0017518103122711182, "unprocessed_thought_kl/_min": -2.658069610595703, "unprocessed_thought_kl/_sum": 1032.765625 }, { "answer_log_prob_mean": -0.3048248291015625, "answer_log_prob_min": -12.54052734375, "completion_length": 5919.4033203125, "epoch": 0.10341113105924596, "grad_norm": 9.433892441528187, "kl": 0.007802128791809082, "kl_reward": -1.3046658332459629, "kl_reward_no_entropy": -1.7356658903881907, "kl_scores_no_entropy": 6.3857421875, "learning_rate": 1e-06, "loss": 0.0003, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.315673828125, "no_entropy_reasoning_kl_mean": 0.4553985595703125, "no_entropy_unprocessed_thought_kl/_first_quartile": 4.6170549467206e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.32098388671875, "no_entropy_unprocessed_thought_kl/_max": 12.315673828125, "no_entropy_unprocessed_thought_kl/_mean": 0.4553985595703125, "no_entropy_unprocessed_thought_kl/_median": 0.017241641879081726, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1049.09375, "processed_kl_no_entropy": 0.45526885986328125, "reasoning_kl_max": 11.60791015625, "reasoning_kl_mean": 0.31880950927734375, "reward": -1.9162555932998657, "reward_std": 0.2361510992050171, "rewards/TeacherKLBasedReward": -1.9162555932998657, "solution_log_prob_reward": -0.4302301013376564, "step": 108, "thought_kl_scores": 5.9639892578125, "thought_processed_kl": 0.31877899169921875, "total_teacher_likelihood_reward": -1.7348959390074015, "total_tl_reward_no_entropy": -2.165895991027355, "unprocessed_answer_log_prob/_first_quartile": -0.018545506754890084, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3048248291015625, "unprocessed_answer_log_prob/_median": -1.196935772895813e-05, "unprocessed_answer_log_prob/_min": -12.54052734375, "unprocessed_answer_log_prob/_sum": -108.7265625, "unprocessed_thought_kl/_first_quartile": 3.019813448190689e-07, "unprocessed_thought_kl/_last_quartile": 0.17877769470214844, "unprocessed_thought_kl/_max": 11.60791015625, "unprocessed_thought_kl/_mean": 0.31880950927734375, "unprocessed_thought_kl/_median": 0.002879660576581955, "unprocessed_thought_kl/_min": -3.50396728515625, "unprocessed_thought_kl/_sum": 665.796875 }, { "answer_log_prob_mean": -0.30562591552734375, "answer_log_prob_min": -11.233642578125, "completion_length": 8475.3310546875, "epoch": 0.10436864153201676, "grad_norm": 0.5365464837364284, "kl": 0.0034295320510864258, "kl_reward": -1.2896777270361781, "kl_reward_no_entropy": -2.049497678875923, "kl_scores_no_entropy": 8.111328125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.0625, "no_entropy_reasoning_kl_max": 15.69970703125, "no_entropy_reasoning_kl_mean": 0.5261688232421875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00010278751142323017, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.46309661865234375, "no_entropy_unprocessed_thought_kl/_max": 15.69970703125, "no_entropy_unprocessed_thought_kl/_mean": 0.5261688232421875, "no_entropy_unprocessed_thought_kl/_median": 0.027769804000854492, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4476.5625, "processed_kl_no_entropy": 0.52606201171875, "reasoning_kl_max": 14.076904296875, "reasoning_kl_mean": 0.28912353515625, "reward": -1.8069674968719482, "reward_std": 0.28566548228263855, "rewards/TeacherKLBasedReward": -1.8069674968719482, "solution_log_prob_reward": -0.4179623411037028, "step": 109, "thought_kl_scores": 7.1822509765625, "thought_processed_kl": 0.28912353515625, "total_teacher_likelihood_reward": -1.7701400639489293, "total_tl_reward_no_entropy": -2.5299600195139647, "unprocessed_answer_log_prob/_first_quartile": -0.02799248695373535, "unprocessed_answer_log_prob/_last_quartile": -4.0279701352119446e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.30562591552734375, "unprocessed_answer_log_prob/_median": -5.917996168136597e-05, "unprocessed_answer_log_prob/_min": -11.233642578125, "unprocessed_answer_log_prob/_sum": -196.796875, "unprocessed_thought_kl/_first_quartile": 3.3853575587272644e-07, "unprocessed_thought_kl/_last_quartile": 0.19415760040283203, "unprocessed_thought_kl/_max": 14.076904296875, "unprocessed_thought_kl/_mean": 0.28912353515625, "unprocessed_thought_kl/_median": 0.0025459211319684982, "unprocessed_thought_kl/_min": -5.2147216796875, "unprocessed_thought_kl/_sum": 2373.875 }, { "answer_log_prob_mean": -0.13448715209960938, "answer_log_prob_min": -12.324951171875, "completion_length": 3966.029296875, "epoch": 0.10532615200478755, "grad_norm": 0.2632379353933544, "kl": 0.002051055431365967, "kl_reward": -1.0521578858606517, "kl_reward_no_entropy": -1.2652981425635517, "kl_scores_no_entropy": 5.5006103515625, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 10.686279296875, "no_entropy_reasoning_kl_mean": 0.31490325927734375, "no_entropy_unprocessed_thought_kl/_first_quartile": 3.096647560596466e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10088133811950684, "no_entropy_unprocessed_thought_kl/_max": 10.686279296875, "no_entropy_unprocessed_thought_kl/_mean": 0.31490325927734375, "no_entropy_unprocessed_thought_kl/_median": 0.0007537966594099998, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 516.6796875, "processed_kl_no_entropy": 0.31488037109375, "reasoning_kl_max": 10.031494140625, "reasoning_kl_mean": 0.25040435791015625, "reward": -1.6838016510009766, "reward_std": 0.19624723494052887, "rewards/TeacherKLBasedReward": -1.6838016510009766, "solution_log_prob_reward": -0.2577366599580273, "step": 110, "thought_kl_scores": 5.1417236328125, "thought_processed_kl": 0.250457763671875, "total_teacher_likelihood_reward": -1.3098945431411266, "total_tl_reward_no_entropy": -1.5230348035693169, "unprocessed_answer_log_prob/_first_quartile": -9.480351582169533e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.13448715209960938, "unprocessed_answer_log_prob/_median": -9.313225746154785e-10, "unprocessed_answer_log_prob/_min": -12.324951171875, "unprocessed_answer_log_prob/_sum": -62.693359375, "unprocessed_thought_kl/_first_quartile": 8.381903171539307e-09, "unprocessed_thought_kl/_last_quartile": 0.06644713878631592, "unprocessed_thought_kl/_max": 10.031494140625, "unprocessed_thought_kl/_mean": 0.25040435791015625, "unprocessed_thought_kl/_median": 0.0003691241145133972, "unprocessed_thought_kl/_min": -3.0007171630859375, "unprocessed_thought_kl/_sum": 400.0 }, { "answer_log_prob_mean": -0.3413238525390625, "answer_log_prob_min": -10.076904296875, "completion_length": 4092.3740234375, "epoch": 0.10628366247755835, "grad_norm": 25.512935400896787, "kl": 0.048678457736968994, "kl_reward": -1.2226016195490956, "kl_reward_no_entropy": -1.6467572003602982, "kl_scores_no_entropy": 5.8253173828125, "learning_rate": 1e-06, "loss": 0.0019, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.213623046875, "no_entropy_reasoning_kl_mean": 0.4367828369140625, "no_entropy_unprocessed_thought_kl/_first_quartile": 4.2721396312117577e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.28671836853027344, "no_entropy_unprocessed_thought_kl/_max": 11.213623046875, "no_entropy_unprocessed_thought_kl/_mean": 0.4367828369140625, "no_entropy_unprocessed_thought_kl/_median": 0.012645132839679718, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 856.2265625, "processed_kl_no_entropy": 0.436614990234375, "reasoning_kl_max": 10.3037109375, "reasoning_kl_mean": 0.30449676513671875, "reward": -1.628671407699585, "reward_std": 0.20720821619033813, "rewards/TeacherKLBasedReward": -1.628671407699585, "solution_log_prob_reward": -0.44209289248101413, "step": 111, "thought_kl_scores": 5.30322265625, "thought_processed_kl": 0.30462646484375, "total_teacher_likelihood_reward": -1.664694513194263, "total_tl_reward_no_entropy": -2.088850097730756, "unprocessed_answer_log_prob/_first_quartile": -0.06284034252166748, "unprocessed_answer_log_prob/_last_quartile": -6.277114152908325e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3413238525390625, "unprocessed_answer_log_prob/_median": -0.0008510518819093704, "unprocessed_answer_log_prob/_min": -10.076904296875, "unprocessed_answer_log_prob/_sum": -112.943359375, "unprocessed_thought_kl/_first_quartile": 2.547167241573334e-07, "unprocessed_thought_kl/_last_quartile": 0.14913272857666016, "unprocessed_thought_kl/_max": 10.3037109375, "unprocessed_thought_kl/_mean": 0.30449676513671875, "unprocessed_thought_kl/_median": 0.0016046371310949326, "unprocessed_thought_kl/_min": -3.49322509765625, "unprocessed_thought_kl/_sum": 524.78125 }, { "answer_log_prob_mean": -0.40228271484375, "answer_log_prob_min": -14.752197265625, "completion_length": 5770.3408203125, "epoch": 0.10724117295032914, "grad_norm": 0.2565441127729312, "kl": 0.0023849010467529297, "kl_reward": -1.2669094842858613, "kl_reward_no_entropy": -1.7867285059764981, "kl_scores_no_entropy": 8.19580078125, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.3125, "no_entropy_reasoning_kl_max": 15.9541015625, "no_entropy_reasoning_kl_mean": 0.43603515625, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.546344719827175e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2732973098754883, "no_entropy_unprocessed_thought_kl/_max": 15.9541015625, "no_entropy_unprocessed_thought_kl/_mean": 0.43603515625, "no_entropy_unprocessed_thought_kl/_median": 0.00838516652584076, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3689.46875, "processed_kl_no_entropy": 0.4361724853515625, "reasoning_kl_max": 13.852783203125, "reasoning_kl_mean": 0.28377532958984375, "reward": -1.5884602069854736, "reward_std": 0.2349155843257904, "rewards/TeacherKLBasedReward": -1.5884602069854736, "solution_log_prob_reward": -0.5498046847060323, "step": 112, "thought_kl_scores": 7.06884765625, "thought_processed_kl": 0.28375244140625, "total_teacher_likelihood_reward": -2.1292141675949097, "total_tl_reward_no_entropy": -2.649033189751208, "unprocessed_answer_log_prob/_first_quartile": -0.02102452516555786, "unprocessed_answer_log_prob/_last_quartile": -8.847564458847046e-09, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.40228271484375, "unprocessed_answer_log_prob/_median": -1.4240853488445282e-05, "unprocessed_answer_log_prob/_min": -14.752197265625, "unprocessed_answer_log_prob/_sum": -236.80078125, "unprocessed_thought_kl/_first_quartile": 4.470348358154297e-08, "unprocessed_thought_kl/_last_quartile": 0.12793350219726562, "unprocessed_thought_kl/_max": 13.852783203125, "unprocessed_thought_kl/_mean": 0.28377532958984375, "unprocessed_thought_kl/_median": 0.0010284557938575745, "unprocessed_thought_kl/_min": -4.3438720703125, "unprocessed_thought_kl/_sum": 1954.375 }, { "answer_log_prob_mean": -0.29091644287109375, "answer_log_prob_min": -13.38623046875, "completion_length": 6131.6044921875, "epoch": 0.10819868342309993, "grad_norm": 0.24866940037440668, "kl": 0.002917766571044922, "kl_reward": -1.2786785839125514, "kl_reward_no_entropy": -1.8282202100381255, "kl_scores_no_entropy": 6.7099609375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.1328125, "no_entropy_reasoning_kl_max": 12.942626953125, "no_entropy_reasoning_kl_mean": 0.47998046875, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00014941254630684853, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3922080993652344, "no_entropy_unprocessed_thought_kl/_max": 12.942626953125, "no_entropy_unprocessed_thought_kl/_mean": 0.47998046875, "no_entropy_unprocessed_thought_kl/_median": 0.02273237705230713, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 3435.796875, "processed_kl_no_entropy": 0.47998046875, "reasoning_kl_max": 12.21337890625, "reasoning_kl_mean": 0.3040924072265625, "reward": -1.703942060470581, "reward_std": 0.264788419008255, "rewards/TeacherKLBasedReward": -1.703942060470581, "solution_log_prob_reward": -0.42477874481119215, "step": 113, "thought_kl_scores": 6.25830078125, "thought_processed_kl": 0.3040008544921875, "total_teacher_likelihood_reward": -1.8362698247656226, "total_tl_reward_no_entropy": -2.3858114536851645, "unprocessed_answer_log_prob/_first_quartile": -0.031122207641601562, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.29091644287109375, "unprocessed_answer_log_prob/_median": -7.842946797609329e-05, "unprocessed_answer_log_prob/_min": -13.38623046875, "unprocessed_answer_log_prob/_sum": -189.375, "unprocessed_thought_kl/_first_quartile": 2.443697303533554e-05, "unprocessed_thought_kl/_last_quartile": 0.19826889038085938, "unprocessed_thought_kl/_max": 12.21337890625, "unprocessed_thought_kl/_mean": 0.3040924072265625, "unprocessed_thought_kl/_median": 0.004256129264831543, "unprocessed_thought_kl/_min": -3.966827392578125, "unprocessed_thought_kl/_sum": 1682.3984375 }, { "answer_log_prob_mean": -0.2568206787109375, "answer_log_prob_min": -7.69091796875, "completion_length": 4892.7490234375, "epoch": 0.10915619389587074, "grad_norm": 0.25663668372779425, "kl": 0.002834320068359375, "kl_reward": -1.2702767802402377, "kl_reward_no_entropy": -1.5982818445190787, "kl_scores_no_entropy": 6.2723388671875, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 12.127685546875, "no_entropy_reasoning_kl_mean": 0.4114837646484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 2.6995548978447914e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.2451801300048828, "no_entropy_unprocessed_thought_kl/_max": 12.127685546875, "no_entropy_unprocessed_thought_kl/_mean": 0.4114837646484375, "no_entropy_unprocessed_thought_kl/_median": 0.007391706109046936, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1369.6328125, "processed_kl_no_entropy": 0.4114227294921875, "reasoning_kl_max": 11.549560546875, "reasoning_kl_mean": 0.30792999267578125, "reward": -1.6279833316802979, "reward_std": 0.1978306919336319, "rewards/TeacherKLBasedReward": -1.6279833316802979, "solution_log_prob_reward": -0.33372985711321235, "step": 114, "thought_kl_scores": 5.9300537109375, "thought_processed_kl": 0.30800628662109375, "total_teacher_likelihood_reward": -1.604006634093821, "total_tl_reward_no_entropy": -1.9320116993039846, "unprocessed_answer_log_prob/_first_quartile": -0.02447676658630371, "unprocessed_answer_log_prob/_last_quartile": -9.313225746154785e-10, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2568206787109375, "unprocessed_answer_log_prob/_median": -1.3663433492183685e-05, "unprocessed_answer_log_prob/_min": -7.69091796875, "unprocessed_answer_log_prob/_sum": -106.5830078125, "unprocessed_thought_kl/_first_quartile": 5.066394805908203e-07, "unprocessed_thought_kl/_last_quartile": 0.14335298538208008, "unprocessed_thought_kl/_max": 11.549560546875, "unprocessed_thought_kl/_mean": 0.30792999267578125, "unprocessed_thought_kl/_median": 0.0012867338955402374, "unprocessed_thought_kl/_min": -3.5096435546875, "unprocessed_thought_kl/_sum": 859.453125 }, { "answer_log_prob_mean": -0.3574371337890625, "answer_log_prob_min": -16.572265625, "completion_length": 5415.216796875, "epoch": 0.11011370436864153, "grad_norm": 1.4462023425473713, "kl": 0.00542604923248291, "kl_reward": -1.1735307194758207, "kl_reward_no_entropy": -1.8537684557959437, "kl_scores_no_entropy": 8.18115234375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.203125, "no_entropy_reasoning_kl_max": 15.90185546875, "no_entropy_reasoning_kl_mean": 0.4589042663574219, "no_entropy_unprocessed_thought_kl/_first_quartile": 5.86886890232563e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.33169366512447596, "no_entropy_unprocessed_thought_kl/_max": 15.90185546875, "no_entropy_unprocessed_thought_kl/_mean": 0.4589042663574219, "no_entropy_unprocessed_thought_kl/_median": 0.014613212086260319, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4678.0625, "processed_kl_no_entropy": 0.4590721130371094, "reasoning_kl_max": 14.51708984375, "reasoning_kl_mean": 0.24600601196289062, "reward": -1.7962305545806885, "reward_std": 0.2158288061618805, "rewards/TeacherKLBasedReward": -1.7962305545806885, "solution_log_prob_reward": -0.5231597854290158, "step": 115, "thought_kl_scores": 7.38232421875, "thought_processed_kl": 0.24589920043945312, "total_teacher_likelihood_reward": -1.8998155035078526, "total_tl_reward_no_entropy": -2.580053247511387, "unprocessed_answer_log_prob/_first_quartile": -0.018521606922149658, "unprocessed_answer_log_prob/_last_quartile": -5.075708031654358e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3574371337890625, "unprocessed_answer_log_prob/_median": -3.2804906368255615e-05, "unprocessed_answer_log_prob/_min": -16.572265625, "unprocessed_answer_log_prob/_sum": -275.0234375, "unprocessed_thought_kl/_first_quartile": 1.3317912817001343e-07, "unprocessed_thought_kl/_last_quartile": 0.12021065223962069, "unprocessed_thought_kl/_max": 14.51708984375, "unprocessed_thought_kl/_mean": 0.24600601196289062, "unprocessed_thought_kl/_median": 0.000931866466999054, "unprocessed_thought_kl/_min": -5.208984375, "unprocessed_thought_kl/_sum": 2407.5 }, { "answer_log_prob_mean": -0.2350311279296875, "answer_log_prob_min": -10.150146484375, "completion_length": 5522.17578125, "epoch": 0.11107121484141233, "grad_norm": 0.22600987572356854, "kl": 0.003482818603515625, "kl_reward": -1.1341616716235876, "kl_reward_no_entropy": -1.5047021340578794, "kl_scores_no_entropy": 6.82177734375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.03125, "no_entropy_reasoning_kl_max": 13.273193359375, "no_entropy_reasoning_kl_mean": 0.36883544921875, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.650070771574974e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.15288424491882324, "no_entropy_unprocessed_thought_kl/_max": 13.273193359375, "no_entropy_unprocessed_thought_kl/_mean": 0.36883544921875, "no_entropy_unprocessed_thought_kl/_median": 0.001895260065793991, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1530.921875, "processed_kl_no_entropy": 0.36858367919921875, "reasoning_kl_max": 11.799072265625, "reasoning_kl_mean": 0.26006317138671875, "reward": -1.5451099872589111, "reward_std": 0.23667991161346436, "rewards/TeacherKLBasedReward": -1.5451099872589111, "solution_log_prob_reward": -0.3365325884660706, "step": 116, "thought_kl_scores": 6.028564453125, "thought_processed_kl": 0.2599067687988281, "total_teacher_likelihood_reward": -1.5019442606717348, "total_tl_reward_no_entropy": -1.8724847193807364, "unprocessed_answer_log_prob/_first_quartile": -0.004795367363840342, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.2350311279296875, "unprocessed_answer_log_prob/_median": -7.888302206993103e-07, "unprocessed_answer_log_prob/_min": -10.150146484375, "unprocessed_answer_log_prob/_sum": -129.0703125, "unprocessed_thought_kl/_first_quartile": 2.752058207988739e-07, "unprocessed_thought_kl/_last_quartile": 0.08423256874084473, "unprocessed_thought_kl/_max": 11.799072265625, "unprocessed_thought_kl/_mean": 0.26006317138671875, "unprocessed_thought_kl/_median": 0.0006666379049420357, "unprocessed_thought_kl/_min": -3.7545166015625, "unprocessed_thought_kl/_sum": 910.9609375 }, { "answer_log_prob_mean": -0.40478515625, "answer_log_prob_min": -14.90673828125, "completion_length": 5098.939453125, "epoch": 0.11202872531418312, "grad_norm": 0.26502405450703936, "kl": 0.0032253265380859375, "kl_reward": -1.1609628200531006, "kl_reward_no_entropy": -1.5723028499633074, "kl_scores_no_entropy": 7.01318359375, "learning_rate": 1e-06, "loss": 0.0001, "match_reward": -0.2421875, "no_entropy_reasoning_kl_max": 13.64208984375, "no_entropy_reasoning_kl_mean": 0.3876800537109375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.1078082025051117e-06, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.19204139709472656, "no_entropy_unprocessed_thought_kl/_max": 13.64208984375, "no_entropy_unprocessed_thought_kl/_mean": 0.3876800537109375, "no_entropy_unprocessed_thought_kl/_median": 0.00273798406124115, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2972.734375, "processed_kl_no_entropy": 0.38726806640625, "reasoning_kl_max": 12.21484375, "reasoning_kl_mean": 0.26483917236328125, "reward": -1.750619649887085, "reward_std": 0.24088267982006073, "rewards/TeacherKLBasedReward": -1.750619649887085, "solution_log_prob_reward": -0.5538525364827365, "step": 117, "thought_kl_scores": 6.2423095703125, "thought_processed_kl": 0.2644805908203125, "total_teacher_likelihood_reward": -1.9570028595626354, "total_tl_reward_no_entropy": -2.368342890404165, "unprocessed_answer_log_prob/_first_quartile": -0.055110424757003784, "unprocessed_answer_log_prob/_last_quartile": -7.543712854385376e-08, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.40478515625, "unprocessed_answer_log_prob/_median": -9.69497486948967e-05, "unprocessed_answer_log_prob/_min": -14.90673828125, "unprocessed_answer_log_prob/_sum": -149.330078125, "unprocessed_thought_kl/_first_quartile": 1.457519829273224e-07, "unprocessed_thought_kl/_last_quartile": 0.0901479721069336, "unprocessed_thought_kl/_max": 12.21484375, "unprocessed_thought_kl/_mean": 0.26483917236328125, "unprocessed_thought_kl/_median": 0.0006389506161212921, "unprocessed_thought_kl/_min": -3.626953125, "unprocessed_thought_kl/_sum": 1502.234375 }, { "answer_log_prob_mean": -0.3309783935546875, "answer_log_prob_min": -10.238525390625, "completion_length": 8037.810546875, "epoch": 0.11298623578695392, "grad_norm": 0.2523949772370748, "kl": 0.003951549530029297, "kl_reward": -1.197713918518275, "kl_reward_no_entropy": -1.8289233325049281, "kl_scores_no_entropy": 7.064697265625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.2109375, "no_entropy_reasoning_kl_max": 13.65576171875, "no_entropy_reasoning_kl_mean": 0.47308349609375, "no_entropy_unprocessed_thought_kl/_first_quartile": 6.626266986131668e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.3669872283935547, "no_entropy_unprocessed_thought_kl/_max": 13.65576171875, "no_entropy_unprocessed_thought_kl/_mean": 0.47308349609375, "no_entropy_unprocessed_thought_kl/_median": 0.01846182346343994, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 4039.8125, "processed_kl_no_entropy": 0.4730072021484375, "reasoning_kl_max": 12.165771484375, "reasoning_kl_mean": 0.27758026123046875, "reward": -1.7759274244308472, "reward_std": 0.29780739545822144, "rewards/TeacherKLBasedReward": -1.7759274244308472, "solution_log_prob_reward": -0.43336364382412285, "step": 118, "thought_kl_scores": 6.22265625, "thought_processed_kl": 0.2774505615234375, "total_teacher_likelihood_reward": -1.842015067115426, "total_tl_reward_no_entropy": -2.473224484361708, "unprocessed_answer_log_prob/_first_quartile": -0.05856745271012187, "unprocessed_answer_log_prob/_last_quartile": -1.0174699127674103e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.3309783935546875, "unprocessed_answer_log_prob/_median": -0.00023932009935379028, "unprocessed_answer_log_prob/_min": -10.238525390625, "unprocessed_answer_log_prob/_sum": -190.7861328125, "unprocessed_thought_kl/_first_quartile": 1.2456439435482025e-07, "unprocessed_thought_kl/_last_quartile": 0.1605520248413086, "unprocessed_thought_kl/_max": 12.165771484375, "unprocessed_thought_kl/_mean": 0.27758026123046875, "unprocessed_thought_kl/_median": 0.001967594027519226, "unprocessed_thought_kl/_min": -4.47491455078125, "unprocessed_thought_kl/_sum": 1908.546875 }, { "answer_log_prob_mean": -0.4562835693359375, "answer_log_prob_min": -14.5498046875, "completion_length": 6500.1953125, "epoch": 0.11394374625972471, "grad_norm": 0.25029410313652206, "kl": 0.004144906997680664, "kl_reward": -0.9849101155996323, "kl_reward_no_entropy": -1.697639456950128, "kl_scores_no_entropy": 8.380859375, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.53125, "no_entropy_reasoning_kl_max": 16.36962890625, "no_entropy_reasoning_kl_mean": 0.40218353271484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00039726775139570236, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.28433191776275635, "no_entropy_unprocessed_thought_kl/_max": 16.36962890625, "no_entropy_unprocessed_thought_kl/_mean": 0.40218353271484375, "no_entropy_unprocessed_thought_kl/_median": 0.025893118232488632, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 5708.3125, "processed_kl_no_entropy": 0.4021759033203125, "reasoning_kl_max": 14.05859375, "reasoning_kl_mean": 0.18771743774414062, "reward": -1.8597354888916016, "reward_std": 0.26064741611480713, "rewards/TeacherKLBasedReward": -1.8597354888916016, "solution_log_prob_reward": -0.6017816131934524, "step": 119, "thought_kl_scores": 7.1240234375, "thought_processed_kl": 0.18771743774414062, "total_teacher_likelihood_reward": -2.1179417381063104, "total_tl_reward_no_entropy": -2.83067106641829, "unprocessed_answer_log_prob/_first_quartile": -0.10451912879943848, "unprocessed_answer_log_prob/_last_quartile": -1.976964995265007e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.4562835693359375, "unprocessed_answer_log_prob/_median": -0.0013771094381809235, "unprocessed_answer_log_prob/_min": -14.5498046875, "unprocessed_answer_log_prob/_sum": -313.46875, "unprocessed_thought_kl/_first_quartile": 1.3969838619232178e-08, "unprocessed_thought_kl/_last_quartile": 0.08860769867897034, "unprocessed_thought_kl/_max": 14.05859375, "unprocessed_thought_kl/_mean": 0.18771743774414062, "unprocessed_thought_kl/_median": 0.0014114994555711746, "unprocessed_thought_kl/_min": -5.196533203125, "unprocessed_thought_kl/_sum": 2578.75 }, { "answer_log_prob_mean": -0.17163467407226562, "answer_log_prob_min": -10.511962890625, "completion_length": 3975.783203125, "epoch": 0.11490125673249552, "grad_norm": 0.2510196657893443, "kl": 0.0040656328201293945, "kl_reward": -1.039367064833641, "kl_reward_no_entropy": -1.3576565496623516, "kl_scores_no_entropy": 5.7652587890625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 11.189208984375, "no_entropy_reasoning_kl_mean": 0.34066009521484375, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.150183379650116e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.1363844871520996, "no_entropy_unprocessed_thought_kl/_max": 11.189208984375, "no_entropy_unprocessed_thought_kl/_mean": 0.34066009521484375, "no_entropy_unprocessed_thought_kl/_median": 0.0009362520650029182, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 743.359375, "processed_kl_no_entropy": 0.34061431884765625, "reasoning_kl_max": 10.260498046875, "reasoning_kl_mean": 0.2438507080078125, "reward": -1.7174623012542725, "reward_std": 0.19737733900547028, "rewards/TeacherKLBasedReward": -1.7174623012542725, "solution_log_prob_reward": -0.2767543017398566, "step": 120, "thought_kl_scores": 5.2515869140625, "thought_processed_kl": 0.2437591552734375, "total_teacher_likelihood_reward": -1.3161213672719896, "total_tl_reward_no_entropy": -1.6344108562916517, "unprocessed_answer_log_prob/_first_quartile": -0.00015680817887187004, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.17163467407226562, "unprocessed_answer_log_prob/_median": -4.6566128730773926e-09, "unprocessed_answer_log_prob/_min": -10.511962890625, "unprocessed_answer_log_prob/_sum": -69.380859375, "unprocessed_thought_kl/_first_quartile": 1.3271346688270569e-08, "unprocessed_thought_kl/_last_quartile": 0.07243680953979492, "unprocessed_thought_kl/_max": 10.260498046875, "unprocessed_thought_kl/_mean": 0.2438507080078125, "unprocessed_thought_kl/_median": 0.0001954091712832451, "unprocessed_thought_kl/_min": -3.393798828125, "unprocessed_thought_kl/_sum": 521.75 }, { "answer_log_prob_mean": -0.1546173095703125, "answer_log_prob_min": -13.318115234375, "completion_length": 7381.73828125, "epoch": 0.1158587672052663, "grad_norm": 0.25076127407397514, "kl": 0.004145503044128418, "kl_reward": -0.9938580193556845, "kl_reward_no_entropy": -1.4199508521705866, "kl_scores_no_entropy": 7.318603515625, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": 0.0, "no_entropy_reasoning_kl_max": 14.3017578125, "no_entropy_reasoning_kl_mean": 0.33029937744140625, "no_entropy_unprocessed_thought_kl/_first_quartile": 8.475035429000854e-08, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10358333587646484, "no_entropy_unprocessed_thought_kl/_max": 14.3017578125, "no_entropy_unprocessed_thought_kl/_mean": 0.33029937744140625, "no_entropy_unprocessed_thought_kl/_median": 0.00035502854734659195, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1259.5625, "processed_kl_no_entropy": 0.33032989501953125, "reasoning_kl_max": 12.646728515625, "reasoning_kl_mean": 0.2048187255859375, "reward": -1.6387622356414795, "reward_std": 0.2579277753829956, "rewards/TeacherKLBasedReward": -1.6387622356414795, "solution_log_prob_reward": -0.28779845824465156, "step": 121, "thought_kl_scores": 6.4249267578125, "thought_processed_kl": 0.204925537109375, "total_teacher_likelihood_reward": -1.2816564729437232, "total_tl_reward_no_entropy": -1.707749305292964, "unprocessed_answer_log_prob/_first_quartile": -1.445133239030838e-05, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.1546173095703125, "unprocessed_answer_log_prob/_median": -9.313225746154785e-10, "unprocessed_answer_log_prob/_min": -13.318115234375, "unprocessed_answer_log_prob/_sum": -112.32421875, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.033359527587890625, "unprocessed_thought_kl/_max": 12.646728515625, "unprocessed_thought_kl/_mean": 0.2048187255859375, "unprocessed_thought_kl/_median": 1.8737278878688812e-05, "unprocessed_thought_kl/_min": -4.0411376953125, "unprocessed_thought_kl/_sum": 757.640625 }, { "answer_log_prob_mean": -0.54998779296875, "answer_log_prob_min": -15.4912109375, "completion_length": 7829.8427734375, "epoch": 0.11681627767803711, "grad_norm": 0.8469193016754256, "kl": 0.00536191463470459, "kl_reward": -1.1598440427333117, "kl_reward_no_entropy": -2.0659442096948624, "kl_scores_no_entropy": 9.0517578125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.4921875, "no_entropy_reasoning_kl_max": 17.59375, "no_entropy_reasoning_kl_mean": 0.5127105712890625, "no_entropy_unprocessed_thought_kl/_first_quartile": 0.00032986653968691826, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.43169593811035156, "no_entropy_unprocessed_thought_kl/_max": 17.59375, "no_entropy_unprocessed_thought_kl/_mean": 0.5127105712890625, "no_entropy_unprocessed_thought_kl/_median": 0.027358612045645714, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 7219.0, "processed_kl_no_entropy": 0.512725830078125, "reasoning_kl_max": 15.11181640625, "reasoning_kl_mean": 0.23549652099609375, "reward": -1.8326237201690674, "reward_std": 0.2704693675041199, "rewards/TeacherKLBasedReward": -1.8326237201690674, "solution_log_prob_reward": -0.704899898963049, "step": 122, "thought_kl_scores": 7.67724609375, "thought_processed_kl": 0.2354278564453125, "total_teacher_likelihood_reward": -2.356931443326175, "total_tl_reward_no_entropy": -3.263031605631113, "unprocessed_answer_log_prob/_first_quartile": -0.09047198295593262, "unprocessed_answer_log_prob/_last_quartile": -3.7229619920253754e-07, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.54998779296875, "unprocessed_answer_log_prob/_median": -0.0003536846488714218, "unprocessed_answer_log_prob/_min": -15.4912109375, "unprocessed_answer_log_prob/_sum": -302.1640625, "unprocessed_thought_kl/_first_quartile": 6.984919309616089e-09, "unprocessed_thought_kl/_last_quartile": 0.13502979278564453, "unprocessed_thought_kl/_max": 15.11181640625, "unprocessed_thought_kl/_mean": 0.23549652099609375, "unprocessed_thought_kl/_median": 0.0013350965455174446, "unprocessed_thought_kl/_min": -6.512451171875, "unprocessed_thought_kl/_sum": 3273.9375 }, { "answer_log_prob_mean": -0.5678787231445312, "answer_log_prob_min": -9.692626953125, "completion_length": 6752.47265625, "epoch": 0.1177737881508079, "grad_norm": 0.35190325863868016, "kl": 0.00472259521484375, "kl_reward": -1.657565935049206, "kl_reward_no_entropy": -1.9870669734664261, "kl_scores_no_entropy": 6.830810546875, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.203125, "no_entropy_reasoning_kl_max": 13.1357421875, "no_entropy_reasoning_kl_mean": 0.5309982299804688, "no_entropy_unprocessed_thought_kl/_first_quartile": 9.284401312470436e-05, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.416770301759243, "no_entropy_unprocessed_thought_kl/_max": 13.1357421875, "no_entropy_unprocessed_thought_kl/_mean": 0.5309982299804688, "no_entropy_unprocessed_thought_kl/_median": 0.023594983853399754, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 2550.125, "processed_kl_no_entropy": 0.53106689453125, "reasoning_kl_max": 11.89892578125, "reasoning_kl_mean": 0.43353271484375, "reward": -1.6914355754852295, "reward_std": 0.2595997452735901, "rewards/TeacherKLBasedReward": -1.6914355754852295, "solution_log_prob_reward": -0.6648049938958138, "step": 123, "thought_kl_scores": 6.166015625, "thought_processed_kl": 0.4335975646972656, "total_teacher_likelihood_reward": -2.5254959538578987, "total_tl_reward_no_entropy": -2.8549969904124737, "unprocessed_answer_log_prob/_first_quartile": -0.2587471008300781, "unprocessed_answer_log_prob/_last_quartile": -1.5683472156524658e-06, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.5678787231445312, "unprocessed_answer_log_prob/_median": -0.0013647237792611122, "unprocessed_answer_log_prob/_min": -9.692626953125, "unprocessed_answer_log_prob/_sum": -97.818359375, "unprocessed_thought_kl/_first_quartile": 5.970289930701256e-05, "unprocessed_thought_kl/_last_quartile": 0.35413504112511873, "unprocessed_thought_kl/_max": 11.89892578125, "unprocessed_thought_kl/_mean": 0.43353271484375, "unprocessed_thought_kl/_median": 0.020850921981036663, "unprocessed_thought_kl/_min": -3.7400513254106045, "unprocessed_thought_kl/_sum": 1384.53125 }, { "answer_log_prob_mean": -0.07605743408203125, "answer_log_prob_min": -11.485595703125, "completion_length": 4368.41015625, "epoch": 0.1187312986235787, "grad_norm": 11457.169959069213, "kl": 12.067533135414124, "kl_reward": -1.1052403873763978, "kl_reward_no_entropy": -1.4317895350977778, "kl_scores_no_entropy": 7.751708984375, "learning_rate": 1e-06, "loss": 0.4826, "match_reward": -0.03125, "no_entropy_reasoning_kl_max": 15.17626953125, "no_entropy_reasoning_kl_mean": 0.32550048828125, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.4039687812328339e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.10554266069084406, "no_entropy_unprocessed_thought_kl/_max": 15.17626953125, "no_entropy_unprocessed_thought_kl/_mean": 0.32550048828125, "no_entropy_unprocessed_thought_kl/_median": 0.00037637725472450256, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1337.890625, "processed_kl_no_entropy": 0.325592041015625, "reasoning_kl_max": 14.371826171875, "reasoning_kl_mean": 0.22469520568847656, "reward": -1.4985926151275635, "reward_std": 0.2090553343296051, "rewards/TeacherKLBasedReward": -1.4985926151275635, "solution_log_prob_reward": -0.19091338833095506, "step": 124, "thought_kl_scores": 7.298583984375, "thought_processed_kl": 0.2247333526611328, "total_teacher_likelihood_reward": -1.327403775881976, "total_tl_reward_no_entropy": -1.6539529217407107, "unprocessed_answer_log_prob/_first_quartile": -1.6151461750268936e-06, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.07605743408203125, "unprocessed_answer_log_prob/_median": 0.0, "unprocessed_answer_log_prob/_min": -11.485595703125, "unprocessed_answer_log_prob/_sum": -65.296875, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.04699593875557184, "unprocessed_thought_kl/_max": 14.371826171875, "unprocessed_thought_kl/_mean": 0.22469520568847656, "unprocessed_thought_kl/_median": 5.5019743740558624e-05, "unprocessed_thought_kl/_min": -3.89569091796875, "unprocessed_thought_kl/_sum": 804.1875 }, { "answer_log_prob_mean": -0.17917633056640625, "answer_log_prob_min": -8.9091796875, "completion_length": 5009.69140625, "epoch": 0.11968880909634949, "grad_norm": 0.2597859378446038, "kl": 0.004796385765075684, "kl_reward": -1.185811153613031, "kl_reward_no_entropy": -1.5233111549168825, "kl_scores_no_entropy": 7.05126953125, "learning_rate": 1e-06, "loss": 0.0002, "match_reward": -0.015625, "no_entropy_reasoning_kl_max": 13.736328125, "no_entropy_reasoning_kl_mean": 0.3704071044921875, "no_entropy_unprocessed_thought_kl/_first_quartile": 1.7927959561347961e-07, "no_entropy_unprocessed_thought_kl/_last_quartile": 0.12949752807617188, "no_entropy_unprocessed_thought_kl/_max": 13.736328125, "no_entropy_unprocessed_thought_kl/_mean": 0.3704071044921875, "no_entropy_unprocessed_thought_kl/_median": 0.0005892887711524963, "no_entropy_unprocessed_thought_kl/_min": 0.0, "no_entropy_unprocessed_thought_kl/_sum": 1488.171875, "processed_kl_no_entropy": 0.3704071044921875, "reasoning_kl_max": 12.63037109375, "reasoning_kl_mean": 0.2689666748046875, "reward": -1.7310512065887451, "reward_std": 0.23177963495254517, "rewards/TeacherKLBasedReward": -1.7310512065887451, "solution_log_prob_reward": -0.2682681247824803, "step": 125, "thought_kl_scores": 6.44873046875, "thought_processed_kl": 0.2688751220703125, "total_teacher_likelihood_reward": -1.4697042764164507, "total_tl_reward_no_entropy": -1.8072042791172862, "unprocessed_answer_log_prob/_first_quartile": -0.0019635986536741257, "unprocessed_answer_log_prob/_last_quartile": 0.0, "unprocessed_answer_log_prob/_max": 0.0, "unprocessed_answer_log_prob/_mean": -0.17917633056640625, "unprocessed_answer_log_prob/_median": -4.852190613746643e-07, "unprocessed_answer_log_prob/_min": -8.9091796875, "unprocessed_answer_log_prob/_sum": -103.1435546875, "unprocessed_thought_kl/_first_quartile": 0.0, "unprocessed_thought_kl/_last_quartile": 0.06113862991333008, "unprocessed_thought_kl/_max": 12.63037109375, "unprocessed_thought_kl/_mean": 0.2689666748046875, "unprocessed_thought_kl/_median": 7.714331150054932e-05, "unprocessed_thought_kl/_min": -3.48443603515625, "unprocessed_thought_kl/_sum": 883.58203125 } ], "logging_steps": 1, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }