{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009681385599907059, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 161.9166717529297, "epoch": 9.681385599907059e-06, "grad_norm": 10.48554750863998, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0, "reward": 1.6471608877182007, "reward_std": 0.32962578535079956, "rewards/accuracy_reward": 0.5338388681411743, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1549886167049408, "step": 1 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 251.25, "epoch": 1.9362771199814118e-05, "grad_norm": 3.3671917177951087, "kl": 0.000347137451171875, "learning_rate": 9.999999997687322e-07, "loss": 0.0, "reward": 1.4162977933883667, "reward_std": 0.07594326138496399, "rewards/accuracy_reward": 0.2638278007507324, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13580322265625, "step": 2 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 230.70834350585938, "epoch": 2.9044156799721175e-05, "grad_norm": 8.863212114063046, "kl": 0.000499725341796875, "learning_rate": 9.999999990749294e-07, "loss": 0.0, "reward": 1.7035826444625854, "reward_std": 0.27621591091156006, "rewards/accuracy_reward": 0.4490740895271301, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2045084685087204, "step": 3 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 356.8333435058594, "epoch": 3.8725542399628236e-05, "grad_norm": 2.688920912009499, "kl": 0.00040435791015625, "learning_rate": 9.999999979185916e-07, "loss": 0.0, "reward": 1.7487304210662842, "reward_std": 0.5758256316184998, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1153971403837204, "step": 4 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 236.7916717529297, "epoch": 4.840692799953529e-05, "grad_norm": 3.014936636267315, "kl": 0.0003795623779296875, "learning_rate": 9.999999962997183e-07, "loss": 0.0, "reward": 1.8440674543380737, "reward_std": 0.3521568477153778, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1274007260799408, "step": 5 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 297.75, "epoch": 5.808831359944235e-05, "grad_norm": 4.222126846936773, "kl": 0.0004444122314453125, "learning_rate": 9.9999999421831e-07, "loss": 0.0, "reward": 1.716162085533142, "reward_std": 0.5885024666786194, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1494954526424408, "step": 6 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 282.54168701171875, "epoch": 6.776969919934941e-05, "grad_norm": 7.165159103337778, "kl": 0.000659942626953125, "learning_rate": 9.999999916743664e-07, "loss": 0.0, "reward": 1.2527244091033936, "reward_std": 0.7829837799072266, "rewards/accuracy_reward": 0.28663554787635803, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.1160888671875, "step": 7 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 249.2916717529297, "epoch": 7.745108479925647e-05, "grad_norm": 5.737552779885816, "kl": 0.0012969970703125, "learning_rate": 9.999999886678877e-07, "loss": 0.0001, "reward": 1.4524524211883545, "reward_std": 0.30717721581459045, "rewards/accuracy_reward": 0.3124620318412781, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1983235776424408, "step": 8 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 240.875, "epoch": 8.713247039916353e-05, "grad_norm": 5.366318378382097, "kl": 0.00090789794921875, "learning_rate": 9.999999851988738e-07, "loss": 0.0, "reward": 1.5177730321884155, "reward_std": 0.0932733416557312, "rewards/accuracy_reward": 0.21988068521022797, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2562255859375, "step": 9 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 273.41668701171875, "epoch": 9.681385599907059e-05, "grad_norm": 4.090936838464869, "kl": 0.000835418701171875, "learning_rate": 9.999999812673246e-07, "loss": 0.0, "reward": 1.459357738494873, "reward_std": 0.310688853263855, "rewards/accuracy_reward": 0.27057182788848877, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1304524838924408, "step": 10 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 448.29168701171875, "epoch": 0.00010649524159897764, "grad_norm": 2.8587964032952953, "kl": 0.000560760498046875, "learning_rate": 9.999999768732404e-07, "loss": 0.0, "reward": 1.4476261138916016, "reward_std": 0.6397146582603455, "rewards/accuracy_reward": 0.3998112082481384, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1561482846736908, "step": 11 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 188.0416717529297, "epoch": 0.0001161766271988847, "grad_norm": 9.170719782280283, "kl": 0.00095367431640625, "learning_rate": 9.999999720166208e-07, "loss": 0.0, "reward": 1.792284369468689, "reward_std": 0.06845249980688095, "rewards/accuracy_reward": 0.4976227879524231, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2779948115348816, "step": 12 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 262.375, "epoch": 0.00012585801279879177, "grad_norm": 6.439842722309531, "kl": 0.00101470947265625, "learning_rate": 9.99999966697466e-07, "loss": 0.0, "reward": 1.527942419052124, "reward_std": 0.09239068627357483, "rewards/accuracy_reward": 0.4134891927242279, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1061197966337204, "step": 13 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 229.45834350585938, "epoch": 0.00013553939839869883, "grad_norm": 4.736288227057585, "kl": 0.0012054443359375, "learning_rate": 9.999999609157763e-07, "loss": 0.0, "reward": 1.5963399410247803, "reward_std": 0.4139372706413269, "rewards/accuracy_reward": 0.4363138675689697, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1516927182674408, "step": 14 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 193.20834350585938, "epoch": 0.00014522078399860589, "grad_norm": 3.8520399139072747, "kl": 0.00173187255859375, "learning_rate": 9.999999546715514e-07, "loss": 0.0001, "reward": 1.5987792015075684, "reward_std": 0.22279062867164612, "rewards/accuracy_reward": 0.40057772397994995, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2398681640625, "step": 15 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 386.8333435058594, "epoch": 0.00015490216959851294, "grad_norm": 18.11441232521738, "kl": 0.00141143798828125, "learning_rate": 9.999999479647914e-07, "loss": 0.0001, "reward": 1.0429072380065918, "reward_std": 0.30901962518692017, "rewards/accuracy_reward": 0.15669701993465424, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.1362101286649704, "step": 16 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 243.1666717529297, "epoch": 0.00016458355519842, "grad_norm": 6.9851978178727725, "kl": 0.0018463134765625, "learning_rate": 9.99999940795496e-07, "loss": 0.0001, "reward": 1.5036861896514893, "reward_std": 0.3192262053489685, "rewards/accuracy_reward": 0.3331376314163208, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1455485075712204, "step": 17 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 313.2083435058594, "epoch": 0.00017426494079832706, "grad_norm": 2.482246260293583, "kl": 0.00128936767578125, "learning_rate": 9.999999331636656e-07, "loss": 0.0001, "reward": 1.780838966369629, "reward_std": 0.4219633936882019, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1308390349149704, "step": 18 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 283.54168701171875, "epoch": 0.00018394632639823411, "grad_norm": 2.6958721345409487, "kl": 0.00147247314453125, "learning_rate": 9.999999250693e-07, "loss": 0.0001, "reward": 1.4792029857635498, "reward_std": 0.41808927059173584, "rewards/accuracy_reward": 0.32195186614990234, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1405843198299408, "step": 19 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 391.0833435058594, "epoch": 0.00019362771199814117, "grad_norm": 3.5278618647404985, "kl": 0.00193023681640625, "learning_rate": 9.999999165123993e-07, "loss": 0.0001, "reward": 1.493882179260254, "reward_std": 0.4038986563682556, "rewards/accuracy_reward": 0.4758971631526947, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.1846517026424408, "step": 20 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 239.4166717529297, "epoch": 0.00020330909759804823, "grad_norm": 4.247082937320684, "kl": 0.0040283203125, "learning_rate": 9.999999074929636e-07, "loss": 0.0002, "reward": 1.7764428853988647, "reward_std": 0.12965510785579681, "rewards/accuracy_reward": 0.48206624388694763, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2777099609375, "step": 21 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 294.16668701171875, "epoch": 0.00021299048319795529, "grad_norm": 2.995137915314803, "kl": 0.0024261474609375, "learning_rate": 9.999998980109928e-07, "loss": 0.0001, "reward": 1.6920270919799805, "reward_std": 0.334602415561676, "rewards/accuracy_reward": 0.38821038603782654, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2454833984375, "step": 22 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 235.7916717529297, "epoch": 0.00022267186879786234, "grad_norm": 6.097503672654647, "kl": 0.0025634765625, "learning_rate": 9.999998880664866e-07, "loss": 0.0001, "reward": 1.5351613759994507, "reward_std": 0.05848950147628784, "rewards/accuracy_reward": 0.3811981678009033, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1456298828125, "step": 23 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 367.5833435058594, "epoch": 0.0002323532543977694, "grad_norm": 2.0562312878902254, "kl": 0.001922607421875, "learning_rate": 9.999998776594457e-07, "loss": 0.0001, "reward": 1.9919718503952026, "reward_std": 0.5915851593017578, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2086385190486908, "step": 24 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 230.5, "epoch": 0.00024203463999767646, "grad_norm": 2.8329123676865064, "kl": 0.00396728515625, "learning_rate": 9.999998667898693e-07, "loss": 0.0002, "reward": 1.1903878450393677, "reward_std": 0.020909279584884644, "rewards/accuracy_reward": 0.08443891257047653, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0976155623793602, "step": 25 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 244.95834350585938, "epoch": 0.00025171602559758354, "grad_norm": 3.179137513077544, "kl": 0.0048828125, "learning_rate": 9.99999855457758e-07, "loss": 0.0002, "reward": 1.8097747564315796, "reward_std": 0.554413914680481, "rewards/accuracy_reward": 0.5459725260734558, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.23046875, "step": 26 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 250.4166717529297, "epoch": 0.00026139741119749057, "grad_norm": 12.527737859508068, "kl": 0.006256103515625, "learning_rate": 9.999998436631117e-07, "loss": 0.0003, "reward": 1.4230401515960693, "reward_std": 0.06623169779777527, "rewards/accuracy_reward": 0.21503232419490814, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2080078125, "step": 27 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 254.45834350585938, "epoch": 0.00027107879679739766, "grad_norm": 2.2366975775779268, "kl": 0.0029754638671875, "learning_rate": 9.999998314059305e-07, "loss": 0.0001, "reward": 1.4441794157028198, "reward_std": 0.29627853631973267, "rewards/accuracy_reward": 0.33932098746299744, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1048583984375, "step": 28 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 214.25, "epoch": 0.0002807601823973047, "grad_norm": 6.134868262123102, "kl": 0.0050048828125, "learning_rate": 9.99999818686214e-07, "loss": 0.0002, "reward": 1.9449918270111084, "reward_std": 0.08884235471487045, "rewards/accuracy_reward": 0.6588751077651978, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2694498896598816, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 277.91668701171875, "epoch": 0.00029044156799721177, "grad_norm": 3.5461936033595136, "kl": 0.00433349609375, "learning_rate": 9.999998055039624e-07, "loss": 0.0002, "reward": 1.8352688550949097, "reward_std": 0.26560965180397034, "rewards/accuracy_reward": 0.5093004107475281, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2843017578125, "step": 30 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 205.0416717529297, "epoch": 0.0003001229535971188, "grad_norm": 2.7671747182945, "kl": 0.007293701171875, "learning_rate": 9.99999791859176e-07, "loss": 0.0003, "reward": 1.7703099250793457, "reward_std": 0.03949965164065361, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2064208984375, "step": 31 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 287.875, "epoch": 0.0003098043391970259, "grad_norm": 3.1885902873591325, "kl": 0.0033416748046875, "learning_rate": 9.999997777518545e-07, "loss": 0.0001, "reward": 1.94183349609375, "reward_std": 0.28012752532958984, "rewards/accuracy_reward": 0.6225300431251526, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2443033903837204, "step": 32 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 262.79168701171875, "epoch": 0.0003194857247969329, "grad_norm": 2.769789628431254, "kl": 0.003326416015625, "learning_rate": 9.99999763181998e-07, "loss": 0.0001, "reward": 1.9392727613449097, "reward_std": 0.31999677419662476, "rewards/accuracy_reward": 0.6535387635231018, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2524007260799408, "step": 33 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 351.9583435058594, "epoch": 0.00032916711039684, "grad_norm": 2.107949650987347, "kl": 0.0022430419921875, "learning_rate": 9.999997481496067e-07, "loss": 0.0001, "reward": 1.4127153158187866, "reward_std": 0.37554359436035156, "rewards/accuracy_reward": 0.24352572858333588, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1275227963924408, "step": 34 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 320.7083435058594, "epoch": 0.00033884849599674703, "grad_norm": 3.2346474029494923, "kl": 0.004852294921875, "learning_rate": 9.999997326546803e-07, "loss": 0.0002, "reward": 1.8155333995819092, "reward_std": 0.33576905727386475, "rewards/accuracy_reward": 0.5309956669807434, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2345377653837204, "step": 35 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 276.25, "epoch": 0.0003485298815966541, "grad_norm": 4.828155172379193, "kl": 0.00537109375, "learning_rate": 9.999997166972188e-07, "loss": 0.0002, "reward": 1.6908411979675293, "reward_std": 0.354861319065094, "rewards/accuracy_reward": 0.42172497510910034, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2191162109375, "step": 36 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 216.45834350585938, "epoch": 0.0003582112671965612, "grad_norm": 2.374120970639913, "kl": 0.0030059814453125, "learning_rate": 9.999997002772226e-07, "loss": 0.0001, "reward": 1.4286350011825562, "reward_std": 0.21755945682525635, "rewards/accuracy_reward": 0.3055555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.11474609375, "step": 37 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 495.2083435058594, "epoch": 0.00036789265279646823, "grad_norm": 1.430128821188822, "kl": 0.00142669677734375, "learning_rate": 9.999996833946915e-07, "loss": 0.0001, "reward": 1.5226024389266968, "reward_std": 0.6714519262313843, "rewards/accuracy_reward": 0.4775095582008362, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.1784261167049408, "step": 38 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 200.20834350585938, "epoch": 0.0003775740383963753, "grad_norm": 2.679286368702018, "kl": 0.004638671875, "learning_rate": 9.999996660496253e-07, "loss": 0.0002, "reward": 1.7791748046875, "reward_std": 0.36083146929740906, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1541748046875, "step": 39 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 245.5, "epoch": 0.00038725542399628234, "grad_norm": 4.178744005625897, "kl": 0.003662109375, "learning_rate": 9.999996482420244e-07, "loss": 0.0001, "reward": 1.6380624771118164, "reward_std": 0.04689010977745056, "rewards/accuracy_reward": 0.48192232847213745, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1478068083524704, "step": 40 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 251.1666717529297, "epoch": 0.00039693680959618943, "grad_norm": 3.8028339059635115, "kl": 0.003387451171875, "learning_rate": 9.999996299718886e-07, "loss": 0.0001, "reward": 1.6804851293563843, "reward_std": 0.3041228950023651, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2054850310087204, "step": 41 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 253.0416717529297, "epoch": 0.00040661819519609646, "grad_norm": 4.928270845737591, "kl": 0.005584716796875, "learning_rate": 9.99999611239218e-07, "loss": 0.0002, "reward": 2.0481300354003906, "reward_std": 0.10336107015609741, "rewards/accuracy_reward": 0.7751114368438721, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.23968505859375, "step": 42 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 239.1666717529297, "epoch": 0.00041629958079600354, "grad_norm": 2.883005306811863, "kl": 0.0034942626953125, "learning_rate": 9.999995920440122e-07, "loss": 0.0001, "reward": 1.907381296157837, "reward_std": 0.20796941220760345, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1907145231962204, "step": 43 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 247.70834350585938, "epoch": 0.00042598096639591057, "grad_norm": 35.63990018327172, "kl": 0.005645751953125, "learning_rate": 9.99999572386272e-07, "loss": 0.0002, "reward": 2.0485177040100098, "reward_std": 0.33672863245010376, "rewards/accuracy_reward": 0.8236885070800781, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1998291015625, "step": 44 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 255.875, "epoch": 0.00043566235199581766, "grad_norm": 2.24686895008275, "kl": 0.005035400390625, "learning_rate": 9.999995522659967e-07, "loss": 0.0002, "reward": 1.2665064334869385, "reward_std": 0.19445715844631195, "rewards/accuracy_reward": 0.13115471601486206, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1520182341337204, "step": 45 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 300.125, "epoch": 0.0004453437375957247, "grad_norm": 2.761223104768101, "kl": 0.0078125, "learning_rate": 9.999995316831868e-07, "loss": 0.0003, "reward": 1.5128042697906494, "reward_std": 0.17071500420570374, "rewards/accuracy_reward": 0.29656070470809937, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1912434995174408, "step": 46 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 281.125, "epoch": 0.00045502512319563177, "grad_norm": 2.2277537537615406, "kl": 0.006103515625, "learning_rate": 9.999995106378422e-07, "loss": 0.0002, "reward": 2.208097457885742, "reward_std": 0.3403695225715637, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2580973505973816, "step": 47 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 251.7916717529297, "epoch": 0.0004647065087955388, "grad_norm": 4.668471011194852, "kl": 0.007110595703125, "learning_rate": 9.999994891299625e-07, "loss": 0.0003, "reward": 1.8106863498687744, "reward_std": 0.13276630640029907, "rewards/accuracy_reward": 0.5014984607696533, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2925211787223816, "step": 48 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 325.3333435058594, "epoch": 0.0004743878943954459, "grad_norm": 2.5677282946688553, "kl": 0.00762939453125, "learning_rate": 9.999994671595484e-07, "loss": 0.0003, "reward": 1.9520070552825928, "reward_std": 0.12136250734329224, "rewards/accuracy_reward": 0.5753225088119507, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2683512568473816, "step": 49 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 343.0833435058594, "epoch": 0.0004840692799953529, "grad_norm": 1.5995595156971427, "kl": 0.005126953125, "learning_rate": 9.999994447265995e-07, "loss": 0.0002, "reward": 1.3803495168685913, "reward_std": 0.3354682922363281, "rewards/accuracy_reward": 0.201776921749115, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1619059294462204, "step": 50 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 411.625, "epoch": 0.0004937506655952599, "grad_norm": 1.5164621529741111, "kl": 0.002655029296875, "learning_rate": 9.99999421831116e-07, "loss": 0.0001, "reward": 1.462019920349121, "reward_std": 0.297624409198761, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1120198592543602, "step": 51 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 283.04168701171875, "epoch": 0.0005034320511951671, "grad_norm": 3.733194632461437, "kl": 0.007415771484375, "learning_rate": 9.999993984730979e-07, "loss": 0.0003, "reward": 1.9562478065490723, "reward_std": 0.11199294030666351, "rewards/accuracy_reward": 0.5989317893981934, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2906494140625, "step": 52 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 281.125, "epoch": 0.0005131134367950741, "grad_norm": 4.686342744943605, "kl": 0.0078125, "learning_rate": 9.999993746525449e-07, "loss": 0.0003, "reward": 1.8454368114471436, "reward_std": 0.2958255708217621, "rewards/accuracy_reward": 0.516831636428833, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2786051630973816, "step": 53 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 326.375, "epoch": 0.0005227948223949811, "grad_norm": 1.9921670324672636, "kl": 0.0059814453125, "learning_rate": 9.999993503694575e-07, "loss": 0.0002, "reward": 1.6340093612670898, "reward_std": 0.1654907613992691, "rewards/accuracy_reward": 0.4494798183441162, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1845296323299408, "step": 54 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 333.0833435058594, "epoch": 0.0005324762079948882, "grad_norm": 6.094894720748536, "kl": 0.00921630859375, "learning_rate": 9.999993256238353e-07, "loss": 0.0004, "reward": 1.6164811849594116, "reward_std": 0.095056913793087, "rewards/accuracy_reward": 0.2899145483970642, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.20989990234375, "step": 55 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 296.41668701171875, "epoch": 0.0005421575935947953, "grad_norm": 8.260689313146225, "kl": 0.00714111328125, "learning_rate": 9.999993004156786e-07, "loss": 0.0003, "reward": 1.956239104270935, "reward_std": 0.466292142868042, "rewards/accuracy_reward": 0.6322726011276245, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2739664912223816, "step": 56 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 262.3333435058594, "epoch": 0.0005518389791947023, "grad_norm": 4.741332999113126, "kl": 0.0081787109375, "learning_rate": 9.999992747449874e-07, "loss": 0.0003, "reward": 1.9506481885910034, "reward_std": 0.11475707590579987, "rewards/accuracy_reward": 0.6639049649238586, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2450765073299408, "step": 57 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 407.54168701171875, "epoch": 0.0005615203647946094, "grad_norm": 2.6097906729816303, "kl": 0.006439208984375, "learning_rate": 9.999992486117617e-07, "loss": 0.0003, "reward": 1.7529609203338623, "reward_std": 0.46296805143356323, "rewards/accuracy_reward": 0.5940172076225281, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2506103515625, "step": 58 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 307.0, "epoch": 0.0005712017503945165, "grad_norm": 3.8270525788354304, "kl": 0.0084228515625, "learning_rate": 9.999992220160016e-07, "loss": 0.0003, "reward": 1.4457576274871826, "reward_std": 0.07583862543106079, "rewards/accuracy_reward": 0.1730036735534668, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19775390625, "step": 59 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 249.125, "epoch": 0.0005808831359944235, "grad_norm": 2.935044929722553, "kl": 0.0093994140625, "learning_rate": 9.999991949577067e-07, "loss": 0.0004, "reward": 1.8851399421691895, "reward_std": 0.18500691652297974, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.226806640625, "step": 60 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 307.0833435058594, "epoch": 0.0005905645215943306, "grad_norm": 4.472391532197234, "kl": 0.007537841796875, "learning_rate": 9.999991674368776e-07, "loss": 0.0003, "reward": 2.3194172382354736, "reward_std": 0.200618177652359, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2777506709098816, "step": 61 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.04168701171875, "epoch": 0.0006002459071942376, "grad_norm": 9.867531295466552, "kl": 0.007720947265625, "learning_rate": 9.99999139453514e-07, "loss": 0.0003, "reward": 2.305647850036621, "reward_std": 0.08528100699186325, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.247314453125, "step": 62 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 245.7916717529297, "epoch": 0.0006099272927941447, "grad_norm": 4.4441269026483115, "kl": 0.010986328125, "learning_rate": 9.999991110076161e-07, "loss": 0.0004, "reward": 1.6424455642700195, "reward_std": 0.058063071221113205, "rewards/accuracy_reward": 0.35164961218833923, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2741292417049408, "step": 63 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 315.41668701171875, "epoch": 0.0006196086783940518, "grad_norm": 6.4628623976494355, "kl": 0.00701904296875, "learning_rate": 9.999990820991838e-07, "loss": 0.0003, "reward": 1.4654563665390015, "reward_std": 0.26304322481155396, "rewards/accuracy_reward": 0.2557312846183777, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1347249448299408, "step": 64 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 329.8333435058594, "epoch": 0.0006292900639939588, "grad_norm": 1.391914660402965, "kl": 0.004180908203125, "learning_rate": 9.99999052728217e-07, "loss": 0.0002, "reward": 1.8870930671691895, "reward_std": 0.20147721469402313, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.228759765625, "step": 65 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 388.875, "epoch": 0.0006389714495938658, "grad_norm": 2.393900824801779, "kl": 0.00677490234375, "learning_rate": 9.99999022894716e-07, "loss": 0.0003, "reward": 1.6151301860809326, "reward_std": 0.27184855937957764, "rewards/accuracy_reward": 0.2951757311820984, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2449544370174408, "step": 66 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 237.0416717529297, "epoch": 0.000648652835193773, "grad_norm": 3.0737345692077294, "kl": 0.005889892578125, "learning_rate": 9.999989925986807e-07, "loss": 0.0002, "reward": 2.055352210998535, "reward_std": 0.24674245715141296, "rewards/accuracy_reward": 0.758680522441864, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2966715693473816, "step": 67 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 348.8333435058594, "epoch": 0.00065833422079368, "grad_norm": 1.3515079308343187, "kl": 0.0057373046875, "learning_rate": 9.99998961840111e-07, "loss": 0.0002, "reward": 1.4497884511947632, "reward_std": 0.04191696271300316, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0747884139418602, "step": 68 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 284.3333435058594, "epoch": 0.000668015606393587, "grad_norm": 2.4338732849839086, "kl": 0.0087890625, "learning_rate": 9.999989306190073e-07, "loss": 0.0004, "reward": 1.3450536727905273, "reward_std": 0.26576748490333557, "rewards/accuracy_reward": 0.1901872158050537, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1548665463924408, "step": 69 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 381.875, "epoch": 0.0006776969919934941, "grad_norm": 0.6333029058507517, "kl": 0.00537109375, "learning_rate": 9.999988989353694e-07, "loss": 0.0002, "reward": 1.4287109375, "reward_std": 0.1760980784893036, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0953776091337204, "step": 70 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 381.0, "epoch": 0.0006873783775934012, "grad_norm": 45.53724345486743, "kl": 0.008056640625, "learning_rate": 9.99998866789197e-07, "loss": 0.0003, "reward": 1.5727390050888062, "reward_std": 0.3160770535469055, "rewards/accuracy_reward": 0.3716404438018799, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1677653044462204, "step": 71 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 266.7083435058594, "epoch": 0.0006970597631933082, "grad_norm": 3.471651002611566, "kl": 0.007659912109375, "learning_rate": 9.999988341804906e-07, "loss": 0.0003, "reward": 2.090010643005371, "reward_std": 0.2683168053627014, "rewards/accuracy_reward": 0.7387737035751343, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3095703125, "step": 72 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 264.2083435058594, "epoch": 0.0007067411487932153, "grad_norm": 0.1204950967631191, "kl": 0.005889892578125, "learning_rate": 9.999988011092503e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0, "step": 73 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 273.375, "epoch": 0.0007164225343931224, "grad_norm": 1.7345570206203866, "kl": 0.0103759765625, "learning_rate": 9.999987675754757e-07, "loss": 0.0004, "reward": 1.2261244058609009, "reward_std": 0.04045962542295456, "rewards/accuracy_reward": 0.08374153077602386, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1090494841337204, "step": 74 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 251.70834350585938, "epoch": 0.0007261039199930294, "grad_norm": 2.397771938335922, "kl": 0.009765625, "learning_rate": 9.999987335791672e-07, "loss": 0.0004, "reward": 1.5128238201141357, "reward_std": 0.23105761408805847, "rewards/accuracy_reward": 0.33022284507751465, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1659342497587204, "step": 75 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 234.125, "epoch": 0.0007357853055929365, "grad_norm": 5.560823956094934, "kl": 0.0093994140625, "learning_rate": 9.999986991203244e-07, "loss": 0.0004, "reward": 2.041436195373535, "reward_std": 0.2787952125072479, "rewards/accuracy_reward": 0.8472222685813904, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2358805388212204, "step": 76 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 272.0, "epoch": 0.0007454666911928435, "grad_norm": 4.036151612670064, "kl": 0.01263427734375, "learning_rate": 9.999986641989477e-07, "loss": 0.0005, "reward": 1.6388180255889893, "reward_std": 0.25800129771232605, "rewards/accuracy_reward": 0.4449133276939392, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1689046323299408, "step": 77 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 438.4583435058594, "epoch": 0.0007551480767927506, "grad_norm": 2.647937886371552, "kl": 0.008056640625, "learning_rate": 9.999986288150372e-07, "loss": 0.0003, "reward": 1.3895328044891357, "reward_std": 0.21611981093883514, "rewards/accuracy_reward": 0.40774568915367126, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.2484537810087204, "step": 78 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 394.54168701171875, "epoch": 0.0007648294623926577, "grad_norm": 2.985543360862308, "kl": 0.0048828125, "learning_rate": 9.999985929685927e-07, "loss": 0.0002, "reward": 1.8473584651947021, "reward_std": 0.2096451222896576, "rewards/accuracy_reward": 0.4726838767528534, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2496744841337204, "step": 79 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 248.125, "epoch": 0.0007745108479925647, "grad_norm": 1.9932634992468594, "kl": 0.00787353515625, "learning_rate": 9.999985566596142e-07, "loss": 0.0003, "reward": 1.3222575187683105, "reward_std": 0.19987516105175018, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0639241561293602, "step": 80 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 299.625, "epoch": 0.0007841922335924717, "grad_norm": 3.523028403747184, "kl": 0.00982666015625, "learning_rate": 9.99998519888102e-07, "loss": 0.0004, "reward": 2.031838893890381, "reward_std": 0.3913438320159912, "rewards/accuracy_reward": 0.7533233165740967, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.228515625, "step": 81 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 454.2083435058594, "epoch": 0.0007938736191923789, "grad_norm": 2.85095650997578, "kl": 0.006561279296875, "learning_rate": 9.99998482654056e-07, "loss": 0.0003, "reward": 1.0645601749420166, "reward_std": 0.3931168019771576, "rewards/accuracy_reward": 0.1502615511417389, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.1142985075712204, "step": 82 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 333.8333435058594, "epoch": 0.0008035550047922859, "grad_norm": 5.742915414118102, "kl": 0.00872802734375, "learning_rate": 9.99998444957476e-07, "loss": 0.0003, "reward": 1.6674938201904297, "reward_std": 0.0895787924528122, "rewards/accuracy_reward": 0.34438180923461914, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2314453125, "step": 83 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 289.125, "epoch": 0.0008132363903921929, "grad_norm": 2.413617584200721, "kl": 0.0062255859375, "learning_rate": 9.999984067983623e-07, "loss": 0.0002, "reward": 1.0401476621627808, "reward_std": 0.11355449259281158, "rewards/accuracy_reward": 0.0277777798473835, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.012369791977107525, "step": 84 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 438.375, "epoch": 0.0008229177759920999, "grad_norm": 6.374557831494328, "kl": 0.006622314453125, "learning_rate": 9.99998368176715e-07, "loss": 0.0003, "reward": 0.9076051115989685, "reward_std": 0.22621206939220428, "rewards/accuracy_reward": 0.07116303592920303, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.1031087264418602, "step": 85 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 283.79168701171875, "epoch": 0.0008325991615920071, "grad_norm": 1.4233350971465746, "kl": 0.00775146484375, "learning_rate": 9.999983290925338e-07, "loss": 0.0003, "reward": 1.4673665761947632, "reward_std": 0.032621391117572784, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0923665389418602, "step": 86 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 297.29168701171875, "epoch": 0.0008422805471919141, "grad_norm": 3.9070931524328354, "kl": 0.00762939453125, "learning_rate": 9.999982895458191e-07, "loss": 0.0003, "reward": 1.3250465393066406, "reward_std": 0.3443993031978607, "rewards/accuracy_reward": 0.17889586091041565, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1378173828125, "step": 87 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 280.5, "epoch": 0.0008519619327918211, "grad_norm": 2.373075106461087, "kl": 0.007415771484375, "learning_rate": 9.999982495365709e-07, "loss": 0.0003, "reward": 2.0932579040527344, "reward_std": 0.2912212610244751, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.23492431640625, "step": 88 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 291.5, "epoch": 0.0008616433183917282, "grad_norm": 15.869542793265149, "kl": 0.007232666015625, "learning_rate": 9.999982090647888e-07, "loss": 0.0003, "reward": 1.8383519649505615, "reward_std": 0.377177894115448, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2411295622587204, "step": 89 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 379.4583435058594, "epoch": 0.0008713247039916353, "grad_norm": 4.155532229408532, "kl": 0.00518798828125, "learning_rate": 9.999981681304737e-07, "loss": 0.0002, "reward": 2.053783655166626, "reward_std": 0.38841724395751953, "rewards/accuracy_reward": 0.805555522441864, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1732279509305954, "step": 90 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 341.8333435058594, "epoch": 0.0008810060895915423, "grad_norm": 2.4718350187046854, "kl": 0.01202392578125, "learning_rate": 9.999981267336247e-07, "loss": 0.0005, "reward": 1.6466999053955078, "reward_std": 0.25324004888534546, "rewards/accuracy_reward": 0.3323730230331421, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2476603239774704, "step": 91 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 282.91668701171875, "epoch": 0.0008906874751914494, "grad_norm": 5.640394738499631, "kl": 0.0098876953125, "learning_rate": 9.999980848742422e-07, "loss": 0.0004, "reward": 2.062883138656616, "reward_std": 0.3548971116542816, "rewards/accuracy_reward": 0.7370855212211609, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.3174642026424408, "step": 92 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 346.0, "epoch": 0.0009003688607913565, "grad_norm": 2.111380015753831, "kl": 0.0089111328125, "learning_rate": 9.999980425523263e-07, "loss": 0.0004, "reward": 1.9707112312316895, "reward_std": 0.08480590581893921, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2290446013212204, "step": 93 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 249.45834350585938, "epoch": 0.0009100502463912635, "grad_norm": 2.071295962677454, "kl": 0.016845703125, "learning_rate": 9.99997999767877e-07, "loss": 0.0007, "reward": 1.708444595336914, "reward_std": 0.25974807143211365, "rewards/accuracy_reward": 0.4722222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2112223356962204, "step": 94 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 326.91668701171875, "epoch": 0.0009197316319911706, "grad_norm": 1.8782098086894647, "kl": 0.00982666015625, "learning_rate": 9.999979565208945e-07, "loss": 0.0004, "reward": 1.7357558012008667, "reward_std": 0.23542520403862, "rewards/accuracy_reward": 0.4930555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1843668669462204, "step": 95 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 256.04168701171875, "epoch": 0.0009294130175910776, "grad_norm": 2.316167808240408, "kl": 0.01202392578125, "learning_rate": 9.999979128113786e-07, "loss": 0.0005, "reward": 1.612837791442871, "reward_std": 0.04762125015258789, "rewards/accuracy_reward": 0.4158245027065277, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1470133513212204, "step": 96 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 300.0833435058594, "epoch": 0.0009390944031909847, "grad_norm": 1.7239626465092501, "kl": 0.00982666015625, "learning_rate": 9.999978686393295e-07, "loss": 0.0004, "reward": 1.0773310661315918, "reward_std": 0.04841721057891846, "rewards/accuracy_reward": 0.01873725652694702, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.05859375, "step": 97 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 342.5833435058594, "epoch": 0.0009487757887908918, "grad_norm": 3.1855141375586062, "kl": 0.00897216796875, "learning_rate": 9.99997824004747e-07, "loss": 0.0004, "reward": 1.9785655736923218, "reward_std": 0.1437837928533554, "rewards/accuracy_reward": 0.552011251449585, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3015543818473816, "step": 98 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 263.7083435058594, "epoch": 0.0009584571743907988, "grad_norm": 2.5894289511540713, "kl": 0.00982666015625, "learning_rate": 9.999977789076317e-07, "loss": 0.0004, "reward": 1.4788737297058105, "reward_std": 0.030171243473887444, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13720703125, "step": 99 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 409.54168701171875, "epoch": 0.0009681385599907058, "grad_norm": 2.600223924041825, "kl": 0.00787353515625, "learning_rate": 9.99997733347983e-07, "loss": 0.0003, "reward": 1.5551249980926514, "reward_std": 0.23242078721523285, "rewards/accuracy_reward": 0.40698033571243286, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1814778745174408, "step": 100 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 301.29168701171875, "epoch": 0.000977819945590613, "grad_norm": 5.28634673990987, "kl": 0.011962890625, "learning_rate": 9.999976873258012e-07, "loss": 0.0005, "reward": 1.7200007438659668, "reward_std": 0.2106447070837021, "rewards/accuracy_reward": 0.43703362345695496, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2079671323299408, "step": 101 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 454.54168701171875, "epoch": 0.0009875013311905199, "grad_norm": 5.748914739664536, "kl": 0.01007080078125, "learning_rate": 9.999976408410864e-07, "loss": 0.0004, "reward": 1.820683240890503, "reward_std": 0.4779224395751953, "rewards/accuracy_reward": 0.6169111728668213, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2204386442899704, "step": 102 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 308.0, "epoch": 0.000997182716790427, "grad_norm": 2.8916641468568614, "kl": 0.00970458984375, "learning_rate": 9.999975938938385e-07, "loss": 0.0004, "reward": 2.0653157234191895, "reward_std": 0.4708643853664398, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2403157651424408, "step": 103 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 299.625, "epoch": 0.0010068641023903342, "grad_norm": 2.692994320301664, "kl": 0.01385498046875, "learning_rate": 9.999975464840577e-07, "loss": 0.0006, "reward": 2.018651008605957, "reward_std": 0.23854486644268036, "rewards/accuracy_reward": 0.6727282404899597, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2792561948299408, "step": 104 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 325.3333435058594, "epoch": 0.001016545487990241, "grad_norm": 2.6839826444017407, "kl": 0.01263427734375, "learning_rate": 9.99997498611744e-07, "loss": 0.0005, "reward": 1.6704788208007812, "reward_std": 0.09353940188884735, "rewards/accuracy_reward": 0.4340042769908905, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1781412810087204, "step": 105 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 282.2083435058594, "epoch": 0.0010262268735901482, "grad_norm": 3.8677351280199463, "kl": 0.01239013671875, "learning_rate": 9.999974502768972e-07, "loss": 0.0005, "reward": 1.933692455291748, "reward_std": 0.28383713960647583, "rewards/accuracy_reward": 0.5733897089958191, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.310302734375, "step": 106 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 281.125, "epoch": 0.0010359082591900554, "grad_norm": 3.1275430071443187, "kl": 0.0108642578125, "learning_rate": 9.999974014795178e-07, "loss": 0.0004, "reward": 1.8123868703842163, "reward_std": 0.3379300534725189, "rewards/accuracy_reward": 0.5637947916984558, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2235921323299408, "step": 107 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 353.41668701171875, "epoch": 0.0010455896447899623, "grad_norm": 2.803863908382271, "kl": 0.0101318359375, "learning_rate": 9.999973522196057e-07, "loss": 0.0004, "reward": 1.6542062759399414, "reward_std": 0.5424546599388123, "rewards/accuracy_reward": 0.3882233500480652, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.17431640625, "step": 108 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 297.66668701171875, "epoch": 0.0010552710303898694, "grad_norm": 3.2130731139944313, "kl": 0.010498046875, "learning_rate": 9.999973024971606e-07, "loss": 0.0004, "reward": 1.3560981750488281, "reward_std": 0.2728726863861084, "rewards/accuracy_reward": 0.2361111342906952, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1116536483168602, "step": 109 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 315.16668701171875, "epoch": 0.0010649524159897763, "grad_norm": 7.976109808767599, "kl": 0.01416015625, "learning_rate": 9.999972523121828e-07, "loss": 0.0006, "reward": 1.7122306823730469, "reward_std": 0.08463114500045776, "rewards/accuracy_reward": 0.4601392149925232, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1854248046875, "step": 110 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 352.625, "epoch": 0.0010746338015896835, "grad_norm": 7.967178273222902, "kl": 0.01202392578125, "learning_rate": 9.999972016646725e-07, "loss": 0.0005, "reward": 2.1920697689056396, "reward_std": 0.23688280582427979, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2670695185661316, "step": 111 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 321.3333435058594, "epoch": 0.0010843151871895906, "grad_norm": 3.30475641679048, "kl": 0.0185546875, "learning_rate": 9.999971505546295e-07, "loss": 0.0007, "reward": 2.1679301261901855, "reward_std": 0.28440266847610474, "rewards/accuracy_reward": 0.7156920433044434, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3772379755973816, "step": 112 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 300.16668701171875, "epoch": 0.0010939965727894975, "grad_norm": 2.6928339627948663, "kl": 0.0167236328125, "learning_rate": 9.99997098982054e-07, "loss": 0.0007, "reward": 1.34972083568573, "reward_std": 0.11288432776927948, "rewards/accuracy_reward": 0.15145820379257202, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1565958708524704, "step": 113 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 283.4583435058594, "epoch": 0.0011036779583894047, "grad_norm": 1.6017689274660416, "kl": 0.0107421875, "learning_rate": 9.999970469469458e-07, "loss": 0.0004, "reward": 1.1651570796966553, "reward_std": 0.2279965728521347, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.01515706442296505, "step": 114 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 321.54168701171875, "epoch": 0.0011133593439893118, "grad_norm": 2.9528272930427195, "kl": 0.01361083984375, "learning_rate": 9.999969944493055e-07, "loss": 0.0005, "reward": 1.5565202236175537, "reward_std": 0.09017335623502731, "rewards/accuracy_reward": 0.3229670226573944, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1418863981962204, "step": 115 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 386.125, "epoch": 0.0011230407295892187, "grad_norm": 5.04140117752252, "kl": 0.00970458984375, "learning_rate": 9.999969414891325e-07, "loss": 0.0004, "reward": 1.5175541639328003, "reward_std": 0.28467273712158203, "rewards/accuracy_reward": 0.22687625885009766, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2156779021024704, "step": 116 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 539.875, "epoch": 0.0011327221151891259, "grad_norm": 1.7775888212505737, "kl": 0.00506591796875, "learning_rate": 9.999968880664272e-07, "loss": 0.0002, "reward": 1.3311686515808105, "reward_std": 0.5687007904052734, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.1561686247587204, "step": 117 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 386.25, "epoch": 0.001142403500789033, "grad_norm": 1.1309795600503159, "kl": 0.00885009765625, "learning_rate": 9.999968341811896e-07, "loss": 0.0004, "reward": 1.2343918085098267, "reward_std": 0.03528338298201561, "rewards/accuracy_reward": 0.10354055464267731, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0808512419462204, "step": 118 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 405.125, "epoch": 0.00115208488638894, "grad_norm": 2.9020332369578306, "kl": 0.01068115234375, "learning_rate": 9.999967798334197e-07, "loss": 0.0004, "reward": 1.8593262434005737, "reward_std": 0.6334639191627502, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.184326171875, "step": 119 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 380.5833435058594, "epoch": 0.001161766271988847, "grad_norm": 2.0016769958711103, "kl": 0.00946044921875, "learning_rate": 9.999967250231177e-07, "loss": 0.0004, "reward": 2.086763381958008, "reward_std": 0.36255699396133423, "rewards/accuracy_reward": 0.7193154692649841, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2591145932674408, "step": 120 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 459.8333435058594, "epoch": 0.001171447657588754, "grad_norm": 2.6591087029026013, "kl": 0.01007080078125, "learning_rate": 9.999966697502832e-07, "loss": 0.0004, "reward": 1.3466167449951172, "reward_std": 0.19247514009475708, "rewards/accuracy_reward": 0.4231709837913513, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.1567789763212204, "step": 121 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 352.125, "epoch": 0.0011811290431886611, "grad_norm": 3.630451606405393, "kl": 0.0128173828125, "learning_rate": 9.99996614014917e-07, "loss": 0.0005, "reward": 1.8268941640853882, "reward_std": 0.2757302522659302, "rewards/accuracy_reward": 0.5167540907859802, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.226806640625, "step": 122 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 325.375, "epoch": 0.0011908104287885683, "grad_norm": 2.2127337322873863, "kl": 0.0111083984375, "learning_rate": 9.999965578170186e-07, "loss": 0.0004, "reward": 1.2846713066101074, "reward_std": 0.49837058782577515, "rewards/accuracy_reward": 0.19306159019470215, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0666097030043602, "step": 123 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 369.75, "epoch": 0.0012004918143884752, "grad_norm": 8.69736936019502, "kl": 0.013427734375, "learning_rate": 9.99996501156588e-07, "loss": 0.0005, "reward": 2.1344499588012695, "reward_std": 0.10180775821208954, "rewards/accuracy_reward": 0.6523697376251221, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3404134213924408, "step": 124 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 528.75, "epoch": 0.0012101731999883823, "grad_norm": 1.120285701991284, "kl": 0.00439453125, "learning_rate": 9.999964440336257e-07, "loss": 0.0002, "reward": 1.3934407234191895, "reward_std": 0.3504936099052429, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1517740935087204, "step": 125 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 553.0, "epoch": 0.0012198545855882895, "grad_norm": 0.739515285359868, "kl": 0.006072998046875, "learning_rate": 9.999963864481314e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.17817416787147522, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5, "rewards/semantic_reward": 0.0, "step": 126 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 342.25, "epoch": 0.0012295359711881964, "grad_norm": 10.774435643348001, "kl": 0.01556396484375, "learning_rate": 9.99996328400105e-07, "loss": 0.0006, "reward": 1.4000394344329834, "reward_std": 0.24679654836654663, "rewards/accuracy_reward": 0.1967679262161255, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1449381560087204, "step": 127 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 332.5, "epoch": 0.0012392173567881035, "grad_norm": 3.3670253534978505, "kl": 0.0169677734375, "learning_rate": 9.999962698895473e-07, "loss": 0.0007, "reward": 1.8025693893432617, "reward_std": 0.0622062087059021, "rewards/accuracy_reward": 0.48302173614501953, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.202880859375, "step": 128 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 321.125, "epoch": 0.0012488987423880107, "grad_norm": 6.233324528957824, "kl": 0.016357421875, "learning_rate": 9.999962109164574e-07, "loss": 0.0007, "reward": 1.932242751121521, "reward_std": 0.0967564806342125, "rewards/accuracy_reward": 0.5795164108276367, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2527262568473816, "step": 129 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 422.75, "epoch": 0.0012585801279879176, "grad_norm": 3.275898202775583, "kl": 0.01312255859375, "learning_rate": 9.999961514808361e-07, "loss": 0.0005, "reward": 1.4577525854110718, "reward_std": 0.36945438385009766, "rewards/accuracy_reward": 0.33280956745147705, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1499430388212204, "step": 130 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 303.66668701171875, "epoch": 0.0012682615135878247, "grad_norm": 3.1086115133924093, "kl": 0.01300048828125, "learning_rate": 9.999960915826831e-07, "loss": 0.0005, "reward": 1.7430744171142578, "reward_std": 0.485651433467865, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1930745542049408, "step": 131 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 347.5, "epoch": 0.0012779428991877317, "grad_norm": 2.068484112489206, "kl": 0.010986328125, "learning_rate": 9.999960312219983e-07, "loss": 0.0004, "reward": 1.2599610090255737, "reward_std": 0.5258345007896423, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0849609375, "step": 132 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 334.8333435058594, "epoch": 0.0012876242847876388, "grad_norm": 3.5473127537516587, "kl": 0.0211181640625, "learning_rate": 9.999959703987822e-07, "loss": 0.0008, "reward": 1.9599311351776123, "reward_std": 0.09523885697126389, "rewards/accuracy_reward": 0.6116319894790649, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2566325068473816, "step": 133 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 327.0, "epoch": 0.001297305670387546, "grad_norm": 2.1862498740315295, "kl": 0.01202392578125, "learning_rate": 9.999959091130345e-07, "loss": 0.0005, "reward": 1.6328206062316895, "reward_std": 0.2857286334037781, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1411539763212204, "step": 134 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 390.91668701171875, "epoch": 0.0013069870559874529, "grad_norm": 7.955688493162367, "kl": 0.01092529296875, "learning_rate": 9.999958473647557e-07, "loss": 0.0004, "reward": 2.296574115753174, "reward_std": 0.27410441637039185, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3215739130973816, "step": 135 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 481.5833435058594, "epoch": 0.00131666844158736, "grad_norm": 1.7441338418000738, "kl": 0.00775146484375, "learning_rate": 9.99995785153945e-07, "loss": 0.0003, "reward": 1.6513265371322632, "reward_std": 0.5524803996086121, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1513265073299408, "step": 136 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 320.54168701171875, "epoch": 0.0013263498271872671, "grad_norm": 2.569070514588624, "kl": 0.016357421875, "learning_rate": 9.999957224806034e-07, "loss": 0.0007, "reward": 1.6905248165130615, "reward_std": 0.2534968852996826, "rewards/accuracy_reward": 0.4722222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1766357421875, "step": 137 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 467.41668701171875, "epoch": 0.001336031212787174, "grad_norm": 2.205151740057286, "kl": 0.00640869140625, "learning_rate": 9.999956593447305e-07, "loss": 0.0003, "reward": 1.6753010749816895, "reward_std": 0.5816464424133301, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1836344450712204, "step": 138 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 434.5, "epoch": 0.0013457125983870812, "grad_norm": 7.461944671284251, "kl": 0.00994873046875, "learning_rate": 9.999955957463264e-07, "loss": 0.0004, "reward": 1.8087972402572632, "reward_std": 0.26572567224502563, "rewards/accuracy_reward": 0.5044108629226685, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2460530698299408, "step": 139 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 322.29168701171875, "epoch": 0.0013553939839869881, "grad_norm": 6.27019323077913, "kl": 0.0198974609375, "learning_rate": 9.999955316853913e-07, "loss": 0.0008, "reward": 1.8840248584747314, "reward_std": 0.15742343664169312, "rewards/accuracy_reward": 0.5321367979049683, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2685546875, "step": 140 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 349.8333435058594, "epoch": 0.0013650753695868953, "grad_norm": 3.030029882989174, "kl": 0.01708984375, "learning_rate": 9.999954671619249e-07, "loss": 0.0007, "reward": 1.333155870437622, "reward_std": 0.1908913105726242, "rewards/accuracy_reward": 0.16940250992774963, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0970865935087204, "step": 141 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 434.04168701171875, "epoch": 0.0013747567551868024, "grad_norm": 2.1446789556552304, "kl": 0.0142822265625, "learning_rate": 9.999954021759277e-07, "loss": 0.0006, "reward": 1.4999078512191772, "reward_std": 0.23951594531536102, "rewards/accuracy_reward": 0.3055555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1693522185087204, "step": 142 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.5, "epoch": 0.0013844381407867093, "grad_norm": 4.73292933582996, "kl": 0.01953125, "learning_rate": 9.999953367273995e-07, "loss": 0.0008, "reward": 1.7772185802459717, "reward_std": 0.38896703720092773, "rewards/accuracy_reward": 0.3616911768913269, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.29052734375, "step": 143 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 461.66668701171875, "epoch": 0.0013941195263866165, "grad_norm": 20.348288134850865, "kl": 0.01287841796875, "learning_rate": 9.999952708163402e-07, "loss": 0.0005, "reward": 1.4403324127197266, "reward_std": 0.46154236793518066, "rewards/accuracy_reward": 0.24074743688106537, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1579183042049408, "step": 144 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 304.29168701171875, "epoch": 0.0014038009119865236, "grad_norm": 5.293715865695287, "kl": 0.0194091796875, "learning_rate": 9.999952044427504e-07, "loss": 0.0008, "reward": 2.023409843444824, "reward_std": 0.29344063997268677, "rewards/accuracy_reward": 0.7145962715148926, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2421468198299408, "step": 145 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.2083435058594, "epoch": 0.0014134822975864305, "grad_norm": 3.1955153593179806, "kl": 0.0140380859375, "learning_rate": 9.999951376066297e-07, "loss": 0.0006, "reward": 1.9634169340133667, "reward_std": 0.5540640950202942, "rewards/accuracy_reward": 0.6805555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.182861328125, "step": 146 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 479.7083435058594, "epoch": 0.0014231636831863377, "grad_norm": 1.8946629301814997, "kl": 0.01507568359375, "learning_rate": 9.999950703079783e-07, "loss": 0.0006, "reward": 1.1438113451004028, "reward_std": 0.33001989126205444, "rewards/accuracy_reward": 0.17913030087947845, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.0896809920668602, "step": 147 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 451.79168701171875, "epoch": 0.0014328450687862448, "grad_norm": 2.8211019364621674, "kl": 0.01220703125, "learning_rate": 9.999950025467962e-07, "loss": 0.0005, "reward": 1.5406126976013184, "reward_std": 0.21355611085891724, "rewards/accuracy_reward": 0.45902085304260254, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.256591796875, "step": 148 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 323.0833435058594, "epoch": 0.0014425264543861517, "grad_norm": 1.8486105013310963, "kl": 0.01239013671875, "learning_rate": 9.999949343230836e-07, "loss": 0.0005, "reward": 1.6164177656173706, "reward_std": 0.06152179092168808, "rewards/accuracy_reward": 0.42729830741882324, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1391194760799408, "step": 149 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 459.2083435058594, "epoch": 0.0014522078399860589, "grad_norm": 2.8046107713342154, "kl": 0.016845703125, "learning_rate": 9.999948656368405e-07, "loss": 0.0007, "reward": 1.657078742980957, "reward_std": 0.5109454989433289, "rewards/accuracy_reward": 0.4970038831233978, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1600748747587204, "step": 150 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 429.75, "epoch": 0.0014618892255859658, "grad_norm": 4.250124453242239, "kl": 0.013427734375, "learning_rate": 9.999947964880669e-07, "loss": 0.0005, "reward": 1.465202808380127, "reward_std": 0.5032483339309692, "rewards/accuracy_reward": 0.2960784435272217, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1441243588924408, "step": 151 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 302.29168701171875, "epoch": 0.001471570611185873, "grad_norm": 2.2440527329428193, "kl": 0.0150146484375, "learning_rate": 9.99994726876763e-07, "loss": 0.0006, "reward": 1.6004788875579834, "reward_std": 0.18612031638622284, "rewards/accuracy_reward": 0.444444477558136, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.16436767578125, "step": 152 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 386.4583435058594, "epoch": 0.00148125199678578, "grad_norm": 3.3862523401386437, "kl": 0.013916015625, "learning_rate": 9.999946568029286e-07, "loss": 0.0006, "reward": 1.1877458095550537, "reward_std": 0.14929227530956268, "rewards/accuracy_reward": 0.1005387157201767, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0955403670668602, "step": 153 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 480.5833435058594, "epoch": 0.001490933382385687, "grad_norm": 1.9246885924085608, "kl": 0.012939453125, "learning_rate": 9.999945862665642e-07, "loss": 0.0005, "reward": 1.0012198686599731, "reward_std": 0.21463772654533386, "rewards/accuracy_reward": 0.06714590638875961, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.0924072265625, "step": 154 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.8333435058594, "epoch": 0.0015006147679855941, "grad_norm": 3.6957023286280393, "kl": 0.01220703125, "learning_rate": 9.999945152676695e-07, "loss": 0.0005, "reward": 1.5231188535690308, "reward_std": 0.34560924768447876, "rewards/accuracy_reward": 0.25988638401031494, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.188232421875, "step": 155 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.9583435058594, "epoch": 0.0015102961535855013, "grad_norm": 6.951401140241296, "kl": 0.01806640625, "learning_rate": 9.999944438062447e-07, "loss": 0.0007, "reward": 1.7932329177856445, "reward_std": 0.3526585102081299, "rewards/accuracy_reward": 0.45220106840133667, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.224365234375, "step": 156 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 298.3333435058594, "epoch": 0.0015199775391854082, "grad_norm": 2.2685828341617227, "kl": 0.0181884765625, "learning_rate": 9.999943718822898e-07, "loss": 0.0007, "reward": 1.6174352169036865, "reward_std": 0.1876598596572876, "rewards/accuracy_reward": 0.4010208547115326, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1580810546875, "step": 157 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 342.2083435058594, "epoch": 0.0015296589247853153, "grad_norm": 1.5688506482069087, "kl": 0.007598876953125, "learning_rate": 9.999942994958047e-07, "loss": 0.0003, "reward": 1.7082613706588745, "reward_std": 0.31293177604675293, "rewards/accuracy_reward": 0.4423840045928955, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1908772885799408, "step": 158 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 295.4583435058594, "epoch": 0.0015393403103852222, "grad_norm": 6.138260997221955, "kl": 0.019287109375, "learning_rate": 9.9999422664679e-07, "loss": 0.0008, "reward": 1.630552053451538, "reward_std": 0.06938620656728745, "rewards/accuracy_reward": 0.4294613301753998, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.159423828125, "step": 159 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 557.1666870117188, "epoch": 0.0015490216959851294, "grad_norm": 1.0727107182109923, "kl": 0.007598876953125, "learning_rate": 9.999941533352453e-07, "loss": 0.0003, "reward": 0.7375133037567139, "reward_std": 0.42025160789489746, "rewards/accuracy_reward": 0.037741176784038544, "rewards/format_reward": 0.625, "rewards/semantic_reward": 0.05810546875, "step": 160 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.79168701171875, "epoch": 0.0015587030815850365, "grad_norm": 3.4629955402790333, "kl": 0.02001953125, "learning_rate": 9.99994079561171e-07, "loss": 0.0008, "reward": 1.5702141523361206, "reward_std": 0.1995668113231659, "rewards/accuracy_reward": 0.29334235191345215, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1685384213924408, "step": 161 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 340.0, "epoch": 0.0015683844671849434, "grad_norm": 2.290034158584733, "kl": 0.01483154296875, "learning_rate": 9.999940053245668e-07, "loss": 0.0006, "reward": 1.8477230072021484, "reward_std": 0.23557327687740326, "rewards/accuracy_reward": 0.548585832118988, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2408040463924408, "step": 162 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 362.3333435058594, "epoch": 0.0015780658527848506, "grad_norm": 1.9454022508045965, "kl": 0.0166015625, "learning_rate": 9.99993930625433e-07, "loss": 0.0007, "reward": 1.330848217010498, "reward_std": 0.24829666316509247, "rewards/accuracy_reward": 0.16614282131195068, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1147054061293602, "step": 163 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 365.29168701171875, "epoch": 0.0015877472383847577, "grad_norm": 2.7863517706017262, "kl": 0.01611328125, "learning_rate": 9.999938554637696e-07, "loss": 0.0006, "reward": 1.5434887409210205, "reward_std": 0.06706379354000092, "rewards/accuracy_reward": 0.27975982427597046, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1970621794462204, "step": 164 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 350.9583435058594, "epoch": 0.0015974286239846646, "grad_norm": 12.185272701845081, "kl": 0.01531982421875, "learning_rate": 9.999937798395769e-07, "loss": 0.0006, "reward": 1.8500436544418335, "reward_std": 0.29933521151542664, "rewards/accuracy_reward": 0.5325795412063599, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.234130859375, "step": 165 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 410.66668701171875, "epoch": 0.0016071100095845718, "grad_norm": 4.197307142339537, "kl": 0.0091552734375, "learning_rate": 9.999937037528542e-07, "loss": 0.0004, "reward": 1.796053171157837, "reward_std": 0.22976845502853394, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1627197265625, "step": 166 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 361.5, "epoch": 0.001616791395184479, "grad_norm": 2.205417949357978, "kl": 0.0166015625, "learning_rate": 9.999936272036026e-07, "loss": 0.0007, "reward": 1.693516492843628, "reward_std": 0.5311737060546875, "rewards/accuracy_reward": 0.41029712557792664, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.216552734375, "step": 167 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 483.7083435058594, "epoch": 0.0016264727807843858, "grad_norm": 1.1655156245008593, "kl": 0.006317138671875, "learning_rate": 9.999935501918217e-07, "loss": 0.0003, "reward": 1.4976413249969482, "reward_std": 0.6083568334579468, "rewards/accuracy_reward": 0.3728121519088745, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1581624448299408, "step": 168 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 331.29168701171875, "epoch": 0.001636154166384293, "grad_norm": 2.6199375431896312, "kl": 0.0201416015625, "learning_rate": 9.999934727175114e-07, "loss": 0.0008, "reward": 1.5592087507247925, "reward_std": 0.3103478252887726, "rewards/accuracy_reward": 0.32032525539398193, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1888834685087204, "step": 169 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 363.25, "epoch": 0.0016458355519841999, "grad_norm": 1.4967960829475107, "kl": 0.0091552734375, "learning_rate": 9.99993394780672e-07, "loss": 0.0004, "reward": 1.3749512434005737, "reward_std": 0.14767608046531677, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.074951171875, "step": 170 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 339.04168701171875, "epoch": 0.001655516937584107, "grad_norm": 6.2251207793034915, "kl": 0.015625, "learning_rate": 9.999933163813034e-07, "loss": 0.0006, "reward": 1.611930251121521, "reward_std": 0.2521185874938965, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1035970076918602, "step": 171 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 298.0, "epoch": 0.0016651983231840142, "grad_norm": 2.752740056880436, "kl": 0.02099609375, "learning_rate": 9.99993237519406e-07, "loss": 0.0008, "reward": 2.120323657989502, "reward_std": 0.2850939631462097, "rewards/accuracy_reward": 0.6944445371627808, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3592122495174408, "step": 172 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 364.625, "epoch": 0.001674879708783921, "grad_norm": 3.605164971859343, "kl": 0.017333984375, "learning_rate": 9.999931581949795e-07, "loss": 0.0007, "reward": 1.7996826171875, "reward_std": 0.32768917083740234, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1330159604549408, "step": 173 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 437.5833435058594, "epoch": 0.0016845610943838282, "grad_norm": 3.391239382980869, "kl": 0.01361083984375, "learning_rate": 9.99993078408024e-07, "loss": 0.0005, "reward": 2.210799217224121, "reward_std": 0.3920226991176605, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2357991635799408, "step": 174 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.41668701171875, "epoch": 0.0016942424799837354, "grad_norm": 2.3760746240714115, "kl": 0.014892578125, "learning_rate": 9.999929981585402e-07, "loss": 0.0006, "reward": 1.8380792140960693, "reward_std": 0.2674846649169922, "rewards/accuracy_reward": 0.4885755777359009, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2411702573299408, "step": 175 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 324.54168701171875, "epoch": 0.0017039238655836423, "grad_norm": 2.6884632293144257, "kl": 0.015625, "learning_rate": 9.999929174465272e-07, "loss": 0.0006, "reward": 2.0608441829681396, "reward_std": 0.11085718870162964, "rewards/accuracy_reward": 0.6744264364242554, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2864176630973816, "step": 176 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 399.8333435058594, "epoch": 0.0017136052511835494, "grad_norm": 2.660485539622687, "kl": 0.0181884765625, "learning_rate": 9.999928362719859e-07, "loss": 0.0007, "reward": 1.5265405178070068, "reward_std": 0.19057194888591766, "rewards/accuracy_reward": 0.37105947732925415, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0971476286649704, "step": 177 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 357.625, "epoch": 0.0017232866367834563, "grad_norm": 2.1513566590061743, "kl": 0.01507568359375, "learning_rate": 9.99992754634916e-07, "loss": 0.0006, "reward": 1.6668980121612549, "reward_std": 0.05712739750742912, "rewards/accuracy_reward": 0.4353306293487549, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1899007260799408, "step": 178 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 364.8333435058594, "epoch": 0.0017329680223833635, "grad_norm": 6.670175026007178, "kl": 0.0179443359375, "learning_rate": 9.999926725353173e-07, "loss": 0.0007, "reward": 1.5607621669769287, "reward_std": 0.05206109210848808, "rewards/accuracy_reward": 0.22665564715862274, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2091064453125, "step": 179 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 342.75, "epoch": 0.0017426494079832706, "grad_norm": 2.563778597089711, "kl": 0.016845703125, "learning_rate": 9.999925899731903e-07, "loss": 0.0007, "reward": 1.4559602737426758, "reward_std": 0.2417035549879074, "rewards/accuracy_reward": 0.3106478750705719, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1119791716337204, "step": 180 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 388.9583435058594, "epoch": 0.0017523307935831775, "grad_norm": 4.045239481543479, "kl": 0.0113525390625, "learning_rate": 9.99992506948535e-07, "loss": 0.0005, "reward": 2.0577800273895264, "reward_std": 0.3081304728984833, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2244466245174408, "step": 181 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.0017620121791830847, "grad_norm": 3.326314888276132, "kl": 0.017822265625, "learning_rate": 9.999924234613514e-07, "loss": 0.0007, "reward": 2.003307580947876, "reward_std": 0.07974971830844879, "rewards/accuracy_reward": 0.5704869031906128, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2661539912223816, "step": 182 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 445.125, "epoch": 0.0017716935647829918, "grad_norm": 4.417167559912088, "kl": 0.0185546875, "learning_rate": 9.999923395116397e-07, "loss": 0.0007, "reward": 1.625801920890808, "reward_std": 0.1570104956626892, "rewards/accuracy_reward": 0.2619834542274475, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2471517026424408, "step": 183 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 390.79168701171875, "epoch": 0.0017813749503828987, "grad_norm": 2.6074980413647753, "kl": 0.0078125, "learning_rate": 9.999922550993999e-07, "loss": 0.0003, "reward": 1.854544997215271, "reward_std": 0.40971165895462036, "rewards/accuracy_reward": 0.6000771522521973, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1628011167049408, "step": 184 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 412.3333435058594, "epoch": 0.0017910563359828059, "grad_norm": 3.448518926959634, "kl": 0.01611328125, "learning_rate": 9.999921702246318e-07, "loss": 0.0006, "reward": 1.9503625631332397, "reward_std": 0.1608058512210846, "rewards/accuracy_reward": 0.5507044196128845, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.274658203125, "step": 185 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 379.29168701171875, "epoch": 0.001800737721582713, "grad_norm": 2.1471300699342497, "kl": 0.0164794921875, "learning_rate": 9.99992084887336e-07, "loss": 0.0007, "reward": 2.1657142639160156, "reward_std": 0.18532449007034302, "rewards/accuracy_reward": 0.7835690975189209, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2821452021598816, "step": 186 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 344.5833435058594, "epoch": 0.00181041910718262, "grad_norm": 2.6721994208724786, "kl": 0.0194091796875, "learning_rate": 9.999919990875123e-07, "loss": 0.0008, "reward": 2.1545016765594482, "reward_std": 0.11184129118919373, "rewards/accuracy_reward": 0.7525238394737244, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2769775390625, "step": 187 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 363.4583435058594, "epoch": 0.001820100492782527, "grad_norm": 1.9090763083168354, "kl": 0.01434326171875, "learning_rate": 9.99991912825161e-07, "loss": 0.0006, "reward": 1.6794514656066895, "reward_std": 0.31311482191085815, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1461181640625, "step": 188 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 495.0833435058594, "epoch": 0.001829781878382434, "grad_norm": 1.6641142714910009, "kl": 0.01361083984375, "learning_rate": 9.999918261002815e-07, "loss": 0.0005, "reward": 1.6834633350372314, "reward_std": 0.5381174087524414, "rewards/accuracy_reward": 0.5098468065261841, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2069498747587204, "step": 189 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.0833435058594, "epoch": 0.0018394632639823411, "grad_norm": 2.5076980148927417, "kl": 0.01470947265625, "learning_rate": 9.999917389128748e-07, "loss": 0.0006, "reward": 1.8555606603622437, "reward_std": 0.504223108291626, "rewards/accuracy_reward": 0.5563094615936279, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2325846403837204, "step": 190 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 381.2083435058594, "epoch": 0.0018491446495822483, "grad_norm": 1.5302051500316345, "kl": 0.01104736328125, "learning_rate": 9.999916512629403e-07, "loss": 0.0004, "reward": 1.1861002445220947, "reward_std": 0.39650148153305054, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.04443359375, "step": 191 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 427.5833435058594, "epoch": 0.0018588260351821552, "grad_norm": 3.873660856945283, "kl": 0.0164794921875, "learning_rate": 9.999915631504785e-07, "loss": 0.0007, "reward": 1.095077633857727, "reward_std": 0.3211175203323364, "rewards/accuracy_reward": 0.12457793951034546, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.0954996794462204, "step": 192 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.0018685074207820623, "grad_norm": 4.000558155321536, "kl": 0.02001953125, "learning_rate": 9.999914745754892e-07, "loss": 0.0008, "reward": 2.0331931114196777, "reward_std": 0.10804878920316696, "rewards/accuracy_reward": 0.5787169933319092, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.296142578125, "step": 193 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 329.5833435058594, "epoch": 0.0018781888063819695, "grad_norm": 2.4280926299617724, "kl": 0.01336669921875, "learning_rate": 9.999913855379726e-07, "loss": 0.0005, "reward": 2.0008625984191895, "reward_std": 0.4139685034751892, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.217529296875, "step": 194 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.875, "epoch": 0.0018878701919818764, "grad_norm": 4.637993114881143, "kl": 0.0113525390625, "learning_rate": 9.999912960379288e-07, "loss": 0.0005, "reward": 1.220686912536621, "reward_std": 0.7122386693954468, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.0790201872587204, "step": 195 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 361.125, "epoch": 0.0018975515775817835, "grad_norm": 2.36136547602421, "kl": 0.0216064453125, "learning_rate": 9.999912060753578e-07, "loss": 0.0009, "reward": 1.5341256856918335, "reward_std": 0.07324312627315521, "rewards/accuracy_reward": 0.3160511255264282, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1930745542049408, "step": 196 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 418.04168701171875, "epoch": 0.0019072329631816905, "grad_norm": 2.317956870299654, "kl": 0.01519775390625, "learning_rate": 9.999911156502598e-07, "loss": 0.0006, "reward": 1.7427897453308105, "reward_std": 0.40462177991867065, "rewards/accuracy_reward": 0.5416667461395264, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1927897185087204, "step": 197 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 442.75, "epoch": 0.0019169143487815976, "grad_norm": 2.8730715363022514, "kl": 0.01904296875, "learning_rate": 9.999910247626348e-07, "loss": 0.0008, "reward": 1.7158937454223633, "reward_std": 0.27309122681617737, "rewards/accuracy_reward": 0.36234569549560547, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2618815302848816, "step": 198 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 392.2083435058594, "epoch": 0.0019265957343815047, "grad_norm": 2.2542892659401974, "kl": 0.015625, "learning_rate": 9.99990933412483e-07, "loss": 0.0006, "reward": 1.6575114727020264, "reward_std": 0.42604726552963257, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1575113981962204, "step": 199 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 274.125, "epoch": 0.0019362771199814117, "grad_norm": 3.782213207980282, "kl": 0.0206298828125, "learning_rate": 9.999908415998042e-07, "loss": 0.0008, "reward": 1.9326496124267578, "reward_std": 0.06579481065273285, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159830778837204, "step": 200 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 301.5, "epoch": 0.0019459585055813188, "grad_norm": 1.7864280907827665, "kl": 0.0184326171875, "learning_rate": 9.999907493245987e-07, "loss": 0.0007, "reward": 1.6557129621505737, "reward_std": 0.24687784910202026, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1473795622587204, "step": 201 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 378.9583435058594, "epoch": 0.001955639891181226, "grad_norm": 6.18233367014029, "kl": 0.02001953125, "learning_rate": 9.999906565868667e-07, "loss": 0.0008, "reward": 1.4333456754684448, "reward_std": 0.4107607305049896, "rewards/accuracy_reward": 0.2596965730190277, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.156982421875, "step": 202 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 424.7083435058594, "epoch": 0.001965321276781133, "grad_norm": 3.3458861190090197, "kl": 0.0150146484375, "learning_rate": 9.999905633866082e-07, "loss": 0.0006, "reward": 1.153808832168579, "reward_std": 0.26727530360221863, "rewards/accuracy_reward": 0.09629331529140472, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.09918212890625, "step": 203 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 440.8333435058594, "epoch": 0.0019750026623810398, "grad_norm": 2.0488077077825495, "kl": 0.01251220703125, "learning_rate": 9.999904697238231e-07, "loss": 0.0005, "reward": 1.4473751783370972, "reward_std": 0.062425337731838226, "rewards/accuracy_reward": 0.18517625331878662, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1705322265625, "step": 204 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 478.625, "epoch": 0.001984684047980947, "grad_norm": 4.6561669833235, "kl": 0.0125732421875, "learning_rate": 9.999903755985114e-07, "loss": 0.0005, "reward": 1.3377902507781982, "reward_std": 0.32262927293777466, "rewards/accuracy_reward": 0.24089893698692322, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.105224609375, "step": 205 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 352.25, "epoch": 0.001994365433580854, "grad_norm": 1.3296464853138699, "kl": 0.02099609375, "learning_rate": 9.999902810106736e-07, "loss": 0.0008, "reward": 1.2197916507720947, "reward_std": 0.23717954754829407, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0364583358168602, "step": 206 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 380.75, "epoch": 0.002004046819180761, "grad_norm": 5.793528024351231, "kl": 0.02099609375, "learning_rate": 9.999901859603099e-07, "loss": 0.0008, "reward": 2.0355594158172607, "reward_std": 0.0949191153049469, "rewards/accuracy_reward": 0.6395101547241211, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2210489958524704, "step": 207 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 625.5416870117188, "epoch": 0.0020137282047806683, "grad_norm": 1.234379809740205, "kl": 0.00506591796875, "learning_rate": 9.999900904474197e-07, "loss": 0.0002, "reward": 1.1109131574630737, "reward_std": 0.384111225605011, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.625, "rewards/semantic_reward": 0.1025797575712204, "step": 208 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 505.04168701171875, "epoch": 0.0020234095903805755, "grad_norm": 1.5220315049295838, "kl": 0.009765625, "learning_rate": 9.999899944720037e-07, "loss": 0.0004, "reward": 1.564681053161621, "reward_std": 0.5329180955886841, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1730143278837204, "step": 209 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 322.29168701171875, "epoch": 0.002033090975980482, "grad_norm": 2.987256743195971, "kl": 0.01708984375, "learning_rate": 9.999898980340615e-07, "loss": 0.0007, "reward": 2.212538242340088, "reward_std": 0.271171510219574, "rewards/accuracy_reward": 0.8055555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3236491084098816, "step": 210 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 358.625, "epoch": 0.0020427723615803893, "grad_norm": 3.230360030375895, "kl": 0.021240234375, "learning_rate": 9.999898011335937e-07, "loss": 0.0008, "reward": 1.8825461864471436, "reward_std": 0.33591052889823914, "rewards/accuracy_reward": 0.5453147292137146, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2872314453125, "step": 211 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 348.54168701171875, "epoch": 0.0020524537471802965, "grad_norm": 3.7160106567109596, "kl": 0.018798828125, "learning_rate": 9.999897037706e-07, "loss": 0.0008, "reward": 2.191492795944214, "reward_std": 0.13149568438529968, "rewards/accuracy_reward": 0.7606415152549744, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3308512568473816, "step": 212 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.4583435058594, "epoch": 0.0020621351327802036, "grad_norm": 2.5143153107891165, "kl": 0.014404296875, "learning_rate": 9.999896059450807e-07, "loss": 0.0006, "reward": 1.5817525386810303, "reward_std": 0.35635849833488464, "rewards/accuracy_reward": 0.3621397614479065, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1446126401424408, "step": 213 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.125, "epoch": 0.0020718165183801107, "grad_norm": 4.962855361138032, "kl": 0.020263671875, "learning_rate": 9.999895076570357e-07, "loss": 0.0008, "reward": 1.744337558746338, "reward_std": 0.20164921879768372, "rewards/accuracy_reward": 0.3717544972896576, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2725830078125, "step": 214 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 356.79168701171875, "epoch": 0.0020814979039800174, "grad_norm": 3.29991833288398, "kl": 0.0177001953125, "learning_rate": 9.999894089064653e-07, "loss": 0.0007, "reward": 1.405376672744751, "reward_std": 0.29791080951690674, "rewards/accuracy_reward": 0.3055555820465088, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.108154296875, "step": 215 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 365.875, "epoch": 0.0020911792895799246, "grad_norm": 3.050715934538997, "kl": 0.022216796875, "learning_rate": 9.999893096933696e-07, "loss": 0.0009, "reward": 1.3270608186721802, "reward_std": 0.17474034428596497, "rewards/accuracy_reward": 0.14014668762683868, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1285807341337204, "step": 216 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 372.8333435058594, "epoch": 0.0021008606751798317, "grad_norm": 4.777778648942247, "kl": 0.01470947265625, "learning_rate": 9.999892100177483e-07, "loss": 0.0006, "reward": 2.0925967693328857, "reward_std": 0.20817232131958008, "rewards/accuracy_reward": 0.7240418791770935, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2685546875, "step": 217 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 335.0833435058594, "epoch": 0.002110542060779739, "grad_norm": 2.9486784741036454, "kl": 0.0162353515625, "learning_rate": 9.999891098796022e-07, "loss": 0.0007, "reward": 1.9883391857147217, "reward_std": 0.38391435146331787, "rewards/accuracy_reward": 0.6240407228469849, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2809651792049408, "step": 218 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 492.41668701171875, "epoch": 0.002120223446379646, "grad_norm": 2.18105916618102, "kl": 0.0125732421875, "learning_rate": 9.999890092789307e-07, "loss": 0.0005, "reward": 1.210752248764038, "reward_std": 0.42978161573410034, "rewards/accuracy_reward": 0.20503529906272888, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.08905029296875, "step": 219 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 337.79168701171875, "epoch": 0.0021299048319795527, "grad_norm": 5.261030933020367, "kl": 0.0196533203125, "learning_rate": 9.999889082157342e-07, "loss": 0.0008, "reward": 2.084885358810425, "reward_std": 0.1152058094739914, "rewards/accuracy_reward": 0.6434627175331116, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3330892026424408, "step": 220 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 331.8333435058594, "epoch": 0.00213958621757946, "grad_norm": 8.409085734664462, "kl": 0.017822265625, "learning_rate": 9.999888066900127e-07, "loss": 0.0007, "reward": 1.656111717224121, "reward_std": 0.436248242855072, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1394449919462204, "step": 221 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 356.2083435058594, "epoch": 0.002149267603179367, "grad_norm": 5.779675499736269, "kl": 0.0267333984375, "learning_rate": 9.999887047017666e-07, "loss": 0.0011, "reward": 1.7906568050384521, "reward_std": 0.24806325137615204, "rewards/accuracy_reward": 0.3618318736553192, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2954915463924408, "step": 222 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 333.7083435058594, "epoch": 0.002158948988779274, "grad_norm": 2.7170426162906574, "kl": 0.0213623046875, "learning_rate": 9.999886022509955e-07, "loss": 0.0009, "reward": 1.7232182025909424, "reward_std": 0.08359505981206894, "rewards/accuracy_reward": 0.442399263381958, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2141520231962204, "step": 223 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 363.375, "epoch": 0.0021686303743791813, "grad_norm": 4.136923127498118, "kl": 0.01226806640625, "learning_rate": 9.999884993376998e-07, "loss": 0.0005, "reward": 1.5541925430297852, "reward_std": 0.48785483837127686, "rewards/accuracy_reward": 0.37958332896232605, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1412760466337204, "step": 224 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 584.3333740234375, "epoch": 0.0021783117599790884, "grad_norm": 1.6816468464382004, "kl": 0.009033203125, "learning_rate": 9.999883959618795e-07, "loss": 0.0004, "reward": 1.014794945716858, "reward_std": 0.7890567779541016, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.6666666865348816, "rewards/semantic_reward": 0.0814615935087204, "step": 225 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 523.1666870117188, "epoch": 0.002187993145578995, "grad_norm": 2.8018306725382303, "kl": 0.007659912109375, "learning_rate": 9.99988292123535e-07, "loss": 0.0003, "reward": 1.2203688621520996, "reward_std": 0.5966906547546387, "rewards/accuracy_reward": 0.1529860943555832, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1507161557674408, "step": 226 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 351.75, "epoch": 0.0021976745311789022, "grad_norm": 5.414655987897365, "kl": 0.0218505859375, "learning_rate": 9.99988187822666e-07, "loss": 0.0009, "reward": 1.7554233074188232, "reward_std": 0.35324519872665405, "rewards/accuracy_reward": 0.5015169382095337, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1705729216337204, "step": 227 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 403.9583435058594, "epoch": 0.0022073559167788094, "grad_norm": 4.916845053362021, "kl": 0.01708984375, "learning_rate": 9.999880830592726e-07, "loss": 0.0007, "reward": 1.2426135540008545, "reward_std": 0.03621337562799454, "rewards/accuracy_reward": 0.1059354767203331, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0950113981962204, "step": 228 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 397.9583435058594, "epoch": 0.0022170373023787165, "grad_norm": 3.1947971677789844, "kl": 0.02197265625, "learning_rate": 9.999879778333551e-07, "loss": 0.0009, "reward": 1.99929940700531, "reward_std": 0.10703043639659882, "rewards/accuracy_reward": 0.5744701623916626, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2831624448299408, "step": 229 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 380.04168701171875, "epoch": 0.0022267186879786236, "grad_norm": 2.1523792668403066, "kl": 0.01416015625, "learning_rate": 9.999878721449136e-07, "loss": 0.0006, "reward": 1.5004231929779053, "reward_std": 0.009890386834740639, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1004231795668602, "step": 230 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 418.9583435058594, "epoch": 0.0022364000735785304, "grad_norm": 2.265937955687173, "kl": 0.02001953125, "learning_rate": 9.99987765993948e-07, "loss": 0.0008, "reward": 2.208782911300659, "reward_std": 0.07163824141025543, "rewards/accuracy_reward": 0.7794696688652039, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2543131709098816, "step": 231 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 360.25, "epoch": 0.0022460814591784375, "grad_norm": 2.1468332709957236, "kl": 0.018310546875, "learning_rate": 9.999876593804586e-07, "loss": 0.0007, "reward": 1.7858715057373047, "reward_std": 0.0434081070125103, "rewards/accuracy_reward": 0.4248689115047455, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2360026091337204, "step": 232 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 429.375, "epoch": 0.0022557628447783446, "grad_norm": 1.6257922387419372, "kl": 0.0252685546875, "learning_rate": 9.999875523044454e-07, "loss": 0.001, "reward": 0.9790419340133667, "reward_std": 0.3652966022491455, "rewards/accuracy_reward": 0.055555559694767, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.0318196639418602, "step": 233 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 423.375, "epoch": 0.0022654442303782518, "grad_norm": 1.8677124074662572, "kl": 0.0137939453125, "learning_rate": 9.999874447659085e-07, "loss": 0.0006, "reward": 1.1816072463989258, "reward_std": 0.19080612063407898, "rewards/accuracy_reward": 0.12258213758468628, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.0840250700712204, "step": 234 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 432.8333435058594, "epoch": 0.002275125615978159, "grad_norm": 2.4234058267641014, "kl": 0.018798828125, "learning_rate": 9.99987336764848e-07, "loss": 0.0008, "reward": 1.6683902740478516, "reward_std": 0.31267523765563965, "rewards/accuracy_reward": 0.3513738512992859, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1920166015625, "step": 235 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 395.0833435058594, "epoch": 0.002284807001578066, "grad_norm": 3.3019399738785946, "kl": 0.0162353515625, "learning_rate": 9.999872283012642e-07, "loss": 0.0006, "reward": 1.470079779624939, "reward_std": 0.5668826699256897, "rewards/accuracy_reward": 0.287519633769989, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1242268905043602, "step": 236 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 355.5, "epoch": 0.0022944883871779728, "grad_norm": 2.335853727897475, "kl": 0.0177001953125, "learning_rate": 9.999871193751567e-07, "loss": 0.0007, "reward": 1.7131924629211426, "reward_std": 0.0572732612490654, "rewards/accuracy_reward": 0.4419034421443939, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1962890625, "step": 237 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 387.9583435058594, "epoch": 0.00230416977277788, "grad_norm": 2.4655609915613477, "kl": 0.01409912109375, "learning_rate": 9.99987009986526e-07, "loss": 0.0006, "reward": 2.3722739219665527, "reward_std": 0.30729421973228455, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3306071162223816, "step": 238 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 392.9583435058594, "epoch": 0.002313851158377787, "grad_norm": 3.4243649334478543, "kl": 0.0181884765625, "learning_rate": 9.999869001353725e-07, "loss": 0.0007, "reward": 1.6833910942077637, "reward_std": 0.48101162910461426, "rewards/accuracy_reward": 0.39163488149642944, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1750895231962204, "step": 239 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 351.41668701171875, "epoch": 0.002323532543977694, "grad_norm": 5.717059711202625, "kl": 0.019775390625, "learning_rate": 9.999867898216956e-07, "loss": 0.0008, "reward": 1.3100448846817017, "reward_std": 0.26206421852111816, "rewards/accuracy_reward": 0.16608327627182007, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.102294921875, "step": 240 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 432.875, "epoch": 0.0023332139295776013, "grad_norm": 1.924598461046694, "kl": 0.019287109375, "learning_rate": 9.999866790454956e-07, "loss": 0.0008, "reward": 1.7236685752868652, "reward_std": 0.282551109790802, "rewards/accuracy_reward": 0.48547691106796265, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2298583984375, "step": 241 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 346.0833435058594, "epoch": 0.002342895315177508, "grad_norm": 3.5640896816808763, "kl": 0.0230712890625, "learning_rate": 9.99986567806773e-07, "loss": 0.0009, "reward": 1.6903722286224365, "reward_std": 0.05724545568227768, "rewards/accuracy_reward": 0.3624832332134247, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2112223356962204, "step": 242 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 339.0, "epoch": 0.002352576700777415, "grad_norm": 2.2816907613905526, "kl": 0.021728515625, "learning_rate": 9.999864561055276e-07, "loss": 0.0009, "reward": 1.72123384475708, "reward_std": 0.06234545260667801, "rewards/accuracy_reward": 0.438177227973938, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1747233122587204, "step": 243 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 412.04168701171875, "epoch": 0.0023622580863773223, "grad_norm": 2.041559343572815, "kl": 0.012939453125, "learning_rate": 9.999863439417595e-07, "loss": 0.0005, "reward": 1.6479085683822632, "reward_std": 0.5207564830780029, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1479085385799408, "step": 244 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 439.875, "epoch": 0.0023719394719772294, "grad_norm": 1.67251089739338, "kl": 0.012939453125, "learning_rate": 9.999862313154686e-07, "loss": 0.0005, "reward": 1.152099609375, "reward_std": 0.5418949127197266, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.0687662810087204, "step": 245 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 412.91668701171875, "epoch": 0.0023816208575771366, "grad_norm": 1.510652722862887, "kl": 0.0172119140625, "learning_rate": 9.999861182266556e-07, "loss": 0.0007, "reward": 1.3388020992279053, "reward_std": 0.3287776708602905, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.10546875, "step": 246 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 439.25, "epoch": 0.0023913022431770437, "grad_norm": 3.754190049887374, "kl": 0.0244140625, "learning_rate": 9.9998600467532e-07, "loss": 0.001, "reward": 1.7981688976287842, "reward_std": 0.32005542516708374, "rewards/accuracy_reward": 0.41130349040031433, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2451985776424408, "step": 247 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 457.5833435058594, "epoch": 0.0024009836287769504, "grad_norm": 2.0508474523774227, "kl": 0.019287109375, "learning_rate": 9.999858906614622e-07, "loss": 0.0008, "reward": 1.6880367994308472, "reward_std": 0.3513493537902832, "rewards/accuracy_reward": 0.5027828216552734, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2185872495174408, "step": 248 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.5, "epoch": 0.0024106650143768575, "grad_norm": 2.431946091533449, "kl": 0.024169921875, "learning_rate": 9.999857761850822e-07, "loss": 0.001, "reward": 1.8364055156707764, "reward_std": 0.11773550510406494, "rewards/accuracy_reward": 0.45759686827659607, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.27880859375, "step": 249 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 390.9583435058594, "epoch": 0.0024203463999767647, "grad_norm": 1.9053557283787355, "kl": 0.0216064453125, "learning_rate": 9.999856612461803e-07, "loss": 0.0009, "reward": 1.7089046239852905, "reward_std": 0.07585903257131577, "rewards/accuracy_reward": 0.41656243801116943, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1923421323299408, "step": 250 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 413.75, "epoch": 0.002430027785576672, "grad_norm": 4.865920543742524, "kl": 0.0213623046875, "learning_rate": 9.999855458447565e-07, "loss": 0.0009, "reward": 1.5769256353378296, "reward_std": 0.08045879751443863, "rewards/accuracy_reward": 0.3431691825389862, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.14208984375, "step": 251 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 425.54168701171875, "epoch": 0.002439709171176579, "grad_norm": 1.406232458758439, "kl": 0.01470947265625, "learning_rate": 9.999854299808107e-07, "loss": 0.0006, "reward": 1.8166016340255737, "reward_std": 0.4126740097999573, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1832682341337204, "step": 252 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 448.0, "epoch": 0.0024493905567764857, "grad_norm": 8.515557675731662, "kl": 0.0194091796875, "learning_rate": 9.999853136543433e-07, "loss": 0.0008, "reward": 1.7296843528747559, "reward_std": 0.07152315229177475, "rewards/accuracy_reward": 0.42576998472213745, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2205810546875, "step": 253 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 398.9583435058594, "epoch": 0.002459071942376393, "grad_norm": 3.4059115103269075, "kl": 0.0174560546875, "learning_rate": 9.999851968653543e-07, "loss": 0.0007, "reward": 1.8823726177215576, "reward_std": 0.13794925808906555, "rewards/accuracy_reward": 0.5048171877861023, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.285888671875, "step": 254 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 449.375, "epoch": 0.0024687533279763, "grad_norm": 4.135819968021069, "kl": 0.0174560546875, "learning_rate": 9.999850796138439e-07, "loss": 0.0007, "reward": 1.6599152088165283, "reward_std": 0.21725714206695557, "rewards/accuracy_reward": 0.39499807357788086, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2482503354549408, "step": 255 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 382.875, "epoch": 0.002478434713576207, "grad_norm": 2.1338546463313137, "kl": 0.0224609375, "learning_rate": 9.999849618998117e-07, "loss": 0.0009, "reward": 2.0446290969848633, "reward_std": 0.27271005511283875, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2029622495174408, "step": 256 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 389.8333435058594, "epoch": 0.0024881160991761142, "grad_norm": 25.18092930188073, "kl": 0.0244140625, "learning_rate": 9.999848437232585e-07, "loss": 0.001, "reward": 1.9829047918319702, "reward_std": 0.14820650219917297, "rewards/accuracy_reward": 0.542304277420044, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2822672724723816, "step": 257 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 427.3333435058594, "epoch": 0.0024977974847760214, "grad_norm": 2.6761295151504387, "kl": 0.017822265625, "learning_rate": 9.999847250841842e-07, "loss": 0.0007, "reward": 1.633927345275879, "reward_std": 0.09897884726524353, "rewards/accuracy_reward": 0.31931138038635254, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2312825620174408, "step": 258 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 520.0833740234375, "epoch": 0.002507478870375928, "grad_norm": 3.0250863460315496, "kl": 0.0181884765625, "learning_rate": 9.999846059825886e-07, "loss": 0.0007, "reward": 1.2014234066009521, "reward_std": 0.23686233162879944, "rewards/accuracy_reward": 0.2954173684120178, "rewards/format_reward": 0.625, "rewards/semantic_reward": 0.1976725310087204, "step": 259 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 461.125, "epoch": 0.002517160255975835, "grad_norm": 2.810393749695637, "kl": 0.02197265625, "learning_rate": 9.999844864184724e-07, "loss": 0.0009, "reward": 1.6016921997070312, "reward_std": 0.3898561894893646, "rewards/accuracy_reward": 0.28539174795150757, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2079671323299408, "step": 260 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 500.29168701171875, "epoch": 0.0025268416415757423, "grad_norm": 10.192306309298516, "kl": 0.01165771484375, "learning_rate": 9.99984366391835e-07, "loss": 0.0005, "reward": 1.3802164793014526, "reward_std": 0.4625740945339203, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.1385498046875, "step": 261 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 395.2083435058594, "epoch": 0.0025365230271756495, "grad_norm": 3.0094320110686628, "kl": 0.024169921875, "learning_rate": 9.99984245902677e-07, "loss": 0.001, "reward": 1.7443418502807617, "reward_std": 0.07072693109512329, "rewards/accuracy_reward": 0.4887102246284485, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1472981870174408, "step": 262 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 351.75, "epoch": 0.0025462044127755566, "grad_norm": 3.206888764040175, "kl": 0.0179443359375, "learning_rate": 9.999841249509984e-07, "loss": 0.0007, "reward": 1.9476299285888672, "reward_std": 0.14702093601226807, "rewards/accuracy_reward": 0.5613341927528381, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2862955927848816, "step": 263 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 524.2083740234375, "epoch": 0.0025558857983754633, "grad_norm": 1.8103073386841153, "kl": 0.0167236328125, "learning_rate": 9.999840035367991e-07, "loss": 0.0007, "reward": 1.1163115501403809, "reward_std": 0.3275279104709625, "rewards/accuracy_reward": 0.1815621703863144, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.1180826872587204, "step": 264 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 324.66668701171875, "epoch": 0.0025655671839753705, "grad_norm": 3.0960612876268483, "kl": 0.024658203125, "learning_rate": 9.999838816600797e-07, "loss": 0.001, "reward": 1.2257487773895264, "reward_std": 0.3877462148666382, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.05908203125, "step": 265 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 505.0833435058594, "epoch": 0.0025752485695752776, "grad_norm": 2.087456120880835, "kl": 0.018310546875, "learning_rate": 9.9998375932084e-07, "loss": 0.0007, "reward": 0.9440668821334839, "reward_std": 0.454139769077301, "rewards/accuracy_reward": 0.11685331910848618, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.1022135466337204, "step": 266 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 346.375, "epoch": 0.0025849299551751847, "grad_norm": 2.041600369268579, "kl": 0.0203857421875, "learning_rate": 9.999836365190799e-07, "loss": 0.0008, "reward": 1.2551724910736084, "reward_std": 0.22296357154846191, "rewards/accuracy_reward": 0.11460846662521362, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.14056396484375, "step": 267 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 364.375, "epoch": 0.002594611340775092, "grad_norm": 1.827383332551719, "kl": 0.01318359375, "learning_rate": 9.999835132548e-07, "loss": 0.0005, "reward": 1.7684895992279053, "reward_std": 0.3914550244808197, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.16015625, "step": 268 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 302.0833435058594, "epoch": 0.0026042927263749986, "grad_norm": 2.9100434099814145, "kl": 0.021484375, "learning_rate": 9.999833895279998e-07, "loss": 0.0009, "reward": 1.8225864171981812, "reward_std": 0.08089639246463776, "rewards/accuracy_reward": 0.46913576126098633, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2867838740348816, "step": 269 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 454.29168701171875, "epoch": 0.0026139741119749057, "grad_norm": 1.852128456459216, "kl": 0.0157470703125, "learning_rate": 9.999832653386801e-07, "loss": 0.0006, "reward": 1.5621256828308105, "reward_std": 0.2163095474243164, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.095458984375, "step": 270 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 445.91668701171875, "epoch": 0.002623655497574813, "grad_norm": 4.156596344714937, "kl": 0.0146484375, "learning_rate": 9.999831406868406e-07, "loss": 0.0006, "reward": 1.9928630590438843, "reward_std": 0.5040713548660278, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2261962890625, "step": 271 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 356.9583435058594, "epoch": 0.00263333688317472, "grad_norm": 2.012348677014082, "kl": 0.02685546875, "learning_rate": 9.999830155724813e-07, "loss": 0.0011, "reward": 1.7038758993148804, "reward_std": 0.07999375462532043, "rewards/accuracy_reward": 0.4442485570907593, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1596272885799408, "step": 272 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 387.5833435058594, "epoch": 0.002643018268774627, "grad_norm": 19.60231669045621, "kl": 0.0262451171875, "learning_rate": 9.999828899956028e-07, "loss": 0.0011, "reward": 1.7507829666137695, "reward_std": 0.06919696927070618, "rewards/accuracy_reward": 0.32206377387046814, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2703857421875, "step": 273 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 407.8333435058594, "epoch": 0.0026526996543745343, "grad_norm": 2.366883369993983, "kl": 0.0185546875, "learning_rate": 9.99982763956205e-07, "loss": 0.0007, "reward": 1.3044270277023315, "reward_std": 0.18961402773857117, "rewards/accuracy_reward": 0.14370107650756836, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1190592497587204, "step": 274 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 319.7083435058594, "epoch": 0.002662381039974441, "grad_norm": 2.106120264186146, "kl": 0.0281982421875, "learning_rate": 9.999826374542877e-07, "loss": 0.0011, "reward": 1.6813607215881348, "reward_std": 0.06496359407901764, "rewards/accuracy_reward": 0.47402024269104004, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1490071713924408, "step": 275 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 338.2083435058594, "epoch": 0.002672062425574348, "grad_norm": 2.3022086413274274, "kl": 0.0260009765625, "learning_rate": 9.999825104898513e-07, "loss": 0.001, "reward": 1.960161566734314, "reward_std": 0.3269464373588562, "rewards/accuracy_reward": 0.5837129354476929, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2764485776424408, "step": 276 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 295.625, "epoch": 0.0026817438111742553, "grad_norm": 2.5416005008279625, "kl": 0.017333984375, "learning_rate": 9.999823830628957e-07, "loss": 0.0007, "reward": 1.9496582746505737, "reward_std": 0.5458934307098389, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2329915463924408, "step": 277 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 485.5, "epoch": 0.0026914251967741624, "grad_norm": 1.906232856265172, "kl": 0.0164794921875, "learning_rate": 9.999822551734216e-07, "loss": 0.0007, "reward": 1.974411964416504, "reward_std": 0.4319896399974823, "rewards/accuracy_reward": 0.6664937734603882, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2829183042049408, "step": 278 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 343.7083435058594, "epoch": 0.0027011065823740695, "grad_norm": 3.7256983969916133, "kl": 0.021240234375, "learning_rate": 9.999821268214283e-07, "loss": 0.0008, "reward": 1.552075743675232, "reward_std": 0.27415555715560913, "rewards/accuracy_reward": 0.29847875237464905, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1869303435087204, "step": 279 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 439.66668701171875, "epoch": 0.0027107879679739762, "grad_norm": 2.3187204199705107, "kl": 0.0177001953125, "learning_rate": 9.999819980069165e-07, "loss": 0.0007, "reward": 2.0631566047668457, "reward_std": 0.5767027735710144, "rewards/accuracy_reward": 0.7222222685813904, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2909342646598816, "step": 280 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 379.04168701171875, "epoch": 0.0027204693535738834, "grad_norm": 5.768596348712711, "kl": 0.01324462890625, "learning_rate": 9.999818687298864e-07, "loss": 0.0005, "reward": 1.686881422996521, "reward_std": 0.05897974967956543, "rewards/accuracy_reward": 0.40994447469711304, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2352701872587204, "step": 281 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 478.625, "epoch": 0.0027301507391737905, "grad_norm": 1.763925497004737, "kl": 0.00970458984375, "learning_rate": 9.999817389903376e-07, "loss": 0.0004, "reward": 1.7161214351654053, "reward_std": 0.49548110365867615, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.1911214292049408, "step": 282 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 369.16668701171875, "epoch": 0.0027398321247736977, "grad_norm": 8.046202633387045, "kl": 0.0252685546875, "learning_rate": 9.999816087882706e-07, "loss": 0.001, "reward": 1.4548619985580444, "reward_std": 0.0882500633597374, "rewards/accuracy_reward": 0.21215364336967468, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1510416716337204, "step": 283 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 370.375, "epoch": 0.002749513510373605, "grad_norm": 2.6740319511651904, "kl": 0.0213623046875, "learning_rate": 9.999814781236856e-07, "loss": 0.0009, "reward": 1.6655082702636719, "reward_std": 0.4154313802719116, "rewards/accuracy_reward": 0.4012666344642639, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2225748747587204, "step": 284 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 353.41668701171875, "epoch": 0.002759194895973512, "grad_norm": 2.9950835046777957, "kl": 0.02392578125, "learning_rate": 9.999813469965822e-07, "loss": 0.001, "reward": 1.4968161582946777, "reward_std": 0.18469500541687012, "rewards/accuracy_reward": 0.23379527032375336, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1796875, "step": 285 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 347.0833435058594, "epoch": 0.0027688762815734186, "grad_norm": 3.1092278190432947, "kl": 0.0208740234375, "learning_rate": 9.999812154069611e-07, "loss": 0.0008, "reward": 1.8548648357391357, "reward_std": 0.2938160300254822, "rewards/accuracy_reward": 0.5648989677429199, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2149658203125, "step": 286 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 349.0, "epoch": 0.0027785576671733258, "grad_norm": 1.1271568573940207, "kl": 0.0218505859375, "learning_rate": 9.99981083354822e-07, "loss": 0.0009, "reward": 1.4197429418563843, "reward_std": 0.030260492116212845, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0697428435087204, "step": 287 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.0833435058594, "epoch": 0.002788239052773233, "grad_norm": 2.863130828592714, "kl": 0.0167236328125, "learning_rate": 9.999809508401653e-07, "loss": 0.0007, "reward": 1.5897786617279053, "reward_std": 0.48022183775901794, "rewards/accuracy_reward": 0.34355461597442627, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1712239682674408, "step": 288 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 346.875, "epoch": 0.00279792043837314, "grad_norm": 3.1696700293036386, "kl": 0.0234375, "learning_rate": 9.99980817862991e-07, "loss": 0.0009, "reward": 1.7373361587524414, "reward_std": 0.534966230392456, "rewards/accuracy_reward": 0.4419097602367401, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.228759765625, "step": 289 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 367.0833435058594, "epoch": 0.002807601823973047, "grad_norm": 1.831170613252724, "kl": 0.0206298828125, "learning_rate": 9.999806844232994e-07, "loss": 0.0008, "reward": 1.684876799583435, "reward_std": 0.06511694192886353, "rewards/accuracy_reward": 0.40413129329681396, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1807454526424408, "step": 290 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 425.25, "epoch": 0.002817283209572954, "grad_norm": 3.791005799722047, "kl": 0.022705078125, "learning_rate": 9.999805505210901e-07, "loss": 0.0009, "reward": 1.412612795829773, "reward_std": 0.3675524890422821, "rewards/accuracy_reward": 0.224062979221344, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1802164763212204, "step": 291 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.3333435058594, "epoch": 0.002826964595172861, "grad_norm": 2.7717587704067475, "kl": 0.022705078125, "learning_rate": 9.99980416156364e-07, "loss": 0.0009, "reward": 2.401904344558716, "reward_std": 0.07752712070941925, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2852376401424408, "step": 292 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 429.25, "epoch": 0.002836645980772768, "grad_norm": 2.4838590772811306, "kl": 0.0205078125, "learning_rate": 9.999802813291207e-07, "loss": 0.0008, "reward": 1.936262607574463, "reward_std": 0.12612442672252655, "rewards/accuracy_reward": 0.48765015602111816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.30694580078125, "step": 293 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 341.7083435058594, "epoch": 0.0028463273663726753, "grad_norm": 1.416621444893759, "kl": 0.023193359375, "learning_rate": 9.999801460393603e-07, "loss": 0.0009, "reward": 1.26341712474823, "reward_std": 0.01862356625497341, "rewards/accuracy_reward": 0.09567618370056152, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.10107421875, "step": 294 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 390.5833435058594, "epoch": 0.0028560087519725825, "grad_norm": 2.306140678240578, "kl": 0.023681640625, "learning_rate": 9.999800102870833e-07, "loss": 0.0009, "reward": 1.7134878635406494, "reward_std": 0.2092793881893158, "rewards/accuracy_reward": 0.3997589349746704, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1970621794462204, "step": 295 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 386.0, "epoch": 0.0028656901375724896, "grad_norm": 3.011828231410303, "kl": 0.02099609375, "learning_rate": 9.999798740722894e-07, "loss": 0.0008, "reward": 1.1678476333618164, "reward_std": 0.2794337272644043, "rewards/accuracy_reward": 0.07870370894670486, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.080810546875, "step": 296 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 354.9583435058594, "epoch": 0.0028753715231723963, "grad_norm": 1.271178518193494, "kl": 0.0137939453125, "learning_rate": 9.99979737394979e-07, "loss": 0.0006, "reward": 1.5958126783370972, "reward_std": 0.2870422899723053, "rewards/accuracy_reward": 0.4031531512737274, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1426595151424408, "step": 297 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 380.625, "epoch": 0.0028850529087723034, "grad_norm": 1.4168846988628254, "kl": 0.0205078125, "learning_rate": 9.99979600255152e-07, "loss": 0.0008, "reward": 1.31974196434021, "reward_std": 0.04138616472482681, "rewards/accuracy_reward": 0.15637928247451782, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1133626326918602, "step": 298 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 430.91668701171875, "epoch": 0.0028947342943722106, "grad_norm": 3.7627861886995255, "kl": 0.017333984375, "learning_rate": 9.99979462652809e-07, "loss": 0.0007, "reward": 1.1785423755645752, "reward_std": 0.15766803920269012, "rewards/accuracy_reward": 0.1218203604221344, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0567220076918602, "step": 299 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 400.66668701171875, "epoch": 0.0029044156799721177, "grad_norm": 3.00483373013361, "kl": 0.02099609375, "learning_rate": 9.999793245879496e-07, "loss": 0.0008, "reward": 1.498632788658142, "reward_std": 0.007365695666521788, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0986328125, "step": 300 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 424.375, "epoch": 0.002914097065572025, "grad_norm": 1.994607679909454, "kl": 0.01239013671875, "learning_rate": 9.99979186060574e-07, "loss": 0.0005, "reward": 1.5059945583343506, "reward_std": 0.6047961711883545, "rewards/accuracy_reward": 0.3554166555404663, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1339111328125, "step": 301 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 342.0, "epoch": 0.0029237784511719316, "grad_norm": 1.9776333442257363, "kl": 0.01531982421875, "learning_rate": 9.999790470706826e-07, "loss": 0.0006, "reward": 1.9079101085662842, "reward_std": 0.452055424451828, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1912434995174408, "step": 302 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 336.9583435058594, "epoch": 0.0029334598367718387, "grad_norm": 2.4711036674687614, "kl": 0.0208740234375, "learning_rate": 9.999789076182754e-07, "loss": 0.0008, "reward": 1.7012814283370972, "reward_std": 0.09144680202007294, "rewards/accuracy_reward": 0.4177852272987366, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.20849609375, "step": 303 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 470.875, "epoch": 0.002943141222371746, "grad_norm": 2.1712930574249456, "kl": 0.01129150390625, "learning_rate": 9.999787677033523e-07, "loss": 0.0005, "reward": 1.4239648580551147, "reward_std": 0.5561316013336182, "rewards/accuracy_reward": 0.2896549701690674, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1676432341337204, "step": 304 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 356.4583435058594, "epoch": 0.002952822607971653, "grad_norm": 8.937514294688283, "kl": 0.02490234375, "learning_rate": 9.999786273259142e-07, "loss": 0.001, "reward": 1.6687864065170288, "reward_std": 0.19218182563781738, "rewards/accuracy_reward": 0.28705617785453796, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2733968198299408, "step": 305 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 397.125, "epoch": 0.00296250399357156, "grad_norm": 4.061327517410507, "kl": 0.0225830078125, "learning_rate": 9.999784864859602e-07, "loss": 0.0009, "reward": 1.5880262851715088, "reward_std": 0.05905860662460327, "rewards/accuracy_reward": 0.33453506231307983, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1618245542049408, "step": 306 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 381.7083435058594, "epoch": 0.002972185379171467, "grad_norm": 2.120560657877671, "kl": 0.015869140625, "learning_rate": 9.99978345183491e-07, "loss": 0.0006, "reward": 1.9010452032089233, "reward_std": 0.3039354681968689, "rewards/accuracy_reward": 0.6001582145690918, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2008870542049408, "step": 307 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 340.25, "epoch": 0.002981866764771374, "grad_norm": 2.3048395412758933, "kl": 0.02001953125, "learning_rate": 9.999782034185067e-07, "loss": 0.0008, "reward": 1.2847846746444702, "reward_std": 0.11947010457515717, "rewards/accuracy_reward": 0.1944444626569748, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0570068359375, "step": 308 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 329.4583435058594, "epoch": 0.002991548150371281, "grad_norm": 2.8954913006556966, "kl": 0.0247802734375, "learning_rate": 9.999780611910073e-07, "loss": 0.001, "reward": 1.7323522567749023, "reward_std": 0.2874675691127777, "rewards/accuracy_reward": 0.45194053649902344, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2137451171875, "step": 309 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 443.9583435058594, "epoch": 0.0030012295359711882, "grad_norm": 2.414929590251311, "kl": 0.0213623046875, "learning_rate": 9.99977918500993e-07, "loss": 0.0009, "reward": 1.7607152462005615, "reward_std": 0.0568908266723156, "rewards/accuracy_reward": 0.4539444148540497, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1901041716337204, "step": 310 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 344.625, "epoch": 0.0030109109215710954, "grad_norm": 3.7403487237289745, "kl": 0.01904296875, "learning_rate": 9.99977775348464e-07, "loss": 0.0008, "reward": 1.9008138179779053, "reward_std": 0.20943889021873474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2508138120174408, "step": 311 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 360.16668701171875, "epoch": 0.0030205923071710025, "grad_norm": 7.5641840103835785, "kl": 0.020751953125, "learning_rate": 9.999776317334202e-07, "loss": 0.0008, "reward": 1.6054033041000366, "reward_std": 0.09678220003843307, "rewards/accuracy_reward": 0.2726883292198181, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2410481870174408, "step": 312 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 324.29168701171875, "epoch": 0.003030273692770909, "grad_norm": 4.279714113710152, "kl": 0.025390625, "learning_rate": 9.999774876558622e-07, "loss": 0.001, "reward": 2.290313243865967, "reward_std": 0.12692716717720032, "rewards/accuracy_reward": 0.9113253355026245, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.295654296875, "step": 313 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 348.875, "epoch": 0.0030399550783708163, "grad_norm": 2.97033383780842, "kl": 0.0250244140625, "learning_rate": 9.999773431157894e-07, "loss": 0.001, "reward": 1.7548298835754395, "reward_std": 0.053673021495342255, "rewards/accuracy_reward": 0.4603799283504486, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1861165463924408, "step": 314 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 419.04168701171875, "epoch": 0.0030496364639707235, "grad_norm": 3.53693726648901, "kl": 0.0191650390625, "learning_rate": 9.999771981132028e-07, "loss": 0.0008, "reward": 2.0499722957611084, "reward_std": 0.45901018381118774, "rewards/accuracy_reward": 0.6343877911567688, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2655843198299408, "step": 315 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 362.66668701171875, "epoch": 0.0030593178495706306, "grad_norm": 2.5753789584511715, "kl": 0.0242919921875, "learning_rate": 9.99977052648102e-07, "loss": 0.001, "reward": 1.6523059606552124, "reward_std": 0.0744955763220787, "rewards/accuracy_reward": 0.41007766127586365, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1588948667049408, "step": 316 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 313.875, "epoch": 0.0030689992351705378, "grad_norm": 3.519958707193799, "kl": 0.0208740234375, "learning_rate": 9.99976906720487e-07, "loss": 0.0008, "reward": 2.16270112991333, "reward_std": 0.3711512088775635, "rewards/accuracy_reward": 0.7222222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.365478515625, "step": 317 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 388.16668701171875, "epoch": 0.0030786806207704445, "grad_norm": 3.468963376588137, "kl": 0.018798828125, "learning_rate": 9.999767603303582e-07, "loss": 0.0008, "reward": 2.4020590782165527, "reward_std": 0.22200052440166473, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3187255859375, "step": 318 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 402.0, "epoch": 0.0030883620063703516, "grad_norm": 2.1613508882045167, "kl": 0.0213623046875, "learning_rate": 9.99976613477716e-07, "loss": 0.0009, "reward": 1.5397651195526123, "reward_std": 0.47591251134872437, "rewards/accuracy_reward": 0.4027777910232544, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1203206405043602, "step": 319 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 391.04168701171875, "epoch": 0.0030980433919702587, "grad_norm": 6.301397371297693, "kl": 0.01806640625, "learning_rate": 9.9997646616256e-07, "loss": 0.0007, "reward": 1.7054976224899292, "reward_std": 0.24811944365501404, "rewards/accuracy_reward": 0.3746951222419739, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2391357421875, "step": 320 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 361.91668701171875, "epoch": 0.003107724777570166, "grad_norm": 2.8108106649797366, "kl": 0.021484375, "learning_rate": 9.999763183848905e-07, "loss": 0.0009, "reward": 1.5757254362106323, "reward_std": 0.24240794777870178, "rewards/accuracy_reward": 0.361101359128952, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1562906950712204, "step": 321 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.04168701171875, "epoch": 0.003117406163170073, "grad_norm": 3.017208411470664, "kl": 0.027587890625, "learning_rate": 9.999761701447079e-07, "loss": 0.0011, "reward": 1.802478551864624, "reward_std": 0.0996929258108139, "rewards/accuracy_reward": 0.4145472049713135, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2545979917049408, "step": 322 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.41668701171875, "epoch": 0.00312708754876998, "grad_norm": 12.089417927355296, "kl": 0.02001953125, "learning_rate": 9.999760214420122e-07, "loss": 0.0008, "reward": 1.433861494064331, "reward_std": 0.48689982295036316, "rewards/accuracy_reward": 0.28439849615097046, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1161295622587204, "step": 323 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 343.25, "epoch": 0.003136768934369887, "grad_norm": 2.121814611620916, "kl": 0.01708984375, "learning_rate": 9.999758722768033e-07, "loss": 0.0007, "reward": 1.7247337102890015, "reward_std": 0.26665258407592773, "rewards/accuracy_reward": 0.43151259422302246, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2182210385799408, "step": 324 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 361.16668701171875, "epoch": 0.003146450319969794, "grad_norm": 3.3506523079873674, "kl": 0.0262451171875, "learning_rate": 9.999757226490815e-07, "loss": 0.0011, "reward": 2.076690196990967, "reward_std": 0.12166941165924072, "rewards/accuracy_reward": 0.6998955607414246, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2351277768611908, "step": 325 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 352.875, "epoch": 0.003156131705569701, "grad_norm": 4.536952921218125, "kl": 0.0247802734375, "learning_rate": 9.99975572558847e-07, "loss": 0.001, "reward": 1.6287546157836914, "reward_std": 0.5097863674163818, "rewards/accuracy_reward": 0.4319082200527191, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1218465194106102, "step": 326 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 401.04168701171875, "epoch": 0.0031658130911696083, "grad_norm": 1.9802386901317204, "kl": 0.0185546875, "learning_rate": 9.999754220061e-07, "loss": 0.0007, "reward": 1.3848145008087158, "reward_std": 0.4190603196620941, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1014811247587204, "step": 327 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 428.16668701171875, "epoch": 0.0031754944767695154, "grad_norm": 7.6343811873304945, "kl": 0.0185546875, "learning_rate": 9.999752709908405e-07, "loss": 0.0007, "reward": 1.300328016281128, "reward_std": 0.1802329123020172, "rewards/accuracy_reward": 0.12523029744625092, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1417643278837204, "step": 328 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 469.75, "epoch": 0.003185175862369422, "grad_norm": 2.275213880983039, "kl": 0.0155029296875, "learning_rate": 9.999751195130686e-07, "loss": 0.0006, "reward": 2.200003147125244, "reward_std": 0.2735140919685364, "rewards/accuracy_reward": 0.7533885836601257, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3216145932674408, "step": 329 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 375.875, "epoch": 0.0031948572479693293, "grad_norm": 2.4762300708952942, "kl": 0.024658203125, "learning_rate": 9.999749675727847e-07, "loss": 0.001, "reward": 1.7820897102355957, "reward_std": 0.297054260969162, "rewards/accuracy_reward": 0.44440245628356934, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2126871794462204, "step": 330 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 355.4583435058594, "epoch": 0.0032045386335692364, "grad_norm": 7.740407544189631, "kl": 0.01251220703125, "learning_rate": 9.999748151699885e-07, "loss": 0.0005, "reward": 1.6669420003890991, "reward_std": 0.6712875366210938, "rewards/accuracy_reward": 0.5568181872367859, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2101237028837204, "step": 331 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 366.3333435058594, "epoch": 0.0032142200191691435, "grad_norm": 4.332192830462161, "kl": 0.0269775390625, "learning_rate": 9.999746623046807e-07, "loss": 0.0011, "reward": 1.4817039966583252, "reward_std": 0.0739499181509018, "rewards/accuracy_reward": 0.23495909571647644, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1634114682674408, "step": 332 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 373.2083435058594, "epoch": 0.0032239014047690507, "grad_norm": 2.399127176393822, "kl": 0.0185546875, "learning_rate": 9.999745089768609e-07, "loss": 0.0007, "reward": 2.2339224815368652, "reward_std": 0.11890288442373276, "rewards/accuracy_reward": 0.783190131187439, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3340657651424408, "step": 333 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 458.3333435058594, "epoch": 0.003233582790368958, "grad_norm": 7.820212623587669, "kl": 0.0113525390625, "learning_rate": 9.999743551865296e-07, "loss": 0.0005, "reward": 1.922623872756958, "reward_std": 0.16951832175254822, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1892903745174408, "step": 334 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 547.875, "epoch": 0.0032432641759688645, "grad_norm": 1.5824008441740538, "kl": 0.00653076171875, "learning_rate": 9.999742009336867e-07, "loss": 0.0003, "reward": 1.1231282949447632, "reward_std": 0.6537867784500122, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.0814615935087204, "step": 335 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 363.625, "epoch": 0.0032529455615687717, "grad_norm": 2.929540431476071, "kl": 0.026611328125, "learning_rate": 9.999740462183325e-07, "loss": 0.0011, "reward": 1.5876274108886719, "reward_std": 0.055618833750486374, "rewards/accuracy_reward": 0.29782432317733765, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1731363981962204, "step": 336 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 326.25, "epoch": 0.003262626947168679, "grad_norm": 1.9644053793061778, "kl": 0.0211181640625, "learning_rate": 9.999738910404674e-07, "loss": 0.0008, "reward": 1.456705093383789, "reward_std": 0.28387895226478577, "rewards/accuracy_reward": 0.2574700713157654, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.132568359375, "step": 337 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 353.9583435058594, "epoch": 0.003272308332768586, "grad_norm": 10.737405878276638, "kl": 0.0208740234375, "learning_rate": 9.99973735400091e-07, "loss": 0.0008, "reward": 1.8357315063476562, "reward_std": 0.052390776574611664, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.18017578125, "step": 338 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.7083435058594, "epoch": 0.003281989718368493, "grad_norm": 4.380110047735161, "kl": 0.0201416015625, "learning_rate": 9.999735792972037e-07, "loss": 0.0008, "reward": 1.7171306610107422, "reward_std": 0.36004990339279175, "rewards/accuracy_reward": 0.4306885600090027, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2281087338924408, "step": 339 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 347.91668701171875, "epoch": 0.0032916711039683998, "grad_norm": 7.225741161766752, "kl": 0.0296630859375, "learning_rate": 9.999734227318057e-07, "loss": 0.0012, "reward": 1.8181235790252686, "reward_std": 0.5405480861663818, "rewards/accuracy_reward": 0.5130901336669922, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2217000424861908, "step": 340 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 448.625, "epoch": 0.003301352489568307, "grad_norm": 4.059985477924979, "kl": 0.0162353515625, "learning_rate": 9.999732657038971e-07, "loss": 0.0007, "reward": 1.5344347953796387, "reward_std": 0.2406301498413086, "rewards/accuracy_reward": 0.33201780915260315, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1857503354549408, "step": 341 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 317.91668701171875, "epoch": 0.003311033875168214, "grad_norm": 2.3987823657664538, "kl": 0.029296875, "learning_rate": 9.99973108213478e-07, "loss": 0.0012, "reward": 1.2714513540267944, "reward_std": 0.1944926679134369, "rewards/accuracy_reward": 0.12803497910499573, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0850830078125, "step": 342 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 391.29168701171875, "epoch": 0.003320715260768121, "grad_norm": 2.585113738573326, "kl": 0.0247802734375, "learning_rate": 9.999729502605487e-07, "loss": 0.001, "reward": 1.8278920650482178, "reward_std": 0.10452531278133392, "rewards/accuracy_reward": 0.3609647750854492, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.30859375, "step": 343 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 383.54168701171875, "epoch": 0.0033303966463680283, "grad_norm": 4.925050168362717, "kl": 0.01806640625, "learning_rate": 9.999727918451092e-07, "loss": 0.0007, "reward": 1.4026098251342773, "reward_std": 0.30363211035728455, "rewards/accuracy_reward": 0.19095627963542938, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1533203125, "step": 344 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 374.375, "epoch": 0.003340078031967935, "grad_norm": 2.857076686003514, "kl": 0.0225830078125, "learning_rate": 9.999726329671594e-07, "loss": 0.0009, "reward": 1.8011153936386108, "reward_std": 0.268196165561676, "rewards/accuracy_reward": 0.4450523257255554, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2977294921875, "step": 345 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 354.16668701171875, "epoch": 0.003349759417567842, "grad_norm": 4.028579270564312, "kl": 0.026611328125, "learning_rate": 9.999724736267e-07, "loss": 0.0011, "reward": 1.9251774549484253, "reward_std": 0.3055190443992615, "rewards/accuracy_reward": 0.5624006986618042, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2211100310087204, "step": 346 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 324.54168701171875, "epoch": 0.0033594408031677493, "grad_norm": 1.93483953358868, "kl": 0.02392578125, "learning_rate": 9.999723138237306e-07, "loss": 0.001, "reward": 1.7319635152816772, "reward_std": 0.16509166359901428, "rewards/accuracy_reward": 0.513888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1514078825712204, "step": 347 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 363.3333435058594, "epoch": 0.0033691221887676565, "grad_norm": 6.071142993890619, "kl": 0.022705078125, "learning_rate": 9.99972153558252e-07, "loss": 0.0009, "reward": 1.8005130290985107, "reward_std": 0.29882851243019104, "rewards/accuracy_reward": 0.4239748418331146, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2432047575712204, "step": 348 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 458.375, "epoch": 0.0033788035743675636, "grad_norm": 8.21436471848379, "kl": 0.019287109375, "learning_rate": 9.999719928302636e-07, "loss": 0.0008, "reward": 1.9465411901474, "reward_std": 0.29198652505874634, "rewards/accuracy_reward": 0.6200274229049683, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2181803435087204, "step": 349 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 383.625, "epoch": 0.0033884849599674707, "grad_norm": 2.689627194600216, "kl": 0.0224609375, "learning_rate": 9.999718316397663e-07, "loss": 0.0009, "reward": 1.5635061264038086, "reward_std": 0.06818480789661407, "rewards/accuracy_reward": 0.27843940258026123, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1934000700712204, "step": 350 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 424.875, "epoch": 0.0033981663455673774, "grad_norm": 1.494581564836965, "kl": 0.00946044921875, "learning_rate": 9.999716699867595e-07, "loss": 0.0004, "reward": 1.8405790328979492, "reward_std": 0.43255528807640076, "rewards/accuracy_reward": 0.506773829460144, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2338053435087204, "step": 351 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 332.25, "epoch": 0.0034078477311672846, "grad_norm": 3.6667058496415414, "kl": 0.0220947265625, "learning_rate": 9.99971507871244e-07, "loss": 0.0009, "reward": 1.8419814109802246, "reward_std": 0.06708623468875885, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.21142578125, "step": 352 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.625, "epoch": 0.0034175291167671917, "grad_norm": 2.6751161305026407, "kl": 0.025146484375, "learning_rate": 9.999713452932194e-07, "loss": 0.001, "reward": 2.4007163047790527, "reward_std": 0.09284576773643494, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3173828125, "step": 353 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 416.2083435058594, "epoch": 0.003427210502367099, "grad_norm": 2.070461754993761, "kl": 0.0191650390625, "learning_rate": 9.999711822526863e-07, "loss": 0.0008, "reward": 1.5797345638275146, "reward_std": 0.18061679601669312, "rewards/accuracy_reward": 0.38702210783958435, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1343790739774704, "step": 354 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 420.0, "epoch": 0.003436891887967006, "grad_norm": 2.5568907187934227, "kl": 0.0228271484375, "learning_rate": 9.999710187496443e-07, "loss": 0.0009, "reward": 2.080029010772705, "reward_std": 0.20522406697273254, "rewards/accuracy_reward": 0.6497309803962708, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2636311948299408, "step": 355 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.0034465732735669127, "grad_norm": 2.5877435152302937, "kl": 0.023681640625, "learning_rate": 9.999708547840943e-07, "loss": 0.0009, "reward": 1.6896770000457764, "reward_std": 0.24534295499324799, "rewards/accuracy_reward": 0.4076131582260132, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19873046875, "step": 356 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 345.54168701171875, "epoch": 0.00345625465916682, "grad_norm": 1.8068292980161362, "kl": 0.0206298828125, "learning_rate": 9.99970690356036e-07, "loss": 0.0008, "reward": 1.9131592512130737, "reward_std": 0.08438451588153839, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1964925229549408, "step": 357 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 344.29168701171875, "epoch": 0.003465936044766727, "grad_norm": 6.784520500640923, "kl": 0.0294189453125, "learning_rate": 9.999705254654695e-07, "loss": 0.0012, "reward": 1.8171768188476562, "reward_std": 0.07099848985671997, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.16162109375, "step": 358 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 504.4583435058594, "epoch": 0.003475617430366634, "grad_norm": 0.8505995127773585, "kl": 0.00909423828125, "learning_rate": 9.999703601123951e-07, "loss": 0.0004, "reward": 1.388688087463379, "reward_std": 0.6189925074577332, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1136881560087204, "step": 359 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 357.8333435058594, "epoch": 0.0034852988159665413, "grad_norm": 1.4454019084419674, "kl": 0.024658203125, "learning_rate": 9.99970194296813e-07, "loss": 0.001, "reward": 1.07355797290802, "reward_std": 0.23290953040122986, "rewards/accuracy_reward": 0.06234375387430191, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0362141951918602, "step": 360 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 349.75, "epoch": 0.0034949802015664484, "grad_norm": 1.7195403466526247, "kl": 0.027099609375, "learning_rate": 9.999700280187233e-07, "loss": 0.0011, "reward": 1.7455819845199585, "reward_std": 0.07531312108039856, "rewards/accuracy_reward": 0.4929390549659729, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1359761655330658, "step": 361 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 419.4583435058594, "epoch": 0.003504661587166355, "grad_norm": 2.082088666118443, "kl": 0.0205078125, "learning_rate": 9.99969861278126e-07, "loss": 0.0008, "reward": 1.8609193563461304, "reward_std": 0.2933592200279236, "rewards/accuracy_reward": 0.49531054496765137, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2489420622587204, "step": 362 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 386.91668701171875, "epoch": 0.0035143429727662622, "grad_norm": 6.441423699747415, "kl": 0.031005859375, "learning_rate": 9.999696940750214e-07, "loss": 0.0012, "reward": 1.4363603591918945, "reward_std": 0.3875970244407654, "rewards/accuracy_reward": 0.3835691213607788, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1111246794462204, "step": 363 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 392.04168701171875, "epoch": 0.0035240243583661694, "grad_norm": 3.074866802707499, "kl": 0.0247802734375, "learning_rate": 9.9996952640941e-07, "loss": 0.001, "reward": 1.8293747901916504, "reward_std": 0.15960785746574402, "rewards/accuracy_reward": 0.45435842871665955, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.308349609375, "step": 364 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 383.04168701171875, "epoch": 0.0035337057439660765, "grad_norm": 1.2807442253280825, "kl": 0.01611328125, "learning_rate": 9.999693582812911e-07, "loss": 0.0006, "reward": 1.4072265625, "reward_std": 0.6375261545181274, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1155598983168602, "step": 365 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 376.66668701171875, "epoch": 0.0035433871295659836, "grad_norm": 2.0480588239633652, "kl": 0.02978515625, "learning_rate": 9.999691896906655e-07, "loss": 0.0012, "reward": 1.4050673246383667, "reward_std": 0.03987356275320053, "rewards/accuracy_reward": 0.222222238779068, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.12451171875, "step": 366 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 351.5833435058594, "epoch": 0.0035530685151658904, "grad_norm": 2.1318383348294145, "kl": 0.0201416015625, "learning_rate": 9.999690206375333e-07, "loss": 0.0008, "reward": 1.7645509243011475, "reward_std": 0.3082908093929291, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2062174528837204, "step": 367 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.5833435058594, "epoch": 0.0035627499007657975, "grad_norm": 3.8600640752985287, "kl": 0.0291748046875, "learning_rate": 9.999688511218945e-07, "loss": 0.0012, "reward": 1.5646551847457886, "reward_std": 0.2192382514476776, "rewards/accuracy_reward": 0.2618720531463623, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.227783203125, "step": 368 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 307.0833435058594, "epoch": 0.0035724312863657046, "grad_norm": 2.721865771026195, "kl": 0.02685546875, "learning_rate": 9.999686811437493e-07, "loss": 0.0011, "reward": 1.30003023147583, "reward_std": 0.3257433772087097, "rewards/accuracy_reward": 0.17892025411128998, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0961100310087204, "step": 369 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 409.54168701171875, "epoch": 0.0035821126719656118, "grad_norm": 3.6147981341817257, "kl": 0.0252685546875, "learning_rate": 9.99968510703098e-07, "loss": 0.001, "reward": 2.1383464336395264, "reward_std": 0.7572588920593262, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2633463740348816, "step": 370 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 449.04168701171875, "epoch": 0.003591794057565519, "grad_norm": 5.275598961856222, "kl": 0.0206298828125, "learning_rate": 9.999683397999407e-07, "loss": 0.0008, "reward": 1.447786569595337, "reward_std": 0.44534099102020264, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.1477864682674408, "step": 371 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 384.79168701171875, "epoch": 0.003601475443165426, "grad_norm": 2.1402431554222425, "kl": 0.0263671875, "learning_rate": 9.99968168434277e-07, "loss": 0.0011, "reward": 1.8111546039581299, "reward_std": 0.06791117787361145, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1555989682674408, "step": 372 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 403.79168701171875, "epoch": 0.0036111568287653328, "grad_norm": 2.0052884269222115, "kl": 0.027587890625, "learning_rate": 9.999679966061079e-07, "loss": 0.0011, "reward": 1.708653211593628, "reward_std": 0.21929650008678436, "rewards/accuracy_reward": 0.442507266998291, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1744791716337204, "step": 373 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 366.875, "epoch": 0.00362083821436524, "grad_norm": 2.900808191924154, "kl": 0.03662109375, "learning_rate": 9.999678243154333e-07, "loss": 0.0015, "reward": 1.4232909679412842, "reward_std": 0.17166230082511902, "rewards/accuracy_reward": 0.2611977159976959, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.103759765625, "step": 374 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 459.875, "epoch": 0.003630519599965147, "grad_norm": 6.696445918612738, "kl": 0.0272216796875, "learning_rate": 9.99967651562253e-07, "loss": 0.0011, "reward": 1.8067760467529297, "reward_std": 0.34554728865623474, "rewards/accuracy_reward": 0.4570934772491455, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2580159604549408, "step": 375 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 413.04168701171875, "epoch": 0.003640200985565054, "grad_norm": 5.8132921959249835, "kl": 0.0206298828125, "learning_rate": 9.999674783465677e-07, "loss": 0.0008, "reward": 2.073673725128174, "reward_std": 0.27958253026008606, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1820068359375, "step": 376 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 417.66668701171875, "epoch": 0.0036498823711649613, "grad_norm": 1.7075635687142987, "kl": 0.016357421875, "learning_rate": 9.999673046683771e-07, "loss": 0.0007, "reward": 1.5861735343933105, "reward_std": 0.2571130394935608, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1195068359375, "step": 377 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 332.125, "epoch": 0.003659563756764868, "grad_norm": 2.8405355958896195, "kl": 0.02490234375, "learning_rate": 9.999671305276818e-07, "loss": 0.001, "reward": 1.295317530632019, "reward_std": 0.037570469081401825, "rewards/accuracy_reward": 0.17464694380760193, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.10400390625, "step": 378 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 359.29168701171875, "epoch": 0.003669245142364775, "grad_norm": 3.213863794285896, "kl": 0.0211181640625, "learning_rate": 9.999669559244815e-07, "loss": 0.0008, "reward": 1.853856086730957, "reward_std": 0.23546430468559265, "rewards/accuracy_reward": 0.48895522952079773, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2732340693473816, "step": 379 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 378.66668701171875, "epoch": 0.0036789265279646823, "grad_norm": 2.046817469214113, "kl": 0.0286865234375, "learning_rate": 9.999667808587766e-07, "loss": 0.0011, "reward": 1.5350250005722046, "reward_std": 0.09058289974927902, "rewards/accuracy_reward": 0.21419966220855713, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2208251953125, "step": 380 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 411.41668701171875, "epoch": 0.0036886079135645894, "grad_norm": 2.8970941483101904, "kl": 0.0264892578125, "learning_rate": 9.999666053305671e-07, "loss": 0.0011, "reward": 1.5085370540618896, "reward_std": 0.07203066349029541, "rewards/accuracy_reward": 0.21940939128398895, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1891276091337204, "step": 381 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 383.3333435058594, "epoch": 0.0036982892991644966, "grad_norm": 2.7447469523746766, "kl": 0.0299072265625, "learning_rate": 9.999664293398536e-07, "loss": 0.0012, "reward": 1.6668434143066406, "reward_std": 0.5635987520217896, "rewards/accuracy_reward": 0.42909106612205505, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1960856169462204, "step": 382 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 388.2083435058594, "epoch": 0.0037079706847644033, "grad_norm": 3.790092643906364, "kl": 0.0242919921875, "learning_rate": 9.999662528866357e-07, "loss": 0.001, "reward": 2.3083200454711914, "reward_std": 0.13081982731819153, "rewards/accuracy_reward": 0.845982551574707, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3123372495174408, "step": 383 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 370.5833435058594, "epoch": 0.0037176520703643104, "grad_norm": 2.826170838113695, "kl": 0.031982421875, "learning_rate": 9.999660759709139e-07, "loss": 0.0013, "reward": 1.9866387844085693, "reward_std": 0.4502750635147095, "rewards/accuracy_reward": 0.6805555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.21441650390625, "step": 384 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 366.25, "epoch": 0.0037273334559642175, "grad_norm": 2.4731298681038094, "kl": 0.02294921875, "learning_rate": 9.999658985926884e-07, "loss": 0.0009, "reward": 1.57213294506073, "reward_std": 0.07964334636926651, "rewards/accuracy_reward": 0.3374485671520233, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1763509213924408, "step": 385 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 379.9583435058594, "epoch": 0.0037370148415641247, "grad_norm": 1.6028972834305235, "kl": 0.0223388671875, "learning_rate": 9.999657207519591e-07, "loss": 0.0009, "reward": 1.982926607131958, "reward_std": 0.059930458664894104, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2079264372587204, "step": 386 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 336.0, "epoch": 0.003746696227164032, "grad_norm": 3.1236541941664484, "kl": 0.028076171875, "learning_rate": 9.999655424487264e-07, "loss": 0.0011, "reward": 1.6712783575057983, "reward_std": 0.2421051263809204, "rewards/accuracy_reward": 0.43879956007003784, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1658121794462204, "step": 387 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 357.75, "epoch": 0.003756377612763939, "grad_norm": 2.7691024995799207, "kl": 0.0306396484375, "learning_rate": 9.999653636829902e-07, "loss": 0.0012, "reward": 2.2274200916290283, "reward_std": 0.12784752249717712, "rewards/accuracy_reward": 0.8123645782470703, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3067220151424408, "step": 388 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 342.9583435058594, "epoch": 0.0037660589983638457, "grad_norm": 9.02524560887718, "kl": 0.03271484375, "learning_rate": 9.999651844547509e-07, "loss": 0.0013, "reward": 2.029083251953125, "reward_std": 0.11401475965976715, "rewards/accuracy_reward": 0.5869852304458618, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3170979917049408, "step": 389 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 355.625, "epoch": 0.003775740383963753, "grad_norm": 4.586866050400236, "kl": 0.031005859375, "learning_rate": 9.99965004764009e-07, "loss": 0.0012, "reward": 1.8658045530319214, "reward_std": 0.2699533700942993, "rewards/accuracy_reward": 0.4924728274345398, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2483317106962204, "step": 390 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.00378542176956366, "grad_norm": 3.218916382967083, "kl": 0.025634765625, "learning_rate": 9.999648246107637e-07, "loss": 0.001, "reward": 1.7298250198364258, "reward_std": 0.31476151943206787, "rewards/accuracy_reward": 0.4685457646846771, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.186279296875, "step": 391 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 317.875, "epoch": 0.003795103155163567, "grad_norm": 1.074684986084725, "kl": 0.0211181640625, "learning_rate": 9.99964643995016e-07, "loss": 0.0008, "reward": 1.2416341304779053, "reward_std": 0.25927025079727173, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0499674491584301, "step": 392 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.875, "epoch": 0.0038047845407634742, "grad_norm": 2.7269525323389794, "kl": 0.029052734375, "learning_rate": 9.99964462916766e-07, "loss": 0.0012, "reward": 1.7606886625289917, "reward_std": 0.5453044176101685, "rewards/accuracy_reward": 0.4626987874507904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2313232421875, "step": 393 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 398.79168701171875, "epoch": 0.003814465926363381, "grad_norm": 2.4175723032178627, "kl": 0.01806640625, "learning_rate": 9.999642813760135e-07, "loss": 0.0007, "reward": 1.2375149726867676, "reward_std": 0.038795895874500275, "rewards/accuracy_reward": 0.1316392570734024, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.064208984375, "step": 394 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 488.29168701171875, "epoch": 0.003824147311963288, "grad_norm": 2.4945783928952383, "kl": 0.0194091796875, "learning_rate": 9.99964099372759e-07, "loss": 0.0008, "reward": 1.9445648193359375, "reward_std": 0.2936674654483795, "rewards/accuracy_reward": 0.7167245745658875, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2028401792049408, "step": 395 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.5833435058594, "epoch": 0.003833828697563195, "grad_norm": 2.304660889499772, "kl": 0.027099609375, "learning_rate": 9.999639169070022e-07, "loss": 0.0011, "reward": 1.8350234031677246, "reward_std": 0.43337902426719666, "rewards/accuracy_reward": 0.4903620183467865, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.236328125, "step": 396 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 417.0833435058594, "epoch": 0.0038435100831631023, "grad_norm": 3.1723778473352966, "kl": 0.026611328125, "learning_rate": 9.999637339787436e-07, "loss": 0.0011, "reward": 2.451383590698242, "reward_std": 0.10205042362213135, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2930501401424408, "step": 397 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 367.125, "epoch": 0.0038531914687630095, "grad_norm": 3.2952085613955817, "kl": 0.03515625, "learning_rate": 9.999635505879837e-07, "loss": 0.0014, "reward": 1.863816499710083, "reward_std": 0.20633554458618164, "rewards/accuracy_reward": 0.524216890335083, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2562662959098816, "step": 398 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 373.3333435058594, "epoch": 0.0038628728543629166, "grad_norm": 5.496915601271636, "kl": 0.01953125, "learning_rate": 9.99963366734722e-07, "loss": 0.0008, "reward": 2.4072346687316895, "reward_std": 0.12314381450414658, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3322347104549408, "step": 399 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 452.75, "epoch": 0.0038725542399628233, "grad_norm": 1.7904435738508542, "kl": 0.017578125, "learning_rate": 9.999631824189592e-07, "loss": 0.0007, "reward": 1.2980000972747803, "reward_std": 0.399502158164978, "rewards/accuracy_reward": 0.18139033019542694, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1082763671875, "step": 400 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.16668701171875, "epoch": 0.0038822356255627305, "grad_norm": 2.552984255802578, "kl": 0.0230712890625, "learning_rate": 9.999629976406952e-07, "loss": 0.0009, "reward": 1.8314471244812012, "reward_std": 0.42804479598999023, "rewards/accuracy_reward": 0.5275164842605591, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2372640073299408, "step": 401 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 403.75, "epoch": 0.0038919170111626376, "grad_norm": 2.5208631506060852, "kl": 0.0216064453125, "learning_rate": 9.999628123999302e-07, "loss": 0.0009, "reward": 1.4123620986938477, "reward_std": 0.056053634732961655, "rewards/accuracy_reward": 0.16292773187160492, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1494344174861908, "step": 402 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 448.5, "epoch": 0.0039015983967625447, "grad_norm": 2.5194724749388744, "kl": 0.0274658203125, "learning_rate": 9.999626266966644e-07, "loss": 0.0011, "reward": 1.8553390502929688, "reward_std": 0.3527756929397583, "rewards/accuracy_reward": 0.598214328289032, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1571248471736908, "step": 403 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 407.16668701171875, "epoch": 0.003911279782362452, "grad_norm": 0.9968472111310888, "kl": 0.0145263671875, "learning_rate": 9.999624405308979e-07, "loss": 0.0006, "reward": 1.3898274898529053, "reward_std": 0.1880854368209839, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1148274764418602, "step": 404 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 381.4583435058594, "epoch": 0.003920961167962359, "grad_norm": 7.53798892290498, "kl": 0.0390625, "learning_rate": 9.99962253902631e-07, "loss": 0.0016, "reward": 1.4955228567123413, "reward_std": 0.24662815034389496, "rewards/accuracy_reward": 0.2313871830701828, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1974690854549408, "step": 405 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 384.7083435058594, "epoch": 0.003930642553562266, "grad_norm": 3.647216644159523, "kl": 0.0206298828125, "learning_rate": 9.99962066811864e-07, "loss": 0.0008, "reward": 1.7662675380706787, "reward_std": 0.5146679878234863, "rewards/accuracy_reward": 0.4354081153869629, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2141927182674408, "step": 406 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 381.79168701171875, "epoch": 0.003940323939162173, "grad_norm": 2.747054102648782, "kl": 0.0281982421875, "learning_rate": 9.999618792585967e-07, "loss": 0.0011, "reward": 1.454168677330017, "reward_std": 0.057765305042266846, "rewards/accuracy_reward": 0.12192580103874207, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.215576171875, "step": 407 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 400.79168701171875, "epoch": 0.0039500053247620796, "grad_norm": 5.303837836170887, "kl": 0.01953125, "learning_rate": 9.999616912428295e-07, "loss": 0.0008, "reward": 2.0971033573150635, "reward_std": 0.4788835048675537, "rewards/accuracy_reward": 0.6933760643005371, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.3203938901424408, "step": 408 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.25, "epoch": 0.003959686710361987, "grad_norm": 10.54067278566434, "kl": 0.0289306640625, "learning_rate": 9.999615027645627e-07, "loss": 0.0012, "reward": 1.5715965032577515, "reward_std": 0.1778700351715088, "rewards/accuracy_reward": 0.26902490854263306, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775716245174408, "step": 409 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 359.16668701171875, "epoch": 0.003969368095961894, "grad_norm": 2.0122122168584915, "kl": 0.03076171875, "learning_rate": 9.999613138237962e-07, "loss": 0.0012, "reward": 1.892561912536621, "reward_std": 0.1784651130437851, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1675618588924408, "step": 410 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 482.3333435058594, "epoch": 0.003979049481561801, "grad_norm": 2.0344423802423623, "kl": 0.0240478515625, "learning_rate": 9.999611244205302e-07, "loss": 0.001, "reward": 1.1984648704528809, "reward_std": 0.3445093631744385, "rewards/accuracy_reward": 0.1202666386961937, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1115315780043602, "step": 411 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 339.125, "epoch": 0.003988730867161708, "grad_norm": 3.489746531065204, "kl": 0.034912109375, "learning_rate": 9.99960934554765e-07, "loss": 0.0014, "reward": 1.624995231628418, "reward_std": 0.07165496796369553, "rewards/accuracy_reward": 0.397993266582489, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1353352963924408, "step": 412 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 415.0, "epoch": 0.003998412252761615, "grad_norm": 1.8970518701174894, "kl": 0.0206298828125, "learning_rate": 9.999607442265007e-07, "loss": 0.0008, "reward": 1.6140154600143433, "reward_std": 0.062106162309646606, "rewards/accuracy_reward": 0.3641985356807709, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1664835661649704, "step": 413 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 381.91668701171875, "epoch": 0.004008093638361522, "grad_norm": 2.2408949036677916, "kl": 0.0294189453125, "learning_rate": 9.999605534357378e-07, "loss": 0.0012, "reward": 1.5658700466156006, "reward_std": 0.024283315986394882, "rewards/accuracy_reward": 0.20540444552898407, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2271321713924408, "step": 414 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 469.2083435058594, "epoch": 0.0040177750239614295, "grad_norm": 7.665853925525778, "kl": 0.0211181640625, "learning_rate": 9.999603621824758e-07, "loss": 0.0008, "reward": 2.2981691360473633, "reward_std": 0.35011792182922363, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2898356318473816, "step": 415 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 403.66668701171875, "epoch": 0.004027456409561337, "grad_norm": 3.6149093414622273, "kl": 0.020751953125, "learning_rate": 9.999601704667155e-07, "loss": 0.0008, "reward": 1.6871198415756226, "reward_std": 0.041576892137527466, "rewards/accuracy_reward": 0.3602970242500305, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2018229216337204, "step": 416 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 378.79168701171875, "epoch": 0.004037137795161244, "grad_norm": 2.4488752422771, "kl": 0.021240234375, "learning_rate": 9.99959978288457e-07, "loss": 0.0008, "reward": 1.7712501287460327, "reward_std": 0.1834230124950409, "rewards/accuracy_reward": 0.5017921924591064, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1944580078125, "step": 417 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 436.5833435058594, "epoch": 0.004046819180761151, "grad_norm": 2.217142486524089, "kl": 0.01953125, "learning_rate": 9.999597856477e-07, "loss": 0.0008, "reward": 1.772732138633728, "reward_std": 0.4290505051612854, "rewards/accuracy_reward": 0.557587206363678, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1901448667049408, "step": 418 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 316.2083435058594, "epoch": 0.004056500566361057, "grad_norm": 2.03215106984074, "kl": 0.03076171875, "learning_rate": 9.999595925444454e-07, "loss": 0.0012, "reward": 1.9405598640441895, "reward_std": 0.06993018090724945, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1988932341337204, "step": 419 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 346.0833435058594, "epoch": 0.004066181951960964, "grad_norm": 1.4160086859859038, "kl": 0.01470947265625, "learning_rate": 9.999593989786926e-07, "loss": 0.0006, "reward": 1.4246110916137695, "reward_std": 0.04041055962443352, "rewards/accuracy_reward": 0.3240740895271301, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0922037810087204, "step": 420 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 319.75, "epoch": 0.0040758633375608715, "grad_norm": 1.1497732025181222, "kl": 0.0283203125, "learning_rate": 9.99959204950442e-07, "loss": 0.0011, "reward": 1.503027319908142, "reward_std": 0.026449445635080338, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1446940153837204, "step": 421 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 391.66668701171875, "epoch": 0.004085544723160779, "grad_norm": 3.1982077825209, "kl": 0.02001953125, "learning_rate": 9.999590104596944e-07, "loss": 0.0008, "reward": 1.8332194089889526, "reward_std": 0.19557949900627136, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1748860776424408, "step": 422 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 400.625, "epoch": 0.004095226108760686, "grad_norm": 2.1118231490951715, "kl": 0.0274658203125, "learning_rate": 9.999588155064494e-07, "loss": 0.0011, "reward": 1.1609899997711182, "reward_std": 0.1604631543159485, "rewards/accuracy_reward": 0.09401407092809677, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0669759139418602, "step": 423 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 404.0833435058594, "epoch": 0.004104907494360593, "grad_norm": 2.793620208890294, "kl": 0.030517578125, "learning_rate": 9.999586200907073e-07, "loss": 0.0012, "reward": 1.9696688652038574, "reward_std": 0.11207960546016693, "rewards/accuracy_reward": 0.5151685476303101, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3211669921875, "step": 424 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 381.125, "epoch": 0.0041145888799605, "grad_norm": 16.555856986573996, "kl": 0.03271484375, "learning_rate": 9.999584242124682e-07, "loss": 0.0013, "reward": 2.0204408168792725, "reward_std": 0.1530420035123825, "rewards/accuracy_reward": 0.6116760969161987, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2754313349723816, "step": 425 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 396.0833435058594, "epoch": 0.004124270265560407, "grad_norm": 2.3720153716270134, "kl": 0.0244140625, "learning_rate": 9.999582278717324e-07, "loss": 0.001, "reward": 1.9767091274261475, "reward_std": 0.33667242527008057, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2100423276424408, "step": 426 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 385.41668701171875, "epoch": 0.004133951651160314, "grad_norm": 2.556123702480062, "kl": 0.020751953125, "learning_rate": 9.999580310685e-07, "loss": 0.0008, "reward": 2.0631022453308105, "reward_std": 0.13919775187969208, "rewards/accuracy_reward": 0.6622883677482605, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.29248046875, "step": 427 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 383.8333435058594, "epoch": 0.0041436330367602215, "grad_norm": 3.156687280863473, "kl": 0.027587890625, "learning_rate": 9.999578338027714e-07, "loss": 0.0011, "reward": 1.9102325439453125, "reward_std": 0.3105464279651642, "rewards/accuracy_reward": 0.5720163583755493, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2132161557674408, "step": 428 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 366.0, "epoch": 0.004153314422360129, "grad_norm": 4.845102221231099, "kl": 0.0177001953125, "learning_rate": 9.999576360745465e-07, "loss": 0.0007, "reward": 1.7449767589569092, "reward_std": 0.08028881251811981, "rewards/accuracy_reward": 0.4311419427394867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.22216796875, "step": 429 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 507.16668701171875, "epoch": 0.004162995807960035, "grad_norm": 2.7116628084847574, "kl": 0.02099609375, "learning_rate": 9.999574378838256e-07, "loss": 0.0008, "reward": 1.757218837738037, "reward_std": 0.326730340719223, "rewards/accuracy_reward": 0.4157068431377411, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2581787109375, "step": 430 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 382.41668701171875, "epoch": 0.004172677193559942, "grad_norm": 5.307439782285919, "kl": 0.0264892578125, "learning_rate": 9.99957239230609e-07, "loss": 0.0011, "reward": 1.583055853843689, "reward_std": 0.03922393545508385, "rewards/accuracy_reward": 0.26527416706085205, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1844482421875, "step": 431 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 348.7083435058594, "epoch": 0.004182358579159849, "grad_norm": 3.882324507911959, "kl": 0.0322265625, "learning_rate": 9.999570401148964e-07, "loss": 0.0013, "reward": 1.6931238174438477, "reward_std": 0.05813431739807129, "rewards/accuracy_reward": 0.395500123500824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1892903745174408, "step": 432 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 433.79168701171875, "epoch": 0.004192039964759756, "grad_norm": 2.23846182444838, "kl": 0.0267333984375, "learning_rate": 9.999568405366886e-07, "loss": 0.0011, "reward": 1.5113786458969116, "reward_std": 0.07026855647563934, "rewards/accuracy_reward": 0.2764665484428406, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.159912109375, "step": 433 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 404.7083435058594, "epoch": 0.004201721350359663, "grad_norm": 3.5002447583121175, "kl": 0.0263671875, "learning_rate": 9.999566404959857e-07, "loss": 0.0011, "reward": 2.033782482147217, "reward_std": 0.08834891766309738, "rewards/accuracy_reward": 0.5995540618896484, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2925618588924408, "step": 434 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 384.9583435058594, "epoch": 0.004211402735959571, "grad_norm": 24.156210930656243, "kl": 0.02392578125, "learning_rate": 9.999564399927874e-07, "loss": 0.001, "reward": 1.6680907011032104, "reward_std": 0.4421050250530243, "rewards/accuracy_reward": 0.4025389850139618, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1905517578125, "step": 435 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 500.66668701171875, "epoch": 0.004221084121559478, "grad_norm": 3.823678448004359, "kl": 0.025146484375, "learning_rate": 9.999562390270942e-07, "loss": 0.001, "reward": 1.088592290878296, "reward_std": 0.3227859437465668, "rewards/accuracy_reward": 0.09815439581871033, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.1154378280043602, "step": 436 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 372.0, "epoch": 0.004230765507159385, "grad_norm": 4.115872977241434, "kl": 0.0289306640625, "learning_rate": 9.999560375989065e-07, "loss": 0.0012, "reward": 1.5218867063522339, "reward_std": 0.057477351278066635, "rewards/accuracy_reward": 0.21621452271938324, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1890055388212204, "step": 437 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 458.0, "epoch": 0.004240446892759292, "grad_norm": 21.443242777794705, "kl": 0.0224609375, "learning_rate": 9.99955835708224e-07, "loss": 0.0009, "reward": 1.530269980430603, "reward_std": 0.07085928320884705, "rewards/accuracy_reward": 0.26358699798583984, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.183349609375, "step": 438 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 418.375, "epoch": 0.004250128278359199, "grad_norm": 2.390469243746704, "kl": 0.017578125, "learning_rate": 9.999556333550475e-07, "loss": 0.0007, "reward": 1.8710116147994995, "reward_std": 0.2260907143354416, "rewards/accuracy_reward": 0.531761884689331, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2309163510799408, "step": 439 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 418.0833435058594, "epoch": 0.004259809663959105, "grad_norm": 2.6151114330339627, "kl": 0.0289306640625, "learning_rate": 9.999554305393767e-07, "loss": 0.0012, "reward": 1.3394930362701416, "reward_std": 0.16372674703598022, "rewards/accuracy_reward": 0.13561928272247314, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13720703125, "step": 440 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 410.7083435058594, "epoch": 0.0042694910495590125, "grad_norm": 4.475808075663653, "kl": 0.021728515625, "learning_rate": 9.99955227261212e-07, "loss": 0.0009, "reward": 1.7932285070419312, "reward_std": 0.6133621335029602, "rewards/accuracy_reward": 0.5006868839263916, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2092081755399704, "step": 441 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 418.29168701171875, "epoch": 0.00427917243515892, "grad_norm": 5.117098273065661, "kl": 0.03076171875, "learning_rate": 9.999550235205534e-07, "loss": 0.0012, "reward": 1.7207006216049194, "reward_std": 0.08395396173000336, "rewards/accuracy_reward": 0.31075599789619446, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2349446713924408, "step": 442 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.4583435058594, "epoch": 0.004288853820758827, "grad_norm": 2.9818974736445467, "kl": 0.0289306640625, "learning_rate": 9.99954819317401e-07, "loss": 0.0012, "reward": 1.7552869319915771, "reward_std": 0.09325060248374939, "rewards/accuracy_reward": 0.33681344985961914, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2684733271598816, "step": 443 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 347.79168701171875, "epoch": 0.004298535206358734, "grad_norm": 5.272693383110419, "kl": 0.03662109375, "learning_rate": 9.999546146517555e-07, "loss": 0.0015, "reward": 1.9831342697143555, "reward_std": 0.129185289144516, "rewards/accuracy_reward": 0.5280317068099976, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3134358823299408, "step": 444 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 444.875, "epoch": 0.004308216591958641, "grad_norm": 3.694634697646732, "kl": 0.023681640625, "learning_rate": 9.999544095236168e-07, "loss": 0.001, "reward": 2.1703853607177734, "reward_std": 0.2712267339229584, "rewards/accuracy_reward": 0.7826576232910156, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.25439453125, "step": 445 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 441.16668701171875, "epoch": 0.004317897977558548, "grad_norm": 10.314005176116513, "kl": 0.019287109375, "learning_rate": 9.99954203932985e-07, "loss": 0.0008, "reward": 1.3451173305511475, "reward_std": 0.19960859417915344, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.0784505233168602, "step": 446 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 354.125, "epoch": 0.004327579363158455, "grad_norm": 2.545818165876374, "kl": 0.0264892578125, "learning_rate": 9.999539978798604e-07, "loss": 0.0011, "reward": 2.120180606842041, "reward_std": 0.2548010051250458, "rewards/accuracy_reward": 0.7040183544158936, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2744954526424408, "step": 447 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 368.4583435058594, "epoch": 0.0043372607487583625, "grad_norm": 1.8162539042114176, "kl": 0.0296630859375, "learning_rate": 9.999537913642432e-07, "loss": 0.0012, "reward": 1.6723531484603882, "reward_std": 0.2513613700866699, "rewards/accuracy_reward": 0.39101362228393555, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1646728515625, "step": 448 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 391.75, "epoch": 0.00434694213435827, "grad_norm": 3.6383502300838395, "kl": 0.0294189453125, "learning_rate": 9.999535843861332e-07, "loss": 0.0012, "reward": 2.092155933380127, "reward_std": 0.09529827535152435, "rewards/accuracy_reward": 0.6997976303100586, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2506917417049408, "step": 449 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 362.9583435058594, "epoch": 0.004356623519958177, "grad_norm": 2.4885937294435565, "kl": 0.0250244140625, "learning_rate": 9.999533769455315e-07, "loss": 0.001, "reward": 2.2106690406799316, "reward_std": 0.3616291284561157, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2690022885799408, "step": 450 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 395.5, "epoch": 0.004366304905558083, "grad_norm": 3.7286469528047466, "kl": 0.0291748046875, "learning_rate": 9.999531690424374e-07, "loss": 0.0012, "reward": 1.539286494255066, "reward_std": 0.3790621757507324, "rewards/accuracy_reward": 0.25730404257774353, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1986490935087204, "step": 451 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 465.0833435058594, "epoch": 0.00437598629115799, "grad_norm": 1.3348933777587015, "kl": 0.018798828125, "learning_rate": 9.999529606768513e-07, "loss": 0.0008, "reward": 1.6503866910934448, "reward_std": 0.30223968625068665, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1253865659236908, "step": 452 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 570.75, "epoch": 0.004385667676757897, "grad_norm": 2.5682188203292142, "kl": 0.0184326171875, "learning_rate": 9.999527518487738e-07, "loss": 0.0007, "reward": 1.4980077743530273, "reward_std": 0.6567399501800537, "rewards/accuracy_reward": 0.5007583498954773, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.1805826872587204, "step": 453 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 429.625, "epoch": 0.0043953490623578045, "grad_norm": 3.505291851126412, "kl": 0.021728515625, "learning_rate": 9.999525425582046e-07, "loss": 0.0009, "reward": 1.7971861362457275, "reward_std": 0.29842692613601685, "rewards/accuracy_reward": 0.44014257192611694, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2403768002986908, "step": 454 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 402.16668701171875, "epoch": 0.004405030447957712, "grad_norm": 2.042248888334644, "kl": 0.025146484375, "learning_rate": 9.999523328051442e-07, "loss": 0.001, "reward": 1.7661198377609253, "reward_std": 0.079668790102005, "rewards/accuracy_reward": 0.4826725721359253, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2001139372587204, "step": 455 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 490.875, "epoch": 0.004414711833557619, "grad_norm": 4.026213055693373, "kl": 0.0245361328125, "learning_rate": 9.999521225895929e-07, "loss": 0.001, "reward": 1.1167492866516113, "reward_std": 0.44239458441734314, "rewards/accuracy_reward": 0.18015262484550476, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.0949300155043602, "step": 456 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 365.625, "epoch": 0.004424393219157526, "grad_norm": 2.3969062604596565, "kl": 0.02294921875, "learning_rate": 9.999519119115505e-07, "loss": 0.0009, "reward": 2.2422902584075928, "reward_std": 0.11526968330144882, "rewards/accuracy_reward": 0.7133755683898926, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3872477412223816, "step": 457 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 387.91668701171875, "epoch": 0.004434074604757433, "grad_norm": 7.236928863003073, "kl": 0.0301513671875, "learning_rate": 9.999517007710173e-07, "loss": 0.0012, "reward": 1.2853034734725952, "reward_std": 0.16798435151576996, "rewards/accuracy_reward": 0.12902897596359253, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0979410856962204, "step": 458 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 325.75, "epoch": 0.00444375599035734, "grad_norm": 1.774912347062569, "kl": 0.02880859375, "learning_rate": 9.999514891679938e-07, "loss": 0.0012, "reward": 1.776429533958435, "reward_std": 0.05925469845533371, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1458740234375, "step": 459 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 405.41668701171875, "epoch": 0.004453437375957247, "grad_norm": 13.33082032704567, "kl": 0.032470703125, "learning_rate": 9.9995127710248e-07, "loss": 0.0013, "reward": 1.8421167135238647, "reward_std": 0.11957675218582153, "rewards/accuracy_reward": 0.43622463941574097, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.24755859375, "step": 460 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 428.8333435058594, "epoch": 0.0044631187615571544, "grad_norm": 2.0644266323983964, "kl": 0.017333984375, "learning_rate": 9.999510645744758e-07, "loss": 0.0007, "reward": 2.0899415016174316, "reward_std": 0.5518855452537537, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2732747495174408, "step": 461 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 430.41668701171875, "epoch": 0.004472800147157061, "grad_norm": 2.4375488287354274, "kl": 0.025146484375, "learning_rate": 9.999508515839819e-07, "loss": 0.001, "reward": 2.198728561401367, "reward_std": 0.3178463578224182, "rewards/accuracy_reward": 0.782016932964325, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2750447690486908, "step": 462 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 401.16668701171875, "epoch": 0.004482481532756968, "grad_norm": 32.24136082936701, "kl": 0.0380859375, "learning_rate": 9.999506381309981e-07, "loss": 0.0015, "reward": 1.9677517414093018, "reward_std": 0.12665696442127228, "rewards/accuracy_reward": 0.607391893863678, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2020263671875, "step": 463 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 394.91668701171875, "epoch": 0.004492162918356875, "grad_norm": 2.253706517811395, "kl": 0.0205078125, "learning_rate": 9.999504242155247e-07, "loss": 0.0008, "reward": 2.0115561485290527, "reward_std": 0.5744772553443909, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2198893278837204, "step": 464 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 380.66668701171875, "epoch": 0.004501844303956782, "grad_norm": 2.1456605381956373, "kl": 0.02294921875, "learning_rate": 9.999502098375623e-07, "loss": 0.0009, "reward": 2.0654377937316895, "reward_std": 0.311947226524353, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2404378354549408, "step": 465 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 416.9583435058594, "epoch": 0.004511525689556689, "grad_norm": 2.769024884513546, "kl": 0.0262451171875, "learning_rate": 9.999499949971105e-07, "loss": 0.001, "reward": 2.030620574951172, "reward_std": 0.30474957823753357, "rewards/accuracy_reward": 0.6805555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2333984375, "step": 466 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 367.8333435058594, "epoch": 0.004521207075156596, "grad_norm": 2.4445627529305045, "kl": 0.0238037109375, "learning_rate": 9.999497796941698e-07, "loss": 0.001, "reward": 2.3734049797058105, "reward_std": 0.3210034966468811, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3234049677848816, "step": 467 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 359.2083435058594, "epoch": 0.0045308884607565035, "grad_norm": 2.060606858379597, "kl": 0.03173828125, "learning_rate": 9.999495639287403e-07, "loss": 0.0013, "reward": 1.6584933996200562, "reward_std": 0.11841002106666565, "rewards/accuracy_reward": 0.3813529908657074, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1938069760799408, "step": 468 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 317.5, "epoch": 0.004540569846356411, "grad_norm": 2.4165713372293998, "kl": 0.038330078125, "learning_rate": 9.999493477008225e-07, "loss": 0.0015, "reward": 1.765671730041504, "reward_std": 0.07352422177791595, "rewards/accuracy_reward": 0.4702777862548828, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.237060546875, "step": 469 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 340.8333435058594, "epoch": 0.004550251231956318, "grad_norm": 5.929769589597142, "kl": 0.032470703125, "learning_rate": 9.999491310104162e-07, "loss": 0.0013, "reward": 1.9517520666122437, "reward_std": 0.3217388987541199, "rewards/accuracy_reward": 0.680555522441864, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1795298308134079, "step": 470 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 433.25, "epoch": 0.004559932617556225, "grad_norm": 2.6812597048260374, "kl": 0.0242919921875, "learning_rate": 9.999489138575218e-07, "loss": 0.001, "reward": 1.5679110288619995, "reward_std": 0.08143182843923569, "rewards/accuracy_reward": 0.2255445122718811, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2173665463924408, "step": 471 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 419.7083435058594, "epoch": 0.004569614003156132, "grad_norm": 2.3558277929049924, "kl": 0.026123046875, "learning_rate": 9.999486962421396e-07, "loss": 0.001, "reward": 1.8331048488616943, "reward_std": 0.40832996368408203, "rewards/accuracy_reward": 0.6174633502960205, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2406412810087204, "step": 472 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 397.04168701171875, "epoch": 0.004579295388756038, "grad_norm": 1.9799753461221212, "kl": 0.02197265625, "learning_rate": 9.999484781642694e-07, "loss": 0.0009, "reward": 1.6378825902938843, "reward_std": 0.2694491446018219, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1212158203125, "step": 473 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 467.54168701171875, "epoch": 0.0045889767743559455, "grad_norm": 4.12447103670056, "kl": 0.0303955078125, "learning_rate": 9.999482596239118e-07, "loss": 0.0012, "reward": 1.5404014587402344, "reward_std": 0.17333772778511047, "rewards/accuracy_reward": 0.2700155973434448, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1870524138212204, "step": 474 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 489.375, "epoch": 0.004598658159955853, "grad_norm": 3.0276672482033407, "kl": 0.029052734375, "learning_rate": 9.999480406210667e-07, "loss": 0.0012, "reward": 1.6837342977523804, "reward_std": 0.3383423686027527, "rewards/accuracy_reward": 0.4372173249721527, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.26318359375, "step": 475 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 390.625, "epoch": 0.00460833954555576, "grad_norm": 3.859536446991784, "kl": 0.03369140625, "learning_rate": 9.999478211557345e-07, "loss": 0.0013, "reward": 1.706798791885376, "reward_std": 0.06117861717939377, "rewards/accuracy_reward": 0.43875667452812195, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1597086638212204, "step": 476 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 394.0, "epoch": 0.004618020931155667, "grad_norm": 29.67252795948146, "kl": 0.0281982421875, "learning_rate": 9.999476012279157e-07, "loss": 0.0011, "reward": 1.2616196870803833, "reward_std": 0.2080150693655014, "rewards/accuracy_reward": 0.09003763645887375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.12158203125, "step": 477 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 426.5833435058594, "epoch": 0.004627702316755574, "grad_norm": 2.102645847884154, "kl": 0.0201416015625, "learning_rate": 9.9994738083761e-07, "loss": 0.0008, "reward": 1.4990967512130737, "reward_std": 0.3454282879829407, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1157633513212204, "step": 478 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 418.4583435058594, "epoch": 0.004637383702355481, "grad_norm": 3.025832932005317, "kl": 0.0301513671875, "learning_rate": 9.999471599848177e-07, "loss": 0.0012, "reward": 1.678828477859497, "reward_std": 0.2516523599624634, "rewards/accuracy_reward": 0.4413486421108246, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.14581298828125, "step": 479 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 360.3333435058594, "epoch": 0.004647065087955388, "grad_norm": 3.9918758789370785, "kl": 0.0341796875, "learning_rate": 9.99946938669539e-07, "loss": 0.0014, "reward": 2.3875327110290527, "reward_std": 0.23407456278800964, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.30419921875, "step": 480 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 419.3333435058594, "epoch": 0.0046567464735552955, "grad_norm": 2.053999361705723, "kl": 0.029541015625, "learning_rate": 9.999467168917743e-07, "loss": 0.0012, "reward": 1.9247015714645386, "reward_std": 0.5283482074737549, "rewards/accuracy_reward": 0.6944445371627808, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1802571713924408, "step": 481 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 384.91668701171875, "epoch": 0.004666427859155203, "grad_norm": 1.7406883099111234, "kl": 0.0211181640625, "learning_rate": 9.999464946515238e-07, "loss": 0.0008, "reward": 1.7809195518493652, "reward_std": 0.26181644201278687, "rewards/accuracy_reward": 0.5025993585586548, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1949869841337204, "step": 482 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 403.0, "epoch": 0.00467610924475511, "grad_norm": 4.305630257688106, "kl": 0.03466796875, "learning_rate": 9.999462719487874e-07, "loss": 0.0014, "reward": 1.7880443334579468, "reward_std": 0.09700341522693634, "rewards/accuracy_reward": 0.33535870909690857, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2943522334098816, "step": 483 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 408.4583435058594, "epoch": 0.004685790630355016, "grad_norm": 2.394323432344274, "kl": 0.0322265625, "learning_rate": 9.999460487835657e-07, "loss": 0.0013, "reward": 1.3897435665130615, "reward_std": 0.16706719994544983, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0841878280043602, "step": 484 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 378.875, "epoch": 0.004695472015954923, "grad_norm": 1.9236185372908536, "kl": 0.0322265625, "learning_rate": 9.999458251558586e-07, "loss": 0.0013, "reward": 1.2583740949630737, "reward_std": 0.03545047715306282, "rewards/accuracy_reward": 0.13037113845348358, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0780029296875, "step": 485 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 415.16668701171875, "epoch": 0.00470515340155483, "grad_norm": 11.159787292929439, "kl": 0.027587890625, "learning_rate": 9.999456010656663e-07, "loss": 0.0011, "reward": 2.254624843597412, "reward_std": 0.11042191088199615, "rewards/accuracy_reward": 0.8304222822189331, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.265869140625, "step": 486 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 438.4583435058594, "epoch": 0.0047148347871547374, "grad_norm": 2.335330356593664, "kl": 0.022705078125, "learning_rate": 9.999453765129894e-07, "loss": 0.0009, "reward": 1.5530858039855957, "reward_std": 0.25719064474105835, "rewards/accuracy_reward": 0.37311339378356934, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1799723356962204, "step": 487 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 387.7083435058594, "epoch": 0.004724516172754645, "grad_norm": 6.476967465579026, "kl": 0.0322265625, "learning_rate": 9.999451514978277e-07, "loss": 0.0013, "reward": 1.3695406913757324, "reward_std": 0.16984254121780396, "rewards/accuracy_reward": 0.17418751120567322, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1120198592543602, "step": 488 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 393.25, "epoch": 0.004734197558354552, "grad_norm": 3.3984508809637566, "kl": 0.037109375, "learning_rate": 9.999449260201815e-07, "loss": 0.0015, "reward": 2.145153045654297, "reward_std": 0.07936467230319977, "rewards/accuracy_reward": 0.6225782036781311, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3892415463924408, "step": 489 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 362.0833435058594, "epoch": 0.004743878943954459, "grad_norm": 10.278932048590535, "kl": 0.031982421875, "learning_rate": 9.99944700080051e-07, "loss": 0.0013, "reward": 1.8734430074691772, "reward_std": 0.04452129453420639, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2095540463924408, "step": 490 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 387.25, "epoch": 0.004753560329554366, "grad_norm": 1.8359835440331729, "kl": 0.025146484375, "learning_rate": 9.999444736774366e-07, "loss": 0.001, "reward": 1.2897799015045166, "reward_std": 0.4189135432243347, "rewards/accuracy_reward": 0.17133085429668427, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0767822265625, "step": 491 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 355.3333435058594, "epoch": 0.004763241715154273, "grad_norm": 2.865590758618135, "kl": 0.037109375, "learning_rate": 9.999442468123384e-07, "loss": 0.0015, "reward": 1.4303028583526611, "reward_std": 0.37696951627731323, "rewards/accuracy_reward": 0.2919808328151703, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1133219450712204, "step": 492 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 467.9583435058594, "epoch": 0.00477292310075418, "grad_norm": 3.87634288370345, "kl": 0.026611328125, "learning_rate": 9.999440194847566e-07, "loss": 0.0011, "reward": 1.7691905498504639, "reward_std": 0.08665182441473007, "rewards/accuracy_reward": 0.3003997504711151, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3021240234375, "step": 493 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 414.25, "epoch": 0.004782604486354087, "grad_norm": 5.173865336500178, "kl": 0.02734375, "learning_rate": 9.999437916946914e-07, "loss": 0.0011, "reward": 2.2060413360595703, "reward_std": 0.0808151364326477, "rewards/accuracy_reward": 0.7777777910232544, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2615966796875, "step": 494 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 453.04168701171875, "epoch": 0.004792285871953994, "grad_norm": 0.6598638509545753, "kl": 0.0205078125, "learning_rate": 9.999435634421426e-07, "loss": 0.0008, "reward": 0.9583333730697632, "reward_std": 0.1178511306643486, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0, "step": 495 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 325.875, "epoch": 0.004801967257553901, "grad_norm": 5.6584885607614845, "kl": 0.035888671875, "learning_rate": 9.999433347271113e-07, "loss": 0.0014, "reward": 2.0547261238098145, "reward_std": 0.1496579349040985, "rewards/accuracy_reward": 0.6491187214851379, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2889404296875, "step": 496 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 387.16668701171875, "epoch": 0.004811648643153808, "grad_norm": 1.977469394566687, "kl": 0.024169921875, "learning_rate": 9.99943105549597e-07, "loss": 0.001, "reward": 1.9209717512130737, "reward_std": 0.5392135977745056, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2043050229549408, "step": 497 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 534.125, "epoch": 0.004821330028753715, "grad_norm": 0.9768954435196213, "kl": 0.013671875, "learning_rate": 9.999428759096002e-07, "loss": 0.0005, "reward": 1.0815999507904053, "reward_std": 0.5346593856811523, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.0565999373793602, "step": 498 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 405.66668701171875, "epoch": 0.004831011414353622, "grad_norm": 2.0929036347024192, "kl": 0.02783203125, "learning_rate": 9.99942645807121e-07, "loss": 0.0011, "reward": 1.3074599504470825, "reward_std": 0.1806546449661255, "rewards/accuracy_reward": 0.222222238779068, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1185709685087204, "step": 499 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 402.25, "epoch": 0.004840692799953529, "grad_norm": 1.0845068709961743, "kl": 0.02734375, "learning_rate": 9.999424152421597e-07, "loss": 0.0011, "reward": 1.488297462463379, "reward_std": 0.012389051727950573, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0882975310087204, "step": 500 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 412.5833435058594, "epoch": 0.0048503741855534365, "grad_norm": 2.735048708971603, "kl": 0.03125, "learning_rate": 9.999421842147164e-07, "loss": 0.0012, "reward": 1.923453688621521, "reward_std": 0.11206197738647461, "rewards/accuracy_reward": 0.4453042447566986, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3114827573299408, "step": 501 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 373.125, "epoch": 0.004860055571153344, "grad_norm": 3.579767028710539, "kl": 0.02099609375, "learning_rate": 9.999419527247914e-07, "loss": 0.0008, "reward": 1.8625162839889526, "reward_std": 0.5904179811477661, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2041829526424408, "step": 502 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 391.2083435058594, "epoch": 0.004869736956753251, "grad_norm": 1.7913228994874597, "kl": 0.0277099609375, "learning_rate": 9.99941720772385e-07, "loss": 0.0011, "reward": 1.871408462524414, "reward_std": 0.046313583850860596, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.20751953125, "step": 503 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 406.2083435058594, "epoch": 0.004879418342353158, "grad_norm": 2.3001365319869485, "kl": 0.0294189453125, "learning_rate": 9.999414883574973e-07, "loss": 0.0012, "reward": 1.6332334280014038, "reward_std": 0.05359995365142822, "rewards/accuracy_reward": 0.30973073840141296, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2151692807674408, "step": 504 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 479.3333435058594, "epoch": 0.004889099727953065, "grad_norm": 2.691546352302356, "kl": 0.0181884765625, "learning_rate": 9.999412554801283e-07, "loss": 0.0007, "reward": 1.698781967163086, "reward_std": 0.31536442041397095, "rewards/accuracy_reward": 0.5105332732200623, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2049153745174408, "step": 505 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 365.25, "epoch": 0.004898781113552971, "grad_norm": 13.24493808099739, "kl": 0.043701171875, "learning_rate": 9.999410221402787e-07, "loss": 0.0017, "reward": 1.5611443519592285, "reward_std": 0.1065555214881897, "rewards/accuracy_reward": 0.2675977945327759, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1768798828125, "step": 506 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 403.875, "epoch": 0.0049084624991528785, "grad_norm": 1.423069835664873, "kl": 0.0194091796875, "learning_rate": 9.999407883379485e-07, "loss": 0.0008, "reward": 1.492163062095642, "reward_std": 0.014888226054608822, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0921630859375, "step": 507 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 375.91668701171875, "epoch": 0.004918143884752786, "grad_norm": 2.331007445176174, "kl": 0.03173828125, "learning_rate": 9.999405540731378e-07, "loss": 0.0013, "reward": 1.5629231929779053, "reward_std": 0.16516569256782532, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1212565153837204, "step": 508 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 453.625, "epoch": 0.004927825270352693, "grad_norm": 3.4287806653948376, "kl": 0.02294921875, "learning_rate": 9.999403193458467e-07, "loss": 0.0009, "reward": 1.613440990447998, "reward_std": 0.4178047180175781, "rewards/accuracy_reward": 0.4631155729293823, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1669921875, "step": 509 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 403.4583435058594, "epoch": 0.0049375066559526, "grad_norm": 2.4675967504464453, "kl": 0.0281982421875, "learning_rate": 9.999400841560759e-07, "loss": 0.0011, "reward": 2.4290528297424316, "reward_std": 0.23140829801559448, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3207194209098816, "step": 510 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 390.75, "epoch": 0.004947188041552507, "grad_norm": 3.0179866766839116, "kl": 0.031982421875, "learning_rate": 9.999398485038252e-07, "loss": 0.0013, "reward": 1.6187770366668701, "reward_std": 0.05666329711675644, "rewards/accuracy_reward": 0.3125351071357727, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1895751953125, "step": 511 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 429.25, "epoch": 0.004956869427152414, "grad_norm": 4.611314680096215, "kl": 0.028076171875, "learning_rate": 9.999396123890948e-07, "loss": 0.0011, "reward": 2.1891860961914062, "reward_std": 0.20355930924415588, "rewards/accuracy_reward": 0.7395357489585876, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2663167417049408, "step": 512 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 388.9583435058594, "epoch": 0.004966550812752321, "grad_norm": 2.7285653948580073, "kl": 0.028076171875, "learning_rate": 9.999393758118853e-07, "loss": 0.0011, "reward": 2.0529861450195312, "reward_std": 0.10390742123126984, "rewards/accuracy_reward": 0.5509110689163208, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3354085385799408, "step": 513 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.5833435058594, "epoch": 0.0049762321983522284, "grad_norm": 3.882919612869846, "kl": 0.030517578125, "learning_rate": 9.999391387721968e-07, "loss": 0.0012, "reward": 1.73851478099823, "reward_std": 0.5924288630485535, "rewards/accuracy_reward": 0.638888955116272, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.1996256560087204, "step": 514 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.004985913583952136, "grad_norm": 4.381128142092961, "kl": 0.039794921875, "learning_rate": 9.99938901270029e-07, "loss": 0.0016, "reward": 2.2404115200042725, "reward_std": 0.14138075709342957, "rewards/accuracy_reward": 0.7463521361351013, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.327392578125, "step": 515 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 366.16668701171875, "epoch": 0.004995594969552043, "grad_norm": 12.590985345734442, "kl": 0.043212890625, "learning_rate": 9.999386633053828e-07, "loss": 0.0017, "reward": 2.169206142425537, "reward_std": 0.12968769669532776, "rewards/accuracy_reward": 0.7207686305046082, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3151041865348816, "step": 516 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 384.625, "epoch": 0.005005276355151949, "grad_norm": 3.568799009163176, "kl": 0.0380859375, "learning_rate": 9.999384248782582e-07, "loss": 0.0015, "reward": 2.002682685852051, "reward_std": 0.07714501023292542, "rewards/accuracy_reward": 0.5890596508979797, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2552897334098816, "step": 517 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.25, "epoch": 0.005014957740751856, "grad_norm": 7.57300507968585, "kl": 0.031005859375, "learning_rate": 9.99938185988655e-07, "loss": 0.0012, "reward": 1.5530452728271484, "reward_std": 0.32194364070892334, "rewards/accuracy_reward": 0.2719661593437195, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1894124448299408, "step": 518 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.66668701171875, "epoch": 0.005024639126351763, "grad_norm": 3.6700143115589086, "kl": 0.023193359375, "learning_rate": 9.999379466365742e-07, "loss": 0.0009, "reward": 1.921489953994751, "reward_std": 0.36428409814834595, "rewards/accuracy_reward": 0.5251519083976746, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.246337890625, "step": 519 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 418.0833435058594, "epoch": 0.00503432051195167, "grad_norm": 1.1186607810204754, "kl": 0.02880859375, "learning_rate": 9.999377068220153e-07, "loss": 0.0012, "reward": 1.468237280845642, "reward_std": 0.009662679396569729, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0682373046875, "step": 520 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 496.79168701171875, "epoch": 0.0050440018975515775, "grad_norm": 2.4588867055901513, "kl": 0.02734375, "learning_rate": 9.999374665449792e-07, "loss": 0.0011, "reward": 1.5706605911254883, "reward_std": 0.3379232883453369, "rewards/accuracy_reward": 0.3751445710659027, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1538492888212204, "step": 521 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 429.5, "epoch": 0.005053683283151485, "grad_norm": 3.058386046247974, "kl": 0.033447265625, "learning_rate": 9.999372258054654e-07, "loss": 0.0013, "reward": 1.6780786514282227, "reward_std": 0.15968212485313416, "rewards/accuracy_reward": 0.4627872407436371, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1319580078125, "step": 522 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 394.41668701171875, "epoch": 0.005063364668751392, "grad_norm": 2.6876184801773904, "kl": 0.0269775390625, "learning_rate": 9.999369846034746e-07, "loss": 0.0011, "reward": 1.7209341526031494, "reward_std": 0.2179381549358368, "rewards/accuracy_reward": 0.34043264389038086, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2638346552848816, "step": 523 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 434.8333435058594, "epoch": 0.005073046054351299, "grad_norm": 2.247510998241039, "kl": 0.030029296875, "learning_rate": 9.999367429390068e-07, "loss": 0.0012, "reward": 1.6498478651046753, "reward_std": 0.07420849800109863, "rewards/accuracy_reward": 0.4028262495994568, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1553548276424408, "step": 524 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 333.8333435058594, "epoch": 0.005082727439951206, "grad_norm": 3.196775508845225, "kl": 0.046875, "learning_rate": 9.999365008120626e-07, "loss": 0.0019, "reward": 2.189753293991089, "reward_std": 0.09305381774902344, "rewards/accuracy_reward": 0.7371978759765625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3275553584098816, "step": 525 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 379.91668701171875, "epoch": 0.005092408825551113, "grad_norm": 1.5771448517679945, "kl": 0.034912109375, "learning_rate": 9.999362582226415e-07, "loss": 0.0014, "reward": 1.3855513334274292, "reward_std": 0.19172844290733337, "rewards/accuracy_reward": 0.21169066429138184, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1238606795668602, "step": 526 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 496.16668701171875, "epoch": 0.0051020902111510195, "grad_norm": 2.1425298606136507, "kl": 0.0140380859375, "learning_rate": 9.999360151707445e-07, "loss": 0.0006, "reward": 1.8852403163909912, "reward_std": 0.49324831366539, "rewards/accuracy_reward": 0.6111111640930176, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2324625700712204, "step": 527 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 347.91668701171875, "epoch": 0.005111771596750927, "grad_norm": 2.660767617291302, "kl": 0.03759765625, "learning_rate": 9.999357716563713e-07, "loss": 0.0015, "reward": 1.4125299453735352, "reward_std": 0.19259759783744812, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1069742888212204, "step": 528 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 396.4583435058594, "epoch": 0.005121452982350834, "grad_norm": 1.9108131594343494, "kl": 0.0201416015625, "learning_rate": 9.999355276795226e-07, "loss": 0.0008, "reward": 2.318570852279663, "reward_std": 0.2738639712333679, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2852376401424408, "step": 529 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 342.16668701171875, "epoch": 0.005131134367950741, "grad_norm": 2.8029996466916387, "kl": 0.0361328125, "learning_rate": 9.999352832401984e-07, "loss": 0.0014, "reward": 1.9464243650436401, "reward_std": 0.07856190949678421, "rewards/accuracy_reward": 0.6523569226264954, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2107340544462204, "step": 530 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 382.75, "epoch": 0.005140815753550648, "grad_norm": 5.656388804633492, "kl": 0.0281982421875, "learning_rate": 9.999350383383983e-07, "loss": 0.0011, "reward": 1.5816895961761475, "reward_std": 0.31920313835144043, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.106689453125, "step": 531 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 357.79168701171875, "epoch": 0.005150497139150555, "grad_norm": 2.5059565884180754, "kl": 0.035888671875, "learning_rate": 9.999347929741236e-07, "loss": 0.0014, "reward": 2.1877756118774414, "reward_std": 0.10596482455730438, "rewards/accuracy_reward": 0.7282297015190125, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3262125849723816, "step": 532 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 354.625, "epoch": 0.005160178524750462, "grad_norm": 2.2251397106766078, "kl": 0.0302734375, "learning_rate": 9.99934547147374e-07, "loss": 0.0012, "reward": 2.3546061515808105, "reward_std": 0.22917108237743378, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.262939453125, "step": 533 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 358.0, "epoch": 0.0051698599103503695, "grad_norm": 2.6785779773196134, "kl": 0.04541015625, "learning_rate": 9.999343008581495e-07, "loss": 0.0018, "reward": 1.6791720390319824, "reward_std": 0.08707743883132935, "rewards/accuracy_reward": 0.3721245229244232, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1820475310087204, "step": 534 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 456.8333435058594, "epoch": 0.005179541295950277, "grad_norm": 2.0448685291099267, "kl": 0.02783203125, "learning_rate": 9.999340541064509e-07, "loss": 0.0011, "reward": 1.2902112007141113, "reward_std": 0.20388035476207733, "rewards/accuracy_reward": 0.15862759947776794, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0815836638212204, "step": 535 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 382.2083435058594, "epoch": 0.005189222681550184, "grad_norm": 1.436406303124528, "kl": 0.0238037109375, "learning_rate": 9.999338068922778e-07, "loss": 0.001, "reward": 1.5014209747314453, "reward_std": 0.28825247287750244, "rewards/accuracy_reward": 0.2959604859352112, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1387939453125, "step": 536 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 412.16668701171875, "epoch": 0.005198904067150091, "grad_norm": 2.451825727314741, "kl": 0.0284423828125, "learning_rate": 9.99933559215631e-07, "loss": 0.0011, "reward": 2.2752182483673096, "reward_std": 0.13351625204086304, "rewards/accuracy_reward": 0.876666784286499, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2652181088924408, "step": 537 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 365.625, "epoch": 0.005208585452749997, "grad_norm": 0.10768266687794284, "kl": 0.031494140625, "learning_rate": 9.999333110765103e-07, "loss": 0.0013, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0, "step": 538 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 379.25, "epoch": 0.005218266838349904, "grad_norm": 1.2327102867756305, "kl": 0.029296875, "learning_rate": 9.99933062474916e-07, "loss": 0.0012, "reward": 1.3579909801483154, "reward_std": 0.04435964673757553, "rewards/accuracy_reward": 0.200829416513443, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0904947966337204, "step": 539 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 427.0, "epoch": 0.0052279482239498114, "grad_norm": 14.136503857914585, "kl": 0.032470703125, "learning_rate": 9.999328134108485e-07, "loss": 0.0013, "reward": 2.131366729736328, "reward_std": 0.08764685690402985, "rewards/accuracy_reward": 0.6704699993133545, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2942301630973816, "step": 540 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 370.0, "epoch": 0.005237629609549719, "grad_norm": 3.925098954937974, "kl": 0.0400390625, "learning_rate": 9.999325638843081e-07, "loss": 0.0016, "reward": 1.4870294332504272, "reward_std": 0.3050827980041504, "rewards/accuracy_reward": 0.25115692615509033, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1442057341337204, "step": 541 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 393.875, "epoch": 0.005247310995149626, "grad_norm": 3.2340551544126765, "kl": 0.041259765625, "learning_rate": 9.999323138952947e-07, "loss": 0.0017, "reward": 2.0505669116973877, "reward_std": 0.09873110055923462, "rewards/accuracy_reward": 0.6160861253738403, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2594808042049408, "step": 542 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 413.16668701171875, "epoch": 0.005256992380749533, "grad_norm": 3.836786532662781, "kl": 0.03564453125, "learning_rate": 9.99932063443809e-07, "loss": 0.0014, "reward": 1.917581558227539, "reward_std": 0.11893706023693085, "rewards/accuracy_reward": 0.5242260694503784, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2516886591911316, "step": 543 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 351.79168701171875, "epoch": 0.00526667376634944, "grad_norm": 2.274436782229607, "kl": 0.0419921875, "learning_rate": 9.999318125298506e-07, "loss": 0.0017, "reward": 1.464719533920288, "reward_std": 0.18564054369926453, "rewards/accuracy_reward": 0.3248187303543091, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1065673828125, "step": 544 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 406.41668701171875, "epoch": 0.005276355151949347, "grad_norm": 1.9071963649252697, "kl": 0.024658203125, "learning_rate": 9.999315611534203e-07, "loss": 0.001, "reward": 1.949845552444458, "reward_std": 0.21449194848537445, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2165120542049408, "step": 545 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 315.75, "epoch": 0.005286036537549254, "grad_norm": 3.309687331672366, "kl": 0.043212890625, "learning_rate": 9.99931309314518e-07, "loss": 0.0017, "reward": 1.6810719966888428, "reward_std": 0.5057080984115601, "rewards/accuracy_reward": 0.4083179831504822, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19775390625, "step": 546 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 399.29168701171875, "epoch": 0.005295717923149161, "grad_norm": 2.5731428870407904, "kl": 0.028564453125, "learning_rate": 9.99931057013144e-07, "loss": 0.0011, "reward": 1.729445457458496, "reward_std": 0.04984418675303459, "rewards/accuracy_reward": 0.4543314576148987, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.158447265625, "step": 547 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 353.91668701171875, "epoch": 0.0053053993087490686, "grad_norm": 0.941621381101652, "kl": 0.0308837890625, "learning_rate": 9.999308042492988e-07, "loss": 0.0012, "reward": 1.057861328125, "reward_std": 0.1636565625667572, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.01619466207921505, "step": 548 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 409.8333435058594, "epoch": 0.005315080694348975, "grad_norm": 2.0598834252558893, "kl": 0.02685546875, "learning_rate": 9.999305510229822e-07, "loss": 0.0011, "reward": 2.340315818786621, "reward_std": 0.254798024892807, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.281982421875, "step": 549 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 401.375, "epoch": 0.005324762079948882, "grad_norm": 2.0165470452147978, "kl": 0.03662109375, "learning_rate": 9.999302973341948e-07, "loss": 0.0015, "reward": 1.516466498374939, "reward_std": 0.2891894280910492, "rewards/accuracy_reward": 0.3525972366333008, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.080535888671875, "step": 550 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 419.7083435058594, "epoch": 0.005334443465548789, "grad_norm": 1.3174805096819704, "kl": 0.01953125, "learning_rate": 9.999300431829366e-07, "loss": 0.0008, "reward": 1.7384642362594604, "reward_std": 0.27330583333969116, "rewards/accuracy_reward": 0.4375852942466736, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1925455778837204, "step": 551 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 399.8333435058594, "epoch": 0.005344124851148696, "grad_norm": 10.848463536354338, "kl": 0.02490234375, "learning_rate": 9.99929788569208e-07, "loss": 0.001, "reward": 1.9345738887786865, "reward_std": 0.2606397569179535, "rewards/accuracy_reward": 0.6934523582458496, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1911214292049408, "step": 552 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 374.7083435058594, "epoch": 0.005353806236748603, "grad_norm": 4.071500858889362, "kl": 0.030517578125, "learning_rate": 9.999295334930092e-07, "loss": 0.0012, "reward": 1.774922251701355, "reward_std": 0.042288508266210556, "rewards/accuracy_reward": 0.48427271842956543, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1656494140625, "step": 553 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 351.3333435058594, "epoch": 0.0053634876223485105, "grad_norm": 3.4958885145234673, "kl": 0.031982421875, "learning_rate": 9.999292779543403e-07, "loss": 0.0013, "reward": 1.6544326543807983, "reward_std": 0.22041520476341248, "rewards/accuracy_reward": 0.42222222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1405436247587204, "step": 554 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 419.9583435058594, "epoch": 0.005373169007948418, "grad_norm": 6.411749763769436, "kl": 0.03173828125, "learning_rate": 9.999290219532014e-07, "loss": 0.0013, "reward": 1.3019893169403076, "reward_std": 0.09868182241916656, "rewards/accuracy_reward": 0.13121287524700165, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1291097104549408, "step": 555 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 346.5, "epoch": 0.005382850393548325, "grad_norm": 1.7895302415175196, "kl": 0.02783203125, "learning_rate": 9.999287654895933e-07, "loss": 0.0011, "reward": 1.974365234375, "reward_std": 0.04255376383662224, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.224365234375, "step": 556 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 358.875, "epoch": 0.005392531779148232, "grad_norm": 6.788885440478831, "kl": 0.036376953125, "learning_rate": 9.999285085635158e-07, "loss": 0.0015, "reward": 1.9312503337860107, "reward_std": 0.23744241893291473, "rewards/accuracy_reward": 0.44845399260520935, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.324462890625, "step": 557 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 373.5, "epoch": 0.005402213164748139, "grad_norm": 5.031177844667254, "kl": 0.041015625, "learning_rate": 9.99928251174969e-07, "loss": 0.0016, "reward": 2.04146671295166, "reward_std": 0.2120579183101654, "rewards/accuracy_reward": 0.5749057531356812, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3082275390625, "step": 558 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 399.125, "epoch": 0.005411894550348046, "grad_norm": 2.660593352747751, "kl": 0.0302734375, "learning_rate": 9.999279933239538e-07, "loss": 0.0012, "reward": 1.3887929916381836, "reward_std": 0.17502662539482117, "rewards/accuracy_reward": 0.17268791794776917, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1327718198299408, "step": 559 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 416.91668701171875, "epoch": 0.0054215759359479525, "grad_norm": 2.093234222344968, "kl": 0.023681640625, "learning_rate": 9.999277350104698e-07, "loss": 0.0009, "reward": 2.0044431686401367, "reward_std": 0.48749834299087524, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2211100310087204, "step": 560 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 372.79168701171875, "epoch": 0.00543125732154786, "grad_norm": 2.939145549833687, "kl": 0.0322265625, "learning_rate": 9.999274762345175e-07, "loss": 0.0013, "reward": 1.6149883270263672, "reward_std": 0.10468044131994247, "rewards/accuracy_reward": 0.3529357612133026, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1453857421875, "step": 561 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 386.75, "epoch": 0.005440938707147767, "grad_norm": 2.9659280687105944, "kl": 0.03271484375, "learning_rate": 9.99927216996097e-07, "loss": 0.0013, "reward": 1.711835265159607, "reward_std": 0.24332594871520996, "rewards/accuracy_reward": 0.4415227770805359, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1953125, "step": 562 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.16668701171875, "epoch": 0.005450620092747674, "grad_norm": 2.5097379285410293, "kl": 0.025390625, "learning_rate": 9.999269572952088e-07, "loss": 0.001, "reward": 1.6878507137298584, "reward_std": 0.37694597244262695, "rewards/accuracy_reward": 0.3498053550720215, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2463785856962204, "step": 563 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 436.9583435058594, "epoch": 0.005460301478347581, "grad_norm": 3.180778597284027, "kl": 0.0228271484375, "learning_rate": 9.999266971318529e-07, "loss": 0.0009, "reward": 1.809535026550293, "reward_std": 0.053197722882032394, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1623128354549408, "step": 564 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 488.7083435058594, "epoch": 0.005469982863947488, "grad_norm": 4.4080916349559445, "kl": 0.0137939453125, "learning_rate": 9.999264365060297e-07, "loss": 0.0006, "reward": 2.3273439407348633, "reward_std": 0.49999678134918213, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.3606770932674408, "step": 565 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 358.8333435058594, "epoch": 0.005479664249547395, "grad_norm": 4.347218139307645, "kl": 0.044921875, "learning_rate": 9.99926175417739e-07, "loss": 0.0018, "reward": 1.9201654195785522, "reward_std": 0.11427616328001022, "rewards/accuracy_reward": 0.5417636632919312, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2367350310087204, "step": 566 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 407.3333435058594, "epoch": 0.0054893456351473024, "grad_norm": 2.07555768932119, "kl": 0.0311279296875, "learning_rate": 9.999259138669817e-07, "loss": 0.0012, "reward": 1.8102052211761475, "reward_std": 0.25232672691345215, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1268717497587204, "step": 567 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 334.875, "epoch": 0.00549902702074721, "grad_norm": 2.6522705130163526, "kl": 0.03369140625, "learning_rate": 9.999256518537577e-07, "loss": 0.0013, "reward": 1.9949054718017578, "reward_std": 0.1911967396736145, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.236572265625, "step": 568 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 374.2083435058594, "epoch": 0.005508708406347117, "grad_norm": 27.02482516611509, "kl": 0.0322265625, "learning_rate": 9.999253893780673e-07, "loss": 0.0013, "reward": 1.5577266216278076, "reward_std": 0.046316348016262054, "rewards/accuracy_reward": 0.29385122656822205, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1388753354549408, "step": 569 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 385.7083435058594, "epoch": 0.005518389791947024, "grad_norm": 2.8777484748298865, "kl": 0.03759765625, "learning_rate": 9.999251264399106e-07, "loss": 0.0015, "reward": 2.2390294075012207, "reward_std": 0.10854937136173248, "rewards/accuracy_reward": 0.7015455961227417, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.379150390625, "step": 570 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 401.7083435058594, "epoch": 0.00552807117754693, "grad_norm": 3.3633081702002863, "kl": 0.037353515625, "learning_rate": 9.99924863039288e-07, "loss": 0.0015, "reward": 1.5609363317489624, "reward_std": 0.31893011927604675, "rewards/accuracy_reward": 0.27179229259490967, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.205810546875, "step": 571 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 421.5, "epoch": 0.005537752563146837, "grad_norm": 1.8451623062533156, "kl": 0.0225830078125, "learning_rate": 9.999245991761997e-07, "loss": 0.0009, "reward": 1.7464263439178467, "reward_std": 0.2902866303920746, "rewards/accuracy_reward": 0.47207629680633545, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1826833188533783, "step": 572 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 328.54168701171875, "epoch": 0.005547433948746744, "grad_norm": 1.7990525510583082, "kl": 0.0281982421875, "learning_rate": 9.99924334850646e-07, "loss": 0.0011, "reward": 2.0038576126098633, "reward_std": 0.06961682438850403, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2455240935087204, "step": 573 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 460.54168701171875, "epoch": 0.0055571153343466516, "grad_norm": 4.99318505880871, "kl": 0.0218505859375, "learning_rate": 9.999240700626273e-07, "loss": 0.0009, "reward": 1.899932861328125, "reward_std": 0.3120839595794678, "rewards/accuracy_reward": 0.5540220737457275, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.24591064453125, "step": 574 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 362.25, "epoch": 0.005566796719946559, "grad_norm": 3.532862441785566, "kl": 0.047119140625, "learning_rate": 9.999238048121433e-07, "loss": 0.0019, "reward": 1.511114478111267, "reward_std": 0.12642282247543335, "rewards/accuracy_reward": 0.1683003157377243, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2178141325712204, "step": 575 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 395.625, "epoch": 0.005576478105546466, "grad_norm": 1.430058964438327, "kl": 0.0230712890625, "learning_rate": 9.999235390991949e-07, "loss": 0.0009, "reward": 1.895605444908142, "reward_std": 0.22194311022758484, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2039388120174408, "step": 576 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 386.125, "epoch": 0.005586159491146373, "grad_norm": 7.197757142888881, "kl": 0.03515625, "learning_rate": 9.999232729237818e-07, "loss": 0.0014, "reward": 1.953953742980957, "reward_std": 0.3186539113521576, "rewards/accuracy_reward": 0.5360011458396912, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2762858271598816, "step": 577 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 395.8333435058594, "epoch": 0.00559584087674628, "grad_norm": 2.124803435577806, "kl": 0.026123046875, "learning_rate": 9.999230062859045e-07, "loss": 0.001, "reward": 1.5000814199447632, "reward_std": 0.5290036797523499, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1250813901424408, "step": 578 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 340.4583435058594, "epoch": 0.005605522262346187, "grad_norm": 1.4131847246696103, "kl": 0.0286865234375, "learning_rate": 9.999227391855633e-07, "loss": 0.0011, "reward": 1.4434245824813843, "reward_std": 0.016322772949934006, "rewards/accuracy_reward": 0.3291666805744171, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1142578125, "step": 579 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 502.0833435058594, "epoch": 0.005615203647946094, "grad_norm": 2.462915734701556, "kl": 0.0267333984375, "learning_rate": 9.999224716227584e-07, "loss": 0.0011, "reward": 1.3034683465957642, "reward_std": 0.2397584170103073, "rewards/accuracy_reward": 0.16963039338588715, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1421712338924408, "step": 580 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 463.8333435058594, "epoch": 0.0056248850335460015, "grad_norm": 12.097916718915812, "kl": 0.031005859375, "learning_rate": 9.9992220359749e-07, "loss": 0.0012, "reward": 1.808806300163269, "reward_std": 0.12277530133724213, "rewards/accuracy_reward": 0.3640429973602295, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2780965268611908, "step": 581 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 397.04168701171875, "epoch": 0.005634566419145908, "grad_norm": 1.6811646294271814, "kl": 0.037109375, "learning_rate": 9.999219351097583e-07, "loss": 0.0015, "reward": 1.2913169860839844, "reward_std": 0.0276213139295578, "rewards/accuracy_reward": 0.12146013975143433, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1031901091337204, "step": 582 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 425.0833435058594, "epoch": 0.005644247804745815, "grad_norm": 5.7834163312264595, "kl": 0.033203125, "learning_rate": 9.999216661595638e-07, "loss": 0.0013, "reward": 1.945150375366211, "reward_std": 0.08417444676160812, "rewards/accuracy_reward": 0.5431809425354004, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2686360776424408, "step": 583 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 349.8333435058594, "epoch": 0.005653929190345722, "grad_norm": 3.4365853539068896, "kl": 0.0291748046875, "learning_rate": 9.999213967469063e-07, "loss": 0.0012, "reward": 1.8138706684112549, "reward_std": 0.13934722542762756, "rewards/accuracy_reward": 0.5137566328048706, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2001139372587204, "step": 584 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 423.04168701171875, "epoch": 0.005663610575945629, "grad_norm": 2.582568987460055, "kl": 0.037841796875, "learning_rate": 9.999211268717864e-07, "loss": 0.0015, "reward": 1.7535465955734253, "reward_std": 0.3294718265533447, "rewards/accuracy_reward": 0.424746036529541, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1871337890625, "step": 585 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 374.54168701171875, "epoch": 0.005673291961545536, "grad_norm": 1.9341381070383765, "kl": 0.034912109375, "learning_rate": 9.999208565342044e-07, "loss": 0.0014, "reward": 2.234135150909424, "reward_std": 0.24647727608680725, "rewards/accuracy_reward": 0.7875286936759949, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3132731318473816, "step": 586 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 543.125, "epoch": 0.0056829733471454435, "grad_norm": 1.7374139978708187, "kl": 0.013427734375, "learning_rate": 9.999205857341605e-07, "loss": 0.0005, "reward": 1.6541829109191895, "reward_std": 0.32871848344802856, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.7083333730697632, "rewards/semantic_reward": 0.245849609375, "step": 587 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 423.41668701171875, "epoch": 0.005692654732745351, "grad_norm": 8.609770887981076, "kl": 0.0296630859375, "learning_rate": 9.999203144716546e-07, "loss": 0.0012, "reward": 2.0222060680389404, "reward_std": 0.3389699161052704, "rewards/accuracy_reward": 0.6600069999694824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2121988981962204, "step": 588 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 407.0833435058594, "epoch": 0.005702336118345258, "grad_norm": 4.764409309655413, "kl": 0.035400390625, "learning_rate": 9.999200427466873e-07, "loss": 0.0014, "reward": 2.07344126701355, "reward_std": 0.09093311429023743, "rewards/accuracy_reward": 0.6125117540359497, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2859293818473816, "step": 589 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 498.125, "epoch": 0.005712017503945165, "grad_norm": 2.650601047162097, "kl": 0.03076171875, "learning_rate": 9.999197705592589e-07, "loss": 0.0012, "reward": 1.825101613998413, "reward_std": 0.26774173974990845, "rewards/accuracy_reward": 0.48927801847457886, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2608235776424408, "step": 590 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 432.5833435058594, "epoch": 0.005721698889545072, "grad_norm": 2.1913513110296403, "kl": 0.030029296875, "learning_rate": 9.999194979093693e-07, "loss": 0.0012, "reward": 2.0288076400756836, "reward_std": 0.3026936948299408, "rewards/accuracy_reward": 0.6514555215835571, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2440185546875, "step": 591 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 425.54168701171875, "epoch": 0.005731380275144979, "grad_norm": 3.083540748333885, "kl": 0.0267333984375, "learning_rate": 9.999192247970193e-07, "loss": 0.0011, "reward": 2.210049629211426, "reward_std": 0.21460828185081482, "rewards/accuracy_reward": 0.7460845708847046, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2722981870174408, "step": 592 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 368.79168701171875, "epoch": 0.0057410616607448854, "grad_norm": 1.7482271467637185, "kl": 0.031982421875, "learning_rate": 9.999189512222083e-07, "loss": 0.0013, "reward": 1.797914981842041, "reward_std": 0.0523386225104332, "rewards/accuracy_reward": 0.49060696363449097, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2406412810087204, "step": 593 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 466.91668701171875, "epoch": 0.005750743046344793, "grad_norm": 7.29145363809803, "kl": 0.0322265625, "learning_rate": 9.999186771849374e-07, "loss": 0.0013, "reward": 1.640944242477417, "reward_std": 0.24380557239055634, "rewards/accuracy_reward": 0.2518572509288788, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2724202573299408, "step": 594 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.8333435058594, "epoch": 0.0057604244319447, "grad_norm": 2.500113463046984, "kl": 0.0390625, "learning_rate": 9.999184026852067e-07, "loss": 0.0016, "reward": 1.7116363048553467, "reward_std": 0.33412814140319824, "rewards/accuracy_reward": 0.3364001512527466, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2419026792049408, "step": 595 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 338.8333435058594, "epoch": 0.005770105817544607, "grad_norm": 4.198547574541493, "kl": 0.03369140625, "learning_rate": 9.99918127723016e-07, "loss": 0.0013, "reward": 2.0494141578674316, "reward_std": 0.29350316524505615, "rewards/accuracy_reward": 0.67059725522995, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2871500849723816, "step": 596 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 364.54168701171875, "epoch": 0.005779787203144514, "grad_norm": 2.928314064183214, "kl": 0.03564453125, "learning_rate": 9.99917852298366e-07, "loss": 0.0014, "reward": 1.7021315097808838, "reward_std": 0.19095444679260254, "rewards/accuracy_reward": 0.3646475672721863, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2124837338924408, "step": 597 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 404.9583435058594, "epoch": 0.005789468588744421, "grad_norm": 18.38874415700877, "kl": 0.03515625, "learning_rate": 9.99917576411257e-07, "loss": 0.0014, "reward": 1.8990428447723389, "reward_std": 0.1341315358877182, "rewards/accuracy_reward": 0.48635154962539673, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2376912534236908, "step": 598 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 383.66668701171875, "epoch": 0.005799149974344328, "grad_norm": 2.511916763182127, "kl": 0.035400390625, "learning_rate": 9.999173000616886e-07, "loss": 0.0014, "reward": 1.3572986125946045, "reward_std": 0.15019258856773376, "rewards/accuracy_reward": 0.1913805603981018, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1075846403837204, "step": 599 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 400.54168701171875, "epoch": 0.005808831359944235, "grad_norm": 2.3787502395575744, "kl": 0.02880859375, "learning_rate": 9.999170232496617e-07, "loss": 0.0012, "reward": 2.3050050735473633, "reward_std": 0.2741117775440216, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2550048828125, "step": 600 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 388.3333435058594, "epoch": 0.0058185127455441426, "grad_norm": 3.4138865165919734, "kl": 0.033935546875, "learning_rate": 9.999167459751766e-07, "loss": 0.0014, "reward": 2.1121106147766113, "reward_std": 0.2172512710094452, "rewards/accuracy_reward": 0.7513519525527954, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.194091796875, "step": 601 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 376.66668701171875, "epoch": 0.00582819413114405, "grad_norm": 1.5816681077155197, "kl": 0.032470703125, "learning_rate": 9.999164682382329e-07, "loss": 0.0013, "reward": 1.7491779327392578, "reward_std": 0.31197091937065125, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1575113981962204, "step": 602 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 441.41668701171875, "epoch": 0.005837875516743956, "grad_norm": 2.734370992564275, "kl": 0.02978515625, "learning_rate": 9.999161900388316e-07, "loss": 0.0012, "reward": 2.327897310256958, "reward_std": 0.27899035811424255, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.26123046875, "step": 603 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 388.79168701171875, "epoch": 0.005847556902343863, "grad_norm": 3.098436176864441, "kl": 0.036865234375, "learning_rate": 9.999159113769724e-07, "loss": 0.0015, "reward": 1.7404813766479492, "reward_std": 0.04273334890604019, "rewards/accuracy_reward": 0.444444477558136, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1877034604549408, "step": 604 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 377.3333435058594, "epoch": 0.00585723828794377, "grad_norm": 2.10023374073624, "kl": 0.0263671875, "learning_rate": 9.999156322526559e-07, "loss": 0.0011, "reward": 1.877579689025879, "reward_std": 0.2072339504957199, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1859130859375, "step": 605 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 443.0833435058594, "epoch": 0.005866919673543677, "grad_norm": 3.06501132425259, "kl": 0.038330078125, "learning_rate": 9.999153526658824e-07, "loss": 0.0015, "reward": 1.6373366117477417, "reward_std": 0.2685506045818329, "rewards/accuracy_reward": 0.2731602191925049, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2391764372587204, "step": 606 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 388.04168701171875, "epoch": 0.0058766010591435845, "grad_norm": 2.648936756338541, "kl": 0.040283203125, "learning_rate": 9.999150726166518e-07, "loss": 0.0016, "reward": 1.7577537298202515, "reward_std": 0.29170194268226624, "rewards/accuracy_reward": 0.41035789251327515, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2473958432674408, "step": 607 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 400.375, "epoch": 0.005886282444743492, "grad_norm": 1.5344564312614781, "kl": 0.0322265625, "learning_rate": 9.999147921049646e-07, "loss": 0.0013, "reward": 1.2456417083740234, "reward_std": 0.12906764447689056, "rewards/accuracy_reward": 0.10278688371181488, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0928548201918602, "step": 608 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 299.5, "epoch": 0.005895963830343399, "grad_norm": 1.7325900920324377, "kl": 0.0341796875, "learning_rate": 9.99914511130821e-07, "loss": 0.0014, "reward": 1.6337617635726929, "reward_std": 0.12305203080177307, "rewards/accuracy_reward": 0.35798054933547974, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2174479216337204, "step": 609 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.16668701171875, "epoch": 0.005905645215943306, "grad_norm": 2.6942264978374952, "kl": 0.033935546875, "learning_rate": 9.999142296942214e-07, "loss": 0.0014, "reward": 1.7703899145126343, "reward_std": 0.5301768183708191, "rewards/accuracy_reward": 0.44650477170944214, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1905517578125, "step": 610 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 442.04168701171875, "epoch": 0.005915326601543213, "grad_norm": 2.7093407231405764, "kl": 0.02783203125, "learning_rate": 9.99913947795166e-07, "loss": 0.0011, "reward": 1.7711427211761475, "reward_std": 0.42183348536491394, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.171142578125, "step": 611 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 502.7083435058594, "epoch": 0.00592500798714312, "grad_norm": 1.5858152764089277, "kl": 0.017333984375, "learning_rate": 9.999136654336547e-07, "loss": 0.0007, "reward": 1.2664194107055664, "reward_std": 0.5206137895584106, "rewards/accuracy_reward": 0.19820646941661835, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.168212890625, "step": 612 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 412.625, "epoch": 0.005934689372743027, "grad_norm": 3.123075385742047, "kl": 0.037353515625, "learning_rate": 9.999133826096884e-07, "loss": 0.0015, "reward": 2.231672763824463, "reward_std": 0.12359391152858734, "rewards/accuracy_reward": 0.8324661254882812, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.24920654296875, "step": 613 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 424.91668701171875, "epoch": 0.005944370758342934, "grad_norm": 12.126368559723364, "kl": 0.0380859375, "learning_rate": 9.999130993232667e-07, "loss": 0.0015, "reward": 1.8086433410644531, "reward_std": 0.136917844414711, "rewards/accuracy_reward": 0.36773356795310974, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3075765073299408, "step": 614 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 396.66668701171875, "epoch": 0.005954052143942841, "grad_norm": 2.267372574572739, "kl": 0.033203125, "learning_rate": 9.999128155743906e-07, "loss": 0.0013, "reward": 1.5013997554779053, "reward_std": 0.00969874206930399, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1013997420668602, "step": 615 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 378.0833435058594, "epoch": 0.005963733529542748, "grad_norm": 2.7638998100989824, "kl": 0.036376953125, "learning_rate": 9.999125313630597e-07, "loss": 0.0015, "reward": 1.7198314666748047, "reward_std": 0.2129468321800232, "rewards/accuracy_reward": 0.43731188774108887, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1658528745174408, "step": 616 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 389.7083435058594, "epoch": 0.005973414915142655, "grad_norm": 1.2052384011207855, "kl": 0.03369140625, "learning_rate": 9.999122466892744e-07, "loss": 0.0013, "reward": 1.3066532611846924, "reward_std": 0.04686928540468216, "rewards/accuracy_reward": 0.14874304831027985, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.10791015625, "step": 617 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 371.79168701171875, "epoch": 0.005983096300742562, "grad_norm": 1.5564676392844432, "kl": 0.032958984375, "learning_rate": 9.999119615530355e-07, "loss": 0.0013, "reward": 1.4832032918930054, "reward_std": 0.3034476041793823, "rewards/accuracy_reward": 0.30014652013778687, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.133056640625, "step": 618 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 390.8333435058594, "epoch": 0.005992777686342469, "grad_norm": 1.8315967931622992, "kl": 0.033447265625, "learning_rate": 9.999116759543425e-07, "loss": 0.0013, "reward": 1.8626275062561035, "reward_std": 0.059316836297512054, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2070719450712204, "step": 619 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 431.625, "epoch": 0.0060024590719423765, "grad_norm": 1.9529442357026352, "kl": 0.0263671875, "learning_rate": 9.99911389893196e-07, "loss": 0.0011, "reward": 1.4524253606796265, "reward_std": 0.3600965142250061, "rewards/accuracy_reward": 0.22717301547527313, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1335856169462204, "step": 620 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 422.5833435058594, "epoch": 0.006012140457542284, "grad_norm": 2.323792784993549, "kl": 0.035400390625, "learning_rate": 9.999111033695964e-07, "loss": 0.0014, "reward": 1.7286027669906616, "reward_std": 0.363310307264328, "rewards/accuracy_reward": 0.4593157172203064, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1859537810087204, "step": 621 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 456.5, "epoch": 0.006021821843142191, "grad_norm": 2.4438909753396922, "kl": 0.0218505859375, "learning_rate": 9.99910816383544e-07, "loss": 0.0009, "reward": 1.7946590185165405, "reward_std": 0.45074719190597534, "rewards/accuracy_reward": 0.49523672461509705, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2410888671875, "step": 622 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 388.25, "epoch": 0.006031503228742098, "grad_norm": 5.475935839382076, "kl": 0.05810546875, "learning_rate": 9.999105289350386e-07, "loss": 0.0023, "reward": 1.8831968307495117, "reward_std": 0.08468981832265854, "rewards/accuracy_reward": 0.599545955657959, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2003173828125, "step": 623 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 461.75, "epoch": 0.006041184614342005, "grad_norm": 2.5513112896545116, "kl": 0.0302734375, "learning_rate": 9.999102410240809e-07, "loss": 0.0012, "reward": 1.9307916164398193, "reward_std": 0.36258333921432495, "rewards/accuracy_reward": 0.6805555820465088, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2002360075712204, "step": 624 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 386.625, "epoch": 0.006050865999941911, "grad_norm": 1.751444352729282, "kl": 0.03173828125, "learning_rate": 9.999099526506709e-07, "loss": 0.0013, "reward": 1.560609221458435, "reward_std": 0.3129345178604126, "rewards/accuracy_reward": 0.3472222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1383870542049408, "step": 625 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.16668701171875, "epoch": 0.006060547385541818, "grad_norm": 2.1482652263494173, "kl": 0.033935546875, "learning_rate": 9.999096638148092e-07, "loss": 0.0014, "reward": 1.6185382604599, "reward_std": 0.4195534586906433, "rewards/accuracy_reward": 0.31581196188926697, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2110595703125, "step": 626 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 352.625, "epoch": 0.0060702287711417256, "grad_norm": 1.1614005478600589, "kl": 0.040771484375, "learning_rate": 9.999093745164958e-07, "loss": 0.0016, "reward": 1.1955676078796387, "reward_std": 0.2106931507587433, "rewards/accuracy_reward": 0.111111119389534, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0677897185087204, "step": 627 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 396.04168701171875, "epoch": 0.006079910156741633, "grad_norm": 3.301102687501154, "kl": 0.0301513671875, "learning_rate": 9.99909084755731e-07, "loss": 0.0012, "reward": 2.23018217086792, "reward_std": 0.11589986085891724, "rewards/accuracy_reward": 0.7394349575042725, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3490804135799408, "step": 628 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 374.2083435058594, "epoch": 0.00608959154234154, "grad_norm": 5.225749753265009, "kl": 0.03466796875, "learning_rate": 9.999087945325153e-07, "loss": 0.0014, "reward": 1.8316841125488281, "reward_std": 0.3856726586818695, "rewards/accuracy_reward": 0.4916694164276123, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2233479917049408, "step": 629 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 433.41668701171875, "epoch": 0.006099272927941447, "grad_norm": 2.8963745829487384, "kl": 0.029296875, "learning_rate": 9.999085038468485e-07, "loss": 0.0012, "reward": 1.610774278640747, "reward_std": 0.27565503120422363, "rewards/accuracy_reward": 0.36287379264831543, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1812337338924408, "step": 630 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 376.16668701171875, "epoch": 0.006108954313541354, "grad_norm": 3.1647347079411325, "kl": 0.04150390625, "learning_rate": 9.999082126987313e-07, "loss": 0.0017, "reward": 2.086017370223999, "reward_std": 0.08744688332080841, "rewards/accuracy_reward": 0.6623601317405701, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2486572265625, "step": 631 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 483.375, "epoch": 0.006118635699141261, "grad_norm": 0.4416798908268056, "kl": 0.0238037109375, "learning_rate": 9.999079210881638e-07, "loss": 0.001, "reward": 0.9166666865348816, "reward_std": 0.15430335700511932, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.0, "step": 632 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 379.0, "epoch": 0.006128317084741168, "grad_norm": 3.6153065314636943, "kl": 0.03271484375, "learning_rate": 9.999076290151462e-07, "loss": 0.0013, "reward": 2.0553603172302246, "reward_std": 0.09673541784286499, "rewards/accuracy_reward": 0.6064262390136719, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2822672724723816, "step": 633 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 431.25, "epoch": 0.0061379984703410755, "grad_norm": 2.6060638046193865, "kl": 0.026611328125, "learning_rate": 9.999073364796789e-07, "loss": 0.0011, "reward": 2.0896518230438232, "reward_std": 0.3036274015903473, "rewards/accuracy_reward": 0.6804639101028442, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2925211787223816, "step": 634 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 382.66668701171875, "epoch": 0.006147679855940983, "grad_norm": 8.02831509380435, "kl": 0.04150390625, "learning_rate": 9.999070434817622e-07, "loss": 0.0017, "reward": 1.6431739330291748, "reward_std": 0.04810967296361923, "rewards/accuracy_reward": 0.2779069244861603, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.23193359375, "step": 635 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 413.54168701171875, "epoch": 0.006157361241540889, "grad_norm": 1.3574545327808767, "kl": 0.03466796875, "learning_rate": 9.999067500213963e-07, "loss": 0.0014, "reward": 1.518896460533142, "reward_std": 0.011334951967000961, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.118896484375, "step": 636 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 435.79168701171875, "epoch": 0.006167042627140796, "grad_norm": 1.5998895690759043, "kl": 0.0250244140625, "learning_rate": 9.999064560985814e-07, "loss": 0.001, "reward": 1.5333325862884521, "reward_std": 0.21740296483039856, "rewards/accuracy_reward": 0.3934480845928192, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1315511167049408, "step": 637 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 402.125, "epoch": 0.006176724012740703, "grad_norm": 1.9638289568727012, "kl": 0.038330078125, "learning_rate": 9.999061617133177e-07, "loss": 0.0015, "reward": 1.647078514099121, "reward_std": 0.05830862373113632, "rewards/accuracy_reward": 0.32997235655784607, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.200439453125, "step": 638 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 383.0833435058594, "epoch": 0.00618640539834061, "grad_norm": 1.5304366517139278, "kl": 0.042236328125, "learning_rate": 9.999058668656059e-07, "loss": 0.0017, "reward": 1.250123143196106, "reward_std": 0.024762485176324844, "rewards/accuracy_reward": 0.09759216010570526, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0941975936293602, "step": 639 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 379.8333435058594, "epoch": 0.0061960867839405175, "grad_norm": 2.7861391189656253, "kl": 0.032470703125, "learning_rate": 9.999055715554458e-07, "loss": 0.0013, "reward": 1.7457585334777832, "reward_std": 0.14863364398479462, "rewards/accuracy_reward": 0.33421069383621216, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2532145380973816, "step": 640 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 474.7083435058594, "epoch": 0.006205768169540425, "grad_norm": 2.4009154436915265, "kl": 0.0283203125, "learning_rate": 9.99905275782838e-07, "loss": 0.0011, "reward": 1.657265067100525, "reward_std": 0.18991810083389282, "rewards/accuracy_reward": 0.4095844030380249, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1643473356962204, "step": 641 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 506.0833435058594, "epoch": 0.006215449555140332, "grad_norm": 2.470711118761101, "kl": 0.02392578125, "learning_rate": 9.999049795477823e-07, "loss": 0.001, "reward": 1.4232391119003296, "reward_std": 0.20198635756969452, "rewards/accuracy_reward": 0.23857924342155457, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1513265073299408, "step": 642 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 349.54168701171875, "epoch": 0.006225130940740239, "grad_norm": 1.4814582620520989, "kl": 0.044189453125, "learning_rate": 9.999046828502796e-07, "loss": 0.0018, "reward": 1.247994303703308, "reward_std": 0.04671882092952728, "rewards/accuracy_reward": 0.08617787063121796, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.11181640625, "step": 643 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 406.2083435058594, "epoch": 0.006234812326340146, "grad_norm": 1.3871629570297321, "kl": 0.02685546875, "learning_rate": 9.999043856903298e-07, "loss": 0.0011, "reward": 1.847550630569458, "reward_std": 0.255105197429657, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1558837890625, "step": 644 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 423.2083435058594, "epoch": 0.006244493711940053, "grad_norm": 2.037280271272928, "kl": 0.02587890625, "learning_rate": 9.999040880679333e-07, "loss": 0.001, "reward": 1.7051215171813965, "reward_std": 0.10670262575149536, "rewards/accuracy_reward": 0.381944477558136, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.21484375, "step": 645 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 499.41668701171875, "epoch": 0.00625417509753996, "grad_norm": 2.497010421420726, "kl": 0.0208740234375, "learning_rate": 9.999037899830902e-07, "loss": 0.0008, "reward": 1.772808313369751, "reward_std": 0.3099118769168854, "rewards/accuracy_reward": 0.5972222685813904, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1839192807674408, "step": 646 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 449.0833435058594, "epoch": 0.006263856483139867, "grad_norm": 3.8557004454379173, "kl": 0.036376953125, "learning_rate": 9.99903491435801e-07, "loss": 0.0015, "reward": 1.9863051176071167, "reward_std": 0.3328849971294403, "rewards/accuracy_reward": 0.5996677875518799, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2616373896598816, "step": 647 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.79168701171875, "epoch": 0.006273537868739774, "grad_norm": 3.3837366574152576, "kl": 0.04052734375, "learning_rate": 9.99903192426066e-07, "loss": 0.0016, "reward": 1.877199411392212, "reward_std": 0.09017523378133774, "rewards/accuracy_reward": 0.3788839280605316, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3149821162223816, "step": 648 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 438.91668701171875, "epoch": 0.006283219254339681, "grad_norm": 2.318928407338465, "kl": 0.033447265625, "learning_rate": 9.999028929538852e-07, "loss": 0.0013, "reward": 1.6917026042938232, "reward_std": 0.05250316113233566, "rewards/accuracy_reward": 0.3502717614173889, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2164306640625, "step": 649 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 382.875, "epoch": 0.006292900639939588, "grad_norm": 6.617295632265885, "kl": 0.049072265625, "learning_rate": 9.999025930192591e-07, "loss": 0.002, "reward": 1.9720759391784668, "reward_std": 0.09661602973937988, "rewards/accuracy_reward": 0.5303034782409668, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2751058042049408, "step": 650 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 397.16668701171875, "epoch": 0.006302582025539495, "grad_norm": 5.255325961872831, "kl": 0.04296875, "learning_rate": 9.999022926221878e-07, "loss": 0.0017, "reward": 2.1568429470062256, "reward_std": 0.11558522284030914, "rewards/accuracy_reward": 0.7663480043411255, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2571614682674408, "step": 651 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 418.29168701171875, "epoch": 0.006312263411139402, "grad_norm": 5.5293082062700485, "kl": 0.0255126953125, "learning_rate": 9.999019917626717e-07, "loss": 0.001, "reward": 2.004134178161621, "reward_std": 0.22352075576782227, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19580078125, "step": 652 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 363.91668701171875, "epoch": 0.006321944796739309, "grad_norm": 3.031016107799563, "kl": 0.039306640625, "learning_rate": 9.999016904407112e-07, "loss": 0.0016, "reward": 2.053081512451172, "reward_std": 0.3019014298915863, "rewards/accuracy_reward": 0.7222222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2141927182674408, "step": 653 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 396.4583435058594, "epoch": 0.0063316261823392166, "grad_norm": 1.4927475705623423, "kl": 0.0302734375, "learning_rate": 9.999013886563064e-07, "loss": 0.0012, "reward": 1.6908339262008667, "reward_std": 0.043198734521865845, "rewards/accuracy_reward": 0.424818217754364, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.166015625, "step": 654 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 394.625, "epoch": 0.006341307567939124, "grad_norm": 1.39099570886206, "kl": 0.031494140625, "learning_rate": 9.999010864094575e-07, "loss": 0.0013, "reward": 1.241377353668213, "reward_std": 0.024445435032248497, "rewards/accuracy_reward": 0.059761013835668564, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1149495467543602, "step": 655 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 421.91668701171875, "epoch": 0.006350988953539031, "grad_norm": 1.564610851075758, "kl": 0.021728515625, "learning_rate": 9.999007837001652e-07, "loss": 0.0009, "reward": 1.3182704448699951, "reward_std": 0.2872394025325775, "rewards/accuracy_reward": 0.1999599188566208, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1016438826918602, "step": 656 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 490.4583435058594, "epoch": 0.006360670339138938, "grad_norm": 2.262553749545505, "kl": 0.020263671875, "learning_rate": 9.999004805284293e-07, "loss": 0.0008, "reward": 1.6014323234558105, "reward_std": 0.4710674285888672, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.134765625, "step": 657 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 308.375, "epoch": 0.006370351724738844, "grad_norm": 2.9109373311146256, "kl": 0.03466796875, "learning_rate": 9.999001768942501e-07, "loss": 0.0014, "reward": 1.6886394023895264, "reward_std": 0.08151769638061523, "rewards/accuracy_reward": 0.4052896499633789, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2250162810087204, "step": 658 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 459.3333435058594, "epoch": 0.006380033110338751, "grad_norm": 3.454785669871547, "kl": 0.0283203125, "learning_rate": 9.998998727976282e-07, "loss": 0.0011, "reward": 0.9070557355880737, "reward_std": 0.38782525062561035, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.0237223319709301, "step": 659 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 363.0833435058594, "epoch": 0.0063897144959386585, "grad_norm": 2.5702915068392995, "kl": 0.0296630859375, "learning_rate": 9.998995682385637e-07, "loss": 0.0012, "reward": 2.2231805324554443, "reward_std": 0.22649767994880676, "rewards/accuracy_reward": 0.7477814555168152, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3253987729549408, "step": 660 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 378.91668701171875, "epoch": 0.006399395881538566, "grad_norm": 7.342088128418087, "kl": 0.03564453125, "learning_rate": 9.99899263217057e-07, "loss": 0.0014, "reward": 1.569104552268982, "reward_std": 0.23184603452682495, "rewards/accuracy_reward": 0.2598516643047333, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1925862729549408, "step": 661 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 516.3333740234375, "epoch": 0.006409077267138473, "grad_norm": 19.70827389637105, "kl": 0.01708984375, "learning_rate": 9.998989577331084e-07, "loss": 0.0007, "reward": 1.6859325170516968, "reward_std": 0.5268838405609131, "rewards/accuracy_reward": 0.5241506099700928, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.128448486328125, "step": 662 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 378.0833435058594, "epoch": 0.00641875865273838, "grad_norm": 2.210005859983082, "kl": 0.033447265625, "learning_rate": 9.998986517867179e-07, "loss": 0.0013, "reward": 1.6764681339263916, "reward_std": 0.19715271890163422, "rewards/accuracy_reward": 0.4225047826766968, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1456298828125, "step": 663 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 339.3333435058594, "epoch": 0.006428440038338287, "grad_norm": 2.299769416368395, "kl": 0.036865234375, "learning_rate": 9.99898345377886e-07, "loss": 0.0015, "reward": 1.8515055179595947, "reward_std": 0.41800743341445923, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2098388671875, "step": 664 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 352.625, "epoch": 0.006438121423938194, "grad_norm": 1.1140581603108313, "kl": 0.029052734375, "learning_rate": 9.99898038506613e-07, "loss": 0.0012, "reward": 1.3209228515625, "reward_std": 0.26635849475860596, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0709228515625, "step": 665 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 410.5833435058594, "epoch": 0.006447802809538101, "grad_norm": 2.1708971643382364, "kl": 0.036865234375, "learning_rate": 9.99897731172899e-07, "loss": 0.0015, "reward": 1.4462828636169434, "reward_std": 0.41256293654441833, "rewards/accuracy_reward": 0.26728716492652893, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1373291015625, "step": 666 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 422.125, "epoch": 0.0064574841951380085, "grad_norm": 1.9990435927236665, "kl": 0.0250244140625, "learning_rate": 9.998974233767444e-07, "loss": 0.001, "reward": 2.115464925765991, "reward_std": 0.4487912654876709, "rewards/accuracy_reward": 0.7246608138084412, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2408040463924408, "step": 667 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 415.0, "epoch": 0.006467165580737916, "grad_norm": 2.5443956571212647, "kl": 0.0284423828125, "learning_rate": 9.998971151181494e-07, "loss": 0.0011, "reward": 1.6264300346374512, "reward_std": 0.0672127902507782, "rewards/accuracy_reward": 0.3393044173717499, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1787923276424408, "step": 668 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 389.125, "epoch": 0.006476846966337822, "grad_norm": 2.9229021587226405, "kl": 0.0283203125, "learning_rate": 9.998968063971147e-07, "loss": 0.0011, "reward": 1.9834922552108765, "reward_std": 0.22318938374519348, "rewards/accuracy_reward": 0.6050823926925659, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2450765073299408, "step": 669 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 304.7083435058594, "epoch": 0.006486528351937729, "grad_norm": 1.9758197555321992, "kl": 0.0274658203125, "learning_rate": 9.9989649721364e-07, "loss": 0.0011, "reward": 1.9130535125732422, "reward_std": 0.22707507014274597, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2130533903837204, "step": 670 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 339.91668701171875, "epoch": 0.006496209737537636, "grad_norm": 5.368715411454602, "kl": 0.041015625, "learning_rate": 9.99896187567726e-07, "loss": 0.0016, "reward": 1.874272346496582, "reward_std": 0.2137221395969391, "rewards/accuracy_reward": 0.41068166494369507, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.346923828125, "step": 671 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 351.5833435058594, "epoch": 0.006505891123137543, "grad_norm": 8.792123879735431, "kl": 0.034423828125, "learning_rate": 9.998958774593728e-07, "loss": 0.0014, "reward": 2.1291096210479736, "reward_std": 0.5281264185905457, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2541097104549408, "step": 672 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 368.125, "epoch": 0.0065155725087374505, "grad_norm": 11.382764009184699, "kl": 0.03564453125, "learning_rate": 9.998955668885807e-07, "loss": 0.0014, "reward": 1.7609633207321167, "reward_std": 0.19064557552337646, "rewards/accuracy_reward": 0.42857396602630615, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1990559995174408, "step": 673 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 480.16668701171875, "epoch": 0.006525253894337358, "grad_norm": 7.189842463338324, "kl": 0.021728515625, "learning_rate": 9.9989525585535e-07, "loss": 0.0009, "reward": 1.7740046977996826, "reward_std": 0.27267998456954956, "rewards/accuracy_reward": 0.5052382349967957, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1937662810087204, "step": 674 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 434.79168701171875, "epoch": 0.006534935279937265, "grad_norm": 3.5998341718556657, "kl": 0.024169921875, "learning_rate": 9.99894944359681e-07, "loss": 0.001, "reward": 1.642282485961914, "reward_std": 0.49378257989883423, "rewards/accuracy_reward": 0.3888889253139496, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1867268979549408, "step": 675 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 373.2083435058594, "epoch": 0.006544616665537172, "grad_norm": 3.3857469798652957, "kl": 0.031982421875, "learning_rate": 9.99894632401574e-07, "loss": 0.0013, "reward": 1.9332398176193237, "reward_std": 0.07575526088476181, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1749064177274704, "step": 676 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 415.0833435058594, "epoch": 0.006554298051137079, "grad_norm": 4.727182011649817, "kl": 0.02880859375, "learning_rate": 9.998943199810293e-07, "loss": 0.0012, "reward": 2.0490529537200928, "reward_std": 0.10787476599216461, "rewards/accuracy_reward": 0.540092945098877, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3506266474723816, "step": 677 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 371.125, "epoch": 0.006563979436736986, "grad_norm": 4.997186350288423, "kl": 0.037353515625, "learning_rate": 9.998940070980472e-07, "loss": 0.0015, "reward": 2.2224040031433105, "reward_std": 0.07737760990858078, "rewards/accuracy_reward": 0.771476149559021, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2842610776424408, "step": 678 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 373.7083435058594, "epoch": 0.006573660822336892, "grad_norm": 3.31402311482459, "kl": 0.037841796875, "learning_rate": 9.998936937526278e-07, "loss": 0.0015, "reward": 2.096306800842285, "reward_std": 0.11051150411367416, "rewards/accuracy_reward": 0.7205090522766113, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.234130859375, "step": 679 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 426.4583435058594, "epoch": 0.0065833422079367996, "grad_norm": 1.368964585957272, "kl": 0.0198974609375, "learning_rate": 9.998933799447716e-07, "loss": 0.0008, "reward": 1.495516061782837, "reward_std": 0.18709373474121094, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1121826171875, "step": 680 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 389.375, "epoch": 0.006593023593536707, "grad_norm": 3.5566824829753556, "kl": 0.036376953125, "learning_rate": 9.998930656744789e-07, "loss": 0.0015, "reward": 1.914255142211914, "reward_std": 0.195293590426445, "rewards/accuracy_reward": 0.5972222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2086995542049408, "step": 681 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 352.41668701171875, "epoch": 0.006602704979136614, "grad_norm": 10.577621464444773, "kl": 0.04248046875, "learning_rate": 9.9989275094175e-07, "loss": 0.0017, "reward": 1.85791015625, "reward_std": 0.19235914945602417, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1495768278837204, "step": 682 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 479.8333435058594, "epoch": 0.006612386364736521, "grad_norm": 1.5035755834700206, "kl": 0.012939453125, "learning_rate": 9.99892435746585e-07, "loss": 0.0005, "reward": 2.0241048336029053, "reward_std": 0.6886278390884399, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2491048276424408, "step": 683 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 372.4583435058594, "epoch": 0.006622067750336428, "grad_norm": 1.8801466482122706, "kl": 0.04345703125, "learning_rate": 9.998921200889844e-07, "loss": 0.0017, "reward": 1.4909753799438477, "reward_std": 0.04974628612399101, "rewards/accuracy_reward": 0.18840375542640686, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775716245174408, "step": 684 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 361.2083435058594, "epoch": 0.006631749135936335, "grad_norm": 7.719246777182342, "kl": 0.04931640625, "learning_rate": 9.998918039689483e-07, "loss": 0.002, "reward": 1.5214238166809082, "reward_std": 0.044960103929042816, "rewards/accuracy_reward": 0.2387007772922516, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1660563200712204, "step": 685 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 476.2083435058594, "epoch": 0.006641430521536242, "grad_norm": 2.2508211001928484, "kl": 0.0211181640625, "learning_rate": 9.99891487386477e-07, "loss": 0.0008, "reward": 1.6270751953125, "reward_std": 0.5632498264312744, "rewards/accuracy_reward": 0.40833336114883423, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1687418669462204, "step": 686 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 440.8333435058594, "epoch": 0.0066511119071361495, "grad_norm": 2.361816797141016, "kl": 0.03271484375, "learning_rate": 9.99891170341571e-07, "loss": 0.0013, "reward": 2.1553549766540527, "reward_std": 0.05453037843108177, "rewards/accuracy_reward": 0.7139077186584473, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2747802734375, "step": 687 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 431.875, "epoch": 0.006660793292736057, "grad_norm": 2.455337787627319, "kl": 0.03125, "learning_rate": 9.998908528342305e-07, "loss": 0.0012, "reward": 1.5423139333724976, "reward_std": 0.04656847566366196, "rewards/accuracy_reward": 0.23698340356349945, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1719970703125, "step": 688 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 401.75, "epoch": 0.006670474678335964, "grad_norm": 1.5943696876052162, "kl": 0.0308837890625, "learning_rate": 9.998905348644557e-07, "loss": 0.0012, "reward": 1.687485933303833, "reward_std": 0.20842936635017395, "rewards/accuracy_reward": 0.39254769682884216, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.228271484375, "step": 689 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.16668701171875, "epoch": 0.00668015606393587, "grad_norm": 2.804730751194235, "kl": 0.043212890625, "learning_rate": 9.99890216432247e-07, "loss": 0.0017, "reward": 1.6040842533111572, "reward_std": 0.3436889350414276, "rewards/accuracy_reward": 0.24452196061611176, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2595621943473816, "step": 690 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 419.29168701171875, "epoch": 0.006689837449535777, "grad_norm": 2.4979174149814036, "kl": 0.02978515625, "learning_rate": 9.998898975376049e-07, "loss": 0.0012, "reward": 2.016650676727295, "reward_std": 0.18068985641002655, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.191650390625, "step": 691 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 434.66668701171875, "epoch": 0.006699518835135684, "grad_norm": 1.117627655886059, "kl": 0.028076171875, "learning_rate": 9.998895781805292e-07, "loss": 0.0011, "reward": 1.3656575679779053, "reward_std": 0.5059788227081299, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0906575545668602, "step": 692 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 392.0, "epoch": 0.0067092002207355915, "grad_norm": 1.8623306125854402, "kl": 0.0306396484375, "learning_rate": 9.998892583610204e-07, "loss": 0.0012, "reward": 1.8546956777572632, "reward_std": 0.550574541091919, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1880289763212204, "step": 693 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 376.75, "epoch": 0.006718881606335499, "grad_norm": 1.8892273625363338, "kl": 0.0245361328125, "learning_rate": 9.998889380790788e-07, "loss": 0.001, "reward": 2.2731094360351562, "reward_std": 0.23863057792186737, "rewards/accuracy_reward": 0.8472222089767456, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2842203974723816, "step": 694 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 409.16668701171875, "epoch": 0.006728562991935406, "grad_norm": 4.695355609589592, "kl": 0.04345703125, "learning_rate": 9.998886173347051e-07, "loss": 0.0017, "reward": 1.4335426092147827, "reward_std": 0.07058023661375046, "rewards/accuracy_reward": 0.18592703342437744, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1392822265625, "step": 695 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 466.9583435058594, "epoch": 0.006738244377535313, "grad_norm": 2.7271857747644996, "kl": 0.0235595703125, "learning_rate": 9.99888296127899e-07, "loss": 0.0009, "reward": 1.349233627319336, "reward_std": 0.18800315260887146, "rewards/accuracy_reward": 0.18267276883125305, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0998942106962204, "step": 696 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 379.9583435058594, "epoch": 0.00674792576313522, "grad_norm": 2.8860561028171916, "kl": 0.04150390625, "learning_rate": 9.99887974458661e-07, "loss": 0.0017, "reward": 1.7779107093811035, "reward_std": 0.20857737958431244, "rewards/accuracy_reward": 0.513888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1640218198299408, "step": 697 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 401.41668701171875, "epoch": 0.006757607148735127, "grad_norm": 2.715460515351917, "kl": 0.0419921875, "learning_rate": 9.998876523269916e-07, "loss": 0.0017, "reward": 2.109114170074463, "reward_std": 0.31838279962539673, "rewards/accuracy_reward": 0.7014622092247009, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.290985107421875, "step": 698 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 571.5, "epoch": 0.006767288534335034, "grad_norm": 2.769880132219636, "kl": 0.019287109375, "learning_rate": 9.998873297328909e-07, "loss": 0.0008, "reward": 1.1395924091339111, "reward_std": 0.29159480333328247, "rewards/accuracy_reward": 0.17867928743362427, "rewards/format_reward": 0.7916666865348816, "rewards/semantic_reward": 0.1025797575712204, "step": 699 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 424.125, "epoch": 0.0067769699199349415, "grad_norm": 3.3809776857351745, "kl": 0.04150390625, "learning_rate": 9.99887006676359e-07, "loss": 0.0017, "reward": 2.098867177963257, "reward_std": 0.10296797752380371, "rewards/accuracy_reward": 0.6318098902702332, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3170573115348816, "step": 700 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 482.9583435058594, "epoch": 0.006786651305534848, "grad_norm": 3.3640982395603998, "kl": 0.027099609375, "learning_rate": 9.998866831573967e-07, "loss": 0.0011, "reward": 1.4475154876708984, "reward_std": 0.17716118693351746, "rewards/accuracy_reward": 0.2273494005203247, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.170166015625, "step": 701 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 421.91668701171875, "epoch": 0.006796332691134755, "grad_norm": 2.402932682913642, "kl": 0.038818359375, "learning_rate": 9.998863591760038e-07, "loss": 0.0015, "reward": 1.9084091186523438, "reward_std": 0.27342256903648376, "rewards/accuracy_reward": 0.567507266998291, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159017026424408, "step": 702 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 460.2083435058594, "epoch": 0.006806014076734662, "grad_norm": 2.155967032874734, "kl": 0.0279541015625, "learning_rate": 9.99886034732181e-07, "loss": 0.0011, "reward": 1.9607865810394287, "reward_std": 0.3693581223487854, "rewards/accuracy_reward": 0.6257603168487549, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2350260466337204, "step": 703 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 395.9583435058594, "epoch": 0.006815695462334569, "grad_norm": 3.0672100505315245, "kl": 0.040283203125, "learning_rate": 9.998857098259284e-07, "loss": 0.0016, "reward": 1.7437968254089355, "reward_std": 0.16175740957260132, "rewards/accuracy_reward": 0.4551168382167816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1886800229549408, "step": 704 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 416.3333435058594, "epoch": 0.006825376847934476, "grad_norm": 1.7702933581237972, "kl": 0.03271484375, "learning_rate": 9.998853844572463e-07, "loss": 0.0013, "reward": 1.772189974784851, "reward_std": 0.051654644310474396, "rewards/accuracy_reward": 0.4700576663017273, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1854654997587204, "step": 705 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 454.91668701171875, "epoch": 0.006835058233534383, "grad_norm": 1.9942536829155064, "kl": 0.0262451171875, "learning_rate": 9.998850586261352e-07, "loss": 0.001, "reward": 2.055490493774414, "reward_std": 0.46838849782943726, "rewards/accuracy_reward": 0.6492405533790588, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.28125, "step": 706 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 502.91668701171875, "epoch": 0.006844739619134291, "grad_norm": 5.355579271794181, "kl": 0.0308837890625, "learning_rate": 9.99884732332595e-07, "loss": 0.0012, "reward": 1.7031428813934326, "reward_std": 0.44241759181022644, "rewards/accuracy_reward": 0.46210283041000366, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2327067106962204, "step": 707 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 415.125, "epoch": 0.006854421004734198, "grad_norm": 3.9096722120178384, "kl": 0.05322265625, "learning_rate": 9.998844055766262e-07, "loss": 0.0021, "reward": 2.049056053161621, "reward_std": 0.0635722428560257, "rewards/accuracy_reward": 0.6456623673439026, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2283935546875, "step": 708 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 400.0, "epoch": 0.006864102390334105, "grad_norm": 3.1788884612557173, "kl": 0.05126953125, "learning_rate": 9.998840783582295e-07, "loss": 0.0021, "reward": 1.7796123027801514, "reward_std": 0.2175447642803192, "rewards/accuracy_reward": 0.516314685344696, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.171630859375, "step": 709 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 385.25, "epoch": 0.006873783775934012, "grad_norm": 2.298234471320188, "kl": 0.0322265625, "learning_rate": 9.998837506774045e-07, "loss": 0.0013, "reward": 1.870065689086914, "reward_std": 0.23266810178756714, "rewards/accuracy_reward": 0.5972222685813904, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2061767578125, "step": 710 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 462.375, "epoch": 0.006883465161533919, "grad_norm": 5.791734477809008, "kl": 0.031982421875, "learning_rate": 9.998834225341518e-07, "loss": 0.0013, "reward": 1.6388791799545288, "reward_std": 0.38095328211784363, "rewards/accuracy_reward": 0.3515133261680603, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1956990659236908, "step": 711 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 449.29168701171875, "epoch": 0.006893146547133825, "grad_norm": 2.7399120427477057, "kl": 0.031982421875, "learning_rate": 9.99883093928472e-07, "loss": 0.0013, "reward": 2.2128071784973145, "reward_std": 0.13078731298446655, "rewards/accuracy_reward": 0.7457660436630249, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3003743588924408, "step": 712 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 397.8333435058594, "epoch": 0.0069028279327337325, "grad_norm": 3.392092622512227, "kl": 0.044189453125, "learning_rate": 9.99882764860365e-07, "loss": 0.0018, "reward": 1.961472749710083, "reward_std": 0.1665472388267517, "rewards/accuracy_reward": 0.49052533507347107, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3042806088924408, "step": 713 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 435.875, "epoch": 0.00691250931833364, "grad_norm": 1.5057858731970546, "kl": 0.038818359375, "learning_rate": 9.99882435329831e-07, "loss": 0.0016, "reward": 1.6462305784225464, "reward_std": 0.2917177081108093, "rewards/accuracy_reward": 0.45117849111557007, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13671875, "step": 714 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 433.91668701171875, "epoch": 0.006922190703933547, "grad_norm": 1.2622116394437257, "kl": 0.031005859375, "learning_rate": 9.99882105336871e-07, "loss": 0.0012, "reward": 1.3679752349853516, "reward_std": 0.3080826699733734, "rewards/accuracy_reward": 0.19341468811035156, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1328938901424408, "step": 715 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 421.375, "epoch": 0.006931872089533454, "grad_norm": 5.676105620854893, "kl": 0.02880859375, "learning_rate": 9.998817748814846e-07, "loss": 0.0011, "reward": 2.3087403774261475, "reward_std": 0.20919740200042725, "rewards/accuracy_reward": 0.9583333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2504069209098816, "step": 716 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 381.0833435058594, "epoch": 0.006941553475133361, "grad_norm": 3.309790570778655, "kl": 0.035400390625, "learning_rate": 9.998814439636726e-07, "loss": 0.0014, "reward": 2.461246967315674, "reward_std": 0.10611580312252045, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.319580078125, "step": 717 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 451.4583435058594, "epoch": 0.006951234860733268, "grad_norm": 1.5580565448648742, "kl": 0.0296630859375, "learning_rate": 9.99881112583435e-07, "loss": 0.0012, "reward": 1.9457683563232422, "reward_std": 0.5925931930541992, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2041015625, "step": 718 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 394.3333435058594, "epoch": 0.006960916246333175, "grad_norm": 2.1792645602039786, "kl": 0.0400390625, "learning_rate": 9.998807807407721e-07, "loss": 0.0016, "reward": 1.819422721862793, "reward_std": 0.08718245476484299, "rewards/accuracy_reward": 0.4543266296386719, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2650960385799408, "step": 719 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 420.7083435058594, "epoch": 0.0069705976319330825, "grad_norm": 1.463686555467493, "kl": 0.0311279296875, "learning_rate": 9.998804484356845e-07, "loss": 0.0012, "reward": 1.958650827407837, "reward_std": 0.052440233528614044, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2003173828125, "step": 720 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 379.4583435058594, "epoch": 0.00698027901753299, "grad_norm": 4.78766738446806, "kl": 0.0498046875, "learning_rate": 9.99880115668172e-07, "loss": 0.002, "reward": 2.112959146499634, "reward_std": 0.07616014778614044, "rewards/accuracy_reward": 0.6559360027313232, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2820231318473816, "step": 721 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 392.625, "epoch": 0.006989960403132897, "grad_norm": 1.2299442610811102, "kl": 0.035400390625, "learning_rate": 9.998797824382352e-07, "loss": 0.0014, "reward": 1.3590171337127686, "reward_std": 0.4961298406124115, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.07568359375, "step": 722 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 358.25, "epoch": 0.006999641788732803, "grad_norm": 1.6968183227962454, "kl": 0.042724609375, "learning_rate": 9.998794487458747e-07, "loss": 0.0017, "reward": 1.480322241783142, "reward_std": 0.0076554263941943645, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.080322265625, "step": 723 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 451.3333435058594, "epoch": 0.00700932317433271, "grad_norm": 2.2616202829924332, "kl": 0.0242919921875, "learning_rate": 9.998791145910903e-07, "loss": 0.001, "reward": 1.9800944328308105, "reward_std": 0.17680500447750092, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.263427734375, "step": 724 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 449.16668701171875, "epoch": 0.007019004559932617, "grad_norm": 2.627867482514727, "kl": 0.028076171875, "learning_rate": 9.998787799738826e-07, "loss": 0.0011, "reward": 2.030792236328125, "reward_std": 0.383673757314682, "rewards/accuracy_reward": 0.6678363680839539, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2379557341337204, "step": 725 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 489.0833435058594, "epoch": 0.0070286859455325245, "grad_norm": 0.5200750117982116, "kl": 0.027587890625, "learning_rate": 9.99878444894252e-07, "loss": 0.0011, "reward": 1.053011178970337, "reward_std": 0.37622377276420593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.044677734375, "step": 726 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 434.66668701171875, "epoch": 0.007038367331132432, "grad_norm": 15.215828168838536, "kl": 0.02587890625, "learning_rate": 9.998781093521985e-07, "loss": 0.001, "reward": 2.231907844543457, "reward_std": 0.24822857975959778, "rewards/accuracy_reward": 0.7987207770347595, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3081868588924408, "step": 727 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.4583435058594, "epoch": 0.007048048716732339, "grad_norm": 4.117718357521948, "kl": 0.041015625, "learning_rate": 9.998777733477224e-07, "loss": 0.0016, "reward": 1.7630113363265991, "reward_std": 0.3829078674316406, "rewards/accuracy_reward": 0.3859442174434662, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2437337338924408, "step": 728 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 421.0, "epoch": 0.007057730102332246, "grad_norm": 1.5148764136387547, "kl": 0.03125, "learning_rate": 9.998774368808243e-07, "loss": 0.0013, "reward": 1.925406813621521, "reward_std": 0.10553587973117828, "rewards/accuracy_reward": 0.6247474551200867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2173258513212204, "step": 729 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 395.16668701171875, "epoch": 0.007067411487932153, "grad_norm": 4.870404282280204, "kl": 0.045654296875, "learning_rate": 9.998770999515045e-07, "loss": 0.0018, "reward": 2.0050292015075684, "reward_std": 0.08140058070421219, "rewards/accuracy_reward": 0.5573322176933289, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2643636167049408, "step": 730 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 371.25, "epoch": 0.00707709287353206, "grad_norm": 1.787225765994971, "kl": 0.040283203125, "learning_rate": 9.998767625597631e-07, "loss": 0.0016, "reward": 1.7044919729232788, "reward_std": 0.07624439895153046, "rewards/accuracy_reward": 0.40310847759246826, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.209716796875, "step": 731 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 423.375, "epoch": 0.007086774259131967, "grad_norm": 2.240548147609405, "kl": 0.04248046875, "learning_rate": 9.998764247056004e-07, "loss": 0.0017, "reward": 1.8749594688415527, "reward_std": 0.14509034156799316, "rewards/accuracy_reward": 0.5609865188598633, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1973063200712204, "step": 732 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 403.3333435058594, "epoch": 0.0070964556447318744, "grad_norm": 4.954205777020939, "kl": 0.037841796875, "learning_rate": 9.99876086389017e-07, "loss": 0.0015, "reward": 1.9746627807617188, "reward_std": 0.15969763696193695, "rewards/accuracy_reward": 0.6201298832893372, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1878662109375, "step": 733 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 423.41668701171875, "epoch": 0.007106137030331781, "grad_norm": 4.027552940466618, "kl": 0.04296875, "learning_rate": 9.99875747610013e-07, "loss": 0.0017, "reward": 1.8711981773376465, "reward_std": 0.3120081424713135, "rewards/accuracy_reward": 0.494513601064682, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2266845703125, "step": 734 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 403.5833435058594, "epoch": 0.007115818415931688, "grad_norm": 1.8267033404539375, "kl": 0.04150390625, "learning_rate": 9.998754083685886e-07, "loss": 0.0017, "reward": 1.8959283828735352, "reward_std": 0.030477717518806458, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2153727263212204, "step": 735 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 413.79168701171875, "epoch": 0.007125499801531595, "grad_norm": 5.697876969051142, "kl": 0.04248046875, "learning_rate": 9.998750686647444e-07, "loss": 0.0017, "reward": 2.2846953868865967, "reward_std": 0.07494455575942993, "rewards/accuracy_reward": 0.8310088515281677, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2786865234375, "step": 736 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 416.375, "epoch": 0.007135181187131502, "grad_norm": 1.9251820204319416, "kl": 0.0302734375, "learning_rate": 9.998747284984805e-07, "loss": 0.0012, "reward": 1.7923400402069092, "reward_std": 0.2190157175064087, "rewards/accuracy_reward": 0.49932241439819336, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1763509213924408, "step": 737 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 448.9583435058594, "epoch": 0.007144862572731409, "grad_norm": 3.434476794479792, "kl": 0.035400390625, "learning_rate": 9.998743878697973e-07, "loss": 0.0014, "reward": 2.163832902908325, "reward_std": 0.23561134934425354, "rewards/accuracy_reward": 0.7248516082763672, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2889811396598816, "step": 738 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 411.4583435058594, "epoch": 0.007154543958331316, "grad_norm": 1.5825671536109176, "kl": 0.0279541015625, "learning_rate": 9.998740467786952e-07, "loss": 0.0011, "reward": 1.3417062759399414, "reward_std": 0.3731958568096161, "rewards/accuracy_reward": 0.2338978499174118, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1078084334731102, "step": 739 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 430.8333435058594, "epoch": 0.0071642253439312235, "grad_norm": 1.9030445446226147, "kl": 0.03466796875, "learning_rate": 9.998737052251744e-07, "loss": 0.0014, "reward": 1.769676923751831, "reward_std": 0.5996317267417908, "rewards/accuracy_reward": 0.46274328231811523, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1902669370174408, "step": 740 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 393.75, "epoch": 0.007173906729531131, "grad_norm": 1.9949424162641793, "kl": 0.0322265625, "learning_rate": 9.998733632092351e-07, "loss": 0.0013, "reward": 2.342496871948242, "reward_std": 0.3231651782989502, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3091634213924408, "step": 741 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 426.2083435058594, "epoch": 0.007183588115131038, "grad_norm": 4.377401843741767, "kl": 0.03271484375, "learning_rate": 9.99873020730878e-07, "loss": 0.0013, "reward": 1.774126410484314, "reward_std": 0.05309101566672325, "rewards/accuracy_reward": 0.42605501413345337, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2314046323299408, "step": 742 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 470.5, "epoch": 0.007193269500730945, "grad_norm": 3.0159253452191654, "kl": 0.0279541015625, "learning_rate": 9.998726777901028e-07, "loss": 0.0011, "reward": 1.3431370258331299, "reward_std": 0.14067649841308594, "rewards/accuracy_reward": 0.222222238779068, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.104248046875, "step": 743 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 359.375, "epoch": 0.007202950886330852, "grad_norm": 1.8296460532247423, "kl": 0.03173828125, "learning_rate": 9.998723343869104e-07, "loss": 0.0013, "reward": 1.5794932842254639, "reward_std": 0.13848620653152466, "rewards/accuracy_reward": 0.31411227583885193, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1820475310087204, "step": 744 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 415.5833435058594, "epoch": 0.007212632271930758, "grad_norm": 13.789627227375714, "kl": 0.037353515625, "learning_rate": 9.99871990521301e-07, "loss": 0.0015, "reward": 1.8728306293487549, "reward_std": 0.2335953414440155, "rewards/accuracy_reward": 0.4917594790458679, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.264404296875, "step": 745 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 526.125, "epoch": 0.0072223136575306655, "grad_norm": 2.8640893295371748, "kl": 0.02587890625, "learning_rate": 9.998716461932745e-07, "loss": 0.001, "reward": 1.4802148342132568, "reward_std": 0.23777835071086884, "rewards/accuracy_reward": 0.43936190009117126, "rewards/format_reward": 0.75, "rewards/semantic_reward": 0.20751953125, "step": 746 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 375.25, "epoch": 0.007231995043130573, "grad_norm": 2.3371868258999653, "kl": 0.04248046875, "learning_rate": 9.998713014028319e-07, "loss": 0.0017, "reward": 1.350235939025879, "reward_std": 0.27822256088256836, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0752360075712204, "step": 747 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 351.16668701171875, "epoch": 0.00724167642873048, "grad_norm": 2.7007620721430095, "kl": 0.033447265625, "learning_rate": 9.99870956149973e-07, "loss": 0.0013, "reward": 1.8674888610839844, "reward_std": 0.08876684308052063, "rewards/accuracy_reward": 0.4990236759185791, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2601318359375, "step": 748 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 391.625, "epoch": 0.007251357814330387, "grad_norm": 3.2962668004643283, "kl": 0.041259765625, "learning_rate": 9.998706104346984e-07, "loss": 0.0017, "reward": 1.8826254606246948, "reward_std": 0.08699692785739899, "rewards/accuracy_reward": 0.4903647303581238, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2339274138212204, "step": 749 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 394.75, "epoch": 0.007261039199930294, "grad_norm": 3.051323335741184, "kl": 0.03759765625, "learning_rate": 9.99870264257008e-07, "loss": 0.0015, "reward": 2.0809898376464844, "reward_std": 0.05458168685436249, "rewards/accuracy_reward": 0.5320640802383423, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3572591245174408, "step": 750 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 364.7083435058594, "epoch": 0.007270720585530201, "grad_norm": 3.1709444365197563, "kl": 0.041259765625, "learning_rate": 9.998699176169028e-07, "loss": 0.0017, "reward": 1.5923970937728882, "reward_std": 0.058438487350940704, "rewards/accuracy_reward": 0.3006490468978882, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.166748046875, "step": 751 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.3333435058594, "epoch": 0.007280401971130108, "grad_norm": 3.988384010195282, "kl": 0.042724609375, "learning_rate": 9.998695705143825e-07, "loss": 0.0017, "reward": 1.934571623802185, "reward_std": 0.07810066640377045, "rewards/accuracy_reward": 0.4876558184623718, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2552490234375, "step": 752 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 363.25, "epoch": 0.0072900833567300155, "grad_norm": 8.57772260614785, "kl": 0.033935546875, "learning_rate": 9.998692229494476e-07, "loss": 0.0014, "reward": 1.8536107540130615, "reward_std": 0.07151427865028381, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2147216796875, "step": 753 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 486.75, "epoch": 0.007299764742329923, "grad_norm": 1.933750349772437, "kl": 0.0196533203125, "learning_rate": 9.998688749220986e-07, "loss": 0.0008, "reward": 1.395247459411621, "reward_std": 0.18271182477474213, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.0869140625, "step": 754 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 478.16668701171875, "epoch": 0.00730944612792983, "grad_norm": 1.3768598159470702, "kl": 0.02490234375, "learning_rate": 9.998685264323357e-07, "loss": 0.001, "reward": 1.086181640625, "reward_std": 0.5032175779342651, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.0445149764418602, "step": 755 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 415.375, "epoch": 0.007319127513529736, "grad_norm": 5.59326175207672, "kl": 0.0274658203125, "learning_rate": 9.998681774801593e-07, "loss": 0.0011, "reward": 1.6155498027801514, "reward_std": 0.5213034749031067, "rewards/accuracy_reward": 0.3583396077156067, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1822102963924408, "step": 756 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 370.3333435058594, "epoch": 0.007328808899129643, "grad_norm": 2.33075050088786, "kl": 0.036376953125, "learning_rate": 9.998678280655694e-07, "loss": 0.0015, "reward": 1.6792426109313965, "reward_std": 0.07671823352575302, "rewards/accuracy_reward": 0.3938666582107544, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1770426481962204, "step": 757 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 391.9583435058594, "epoch": 0.00733849028472955, "grad_norm": 2.0928447324761805, "kl": 0.038818359375, "learning_rate": 9.998674781885668e-07, "loss": 0.0016, "reward": 1.4911994934082031, "reward_std": 0.05358078330755234, "rewards/accuracy_reward": 0.16519032418727875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2010091245174408, "step": 758 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 443.625, "epoch": 0.0073481716703294574, "grad_norm": 2.606695467187884, "kl": 0.0311279296875, "learning_rate": 9.998671278491516e-07, "loss": 0.0012, "reward": 1.5521717071533203, "reward_std": 0.24394381046295166, "rewards/accuracy_reward": 0.38477659225463867, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.14239501953125, "step": 759 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 370.16668701171875, "epoch": 0.007357853055929365, "grad_norm": 4.074325895736906, "kl": 0.03955078125, "learning_rate": 9.998667770473242e-07, "loss": 0.0016, "reward": 1.98689603805542, "reward_std": 0.21729448437690735, "rewards/accuracy_reward": 0.6487043499946594, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2298583984375, "step": 760 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 364.9583435058594, "epoch": 0.007367534441529272, "grad_norm": 2.316037232188869, "kl": 0.030029296875, "learning_rate": 9.998664257830845e-07, "loss": 0.0012, "reward": 1.9849103689193726, "reward_std": 0.5830854773521423, "rewards/accuracy_reward": 0.6437970399856567, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.26611328125, "step": 761 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 394.75, "epoch": 0.007377215827129179, "grad_norm": 0.9830460787172552, "kl": 0.0277099609375, "learning_rate": 9.998660740564335e-07, "loss": 0.0011, "reward": 1.3799642324447632, "reward_std": 0.23633860051631927, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0882975310087204, "step": 762 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 376.2083435058594, "epoch": 0.007386897212729086, "grad_norm": 2.5993814934228348, "kl": 0.03125, "learning_rate": 9.998657218673709e-07, "loss": 0.0013, "reward": 2.3463993072509766, "reward_std": 0.054784588515758514, "rewards/accuracy_reward": 0.7410768270492554, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.4136556088924408, "step": 763 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 468.0, "epoch": 0.007396578598328993, "grad_norm": 2.1673714300409435, "kl": 0.0286865234375, "learning_rate": 9.998653692158977e-07, "loss": 0.0011, "reward": 1.6739085912704468, "reward_std": 0.2904411852359772, "rewards/accuracy_reward": 0.48032546043395996, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.15191650390625, "step": 764 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 394.8333435058594, "epoch": 0.0074062599839289, "grad_norm": 2.537888771053143, "kl": 0.027587890625, "learning_rate": 9.998650161020134e-07, "loss": 0.0011, "reward": 1.5945799350738525, "reward_std": 0.21716979146003723, "rewards/accuracy_reward": 0.3890256881713867, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13055419921875, "step": 765 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 381.25, "epoch": 0.0074159413695288065, "grad_norm": 2.966346218306615, "kl": 0.036376953125, "learning_rate": 9.998646625257191e-07, "loss": 0.0015, "reward": 2.2557373046875, "reward_std": 0.31396010518074036, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2557373046875, "step": 766 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 348.41668701171875, "epoch": 0.007425622755128714, "grad_norm": 2.210303486416945, "kl": 0.02880859375, "learning_rate": 9.998643084870148e-07, "loss": 0.0012, "reward": 1.5897135734558105, "reward_std": 0.37575989961624146, "rewards/accuracy_reward": 0.3387613892555237, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1842854917049408, "step": 767 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 363.125, "epoch": 0.007435304140728621, "grad_norm": 0.14338174290997205, "kl": 0.0400390625, "learning_rate": 9.998639539859006e-07, "loss": 0.0016, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0, "step": 768 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 330.8333435058594, "epoch": 0.007444985526328528, "grad_norm": 1.778883545194593, "kl": 0.03759765625, "learning_rate": 9.998635990223773e-07, "loss": 0.0015, "reward": 1.549490213394165, "reward_std": 0.14449360966682434, "rewards/accuracy_reward": 0.3611111342906952, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1300455778837204, "step": 769 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 325.5833435058594, "epoch": 0.007454666911928435, "grad_norm": 2.8050780766894112, "kl": 0.0390625, "learning_rate": 9.998632435964447e-07, "loss": 0.0016, "reward": 2.1128156185150146, "reward_std": 0.22745725512504578, "rewards/accuracy_reward": 0.7547119855880737, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2747701108455658, "step": 770 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 388.0833435058594, "epoch": 0.007464348297528342, "grad_norm": 3.149737228763795, "kl": 0.04052734375, "learning_rate": 9.998628877081038e-07, "loss": 0.0016, "reward": 1.5799778699874878, "reward_std": 0.05845455452799797, "rewards/accuracy_reward": 0.2998427152633667, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1801351010799408, "step": 771 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 334.91668701171875, "epoch": 0.007474029683128249, "grad_norm": 1.6446443318016886, "kl": 0.0390625, "learning_rate": 9.998625313573542e-07, "loss": 0.0016, "reward": 1.643791675567627, "reward_std": 0.0766119733452797, "rewards/accuracy_reward": 0.3524749279022217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2246500700712204, "step": 772 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 421.5833435058594, "epoch": 0.0074837110687281565, "grad_norm": 2.0975942144240785, "kl": 0.031494140625, "learning_rate": 9.998621745441968e-07, "loss": 0.0013, "reward": 2.2894186973571777, "reward_std": 0.09178261458873749, "rewards/accuracy_reward": 0.7406471967697144, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3654378354549408, "step": 773 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 385.7083435058594, "epoch": 0.007493392454328064, "grad_norm": 2.5617128896877026, "kl": 0.033935546875, "learning_rate": 9.998618172686315e-07, "loss": 0.0014, "reward": 1.7932288646697998, "reward_std": 0.20769314467906952, "rewards/accuracy_reward": 0.4526851177215576, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.223876953125, "step": 774 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 392.2083435058594, "epoch": 0.007503073839927971, "grad_norm": 2.606266006706376, "kl": 0.0296630859375, "learning_rate": 9.99861459530659e-07, "loss": 0.0012, "reward": 1.8563203811645508, "reward_std": 0.2003190964460373, "rewards/accuracy_reward": 0.45215368270874023, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2291666716337204, "step": 775 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 389.625, "epoch": 0.007512755225527878, "grad_norm": 2.984839211989193, "kl": 0.040283203125, "learning_rate": 9.998611013302793e-07, "loss": 0.0016, "reward": 2.322930335998535, "reward_std": 0.0859365463256836, "rewards/accuracy_reward": 0.8888888359069824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2673746943473816, "step": 776 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 532.9583740234375, "epoch": 0.007522436611127784, "grad_norm": 0.7622466033877711, "kl": 0.0147705078125, "learning_rate": 9.99860742667493e-07, "loss": 0.0006, "reward": 1.1151530742645264, "reward_std": 0.39899349212646484, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0318196639418602, "step": 777 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 383.41668701171875, "epoch": 0.007532117996727691, "grad_norm": 2.2341141557835127, "kl": 0.044189453125, "learning_rate": 9.998603835423005e-07, "loss": 0.0018, "reward": 1.7204711437225342, "reward_std": 0.10257844626903534, "rewards/accuracy_reward": 0.4225219190120697, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1896158903837204, "step": 778 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 442.5, "epoch": 0.0075417993823275985, "grad_norm": 2.395509594637873, "kl": 0.02978515625, "learning_rate": 9.998600239547017e-07, "loss": 0.0012, "reward": 2.170313835144043, "reward_std": 0.34895429015159607, "rewards/accuracy_reward": 0.7926770448684692, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2859700620174408, "step": 779 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 382.41668701171875, "epoch": 0.007551480767927506, "grad_norm": 7.07456165392235, "kl": 0.039794921875, "learning_rate": 9.998596639046973e-07, "loss": 0.0016, "reward": 1.7814357280731201, "reward_std": 0.2765692174434662, "rewards/accuracy_reward": 0.3793033957481384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.268798828125, "step": 780 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 390.0, "epoch": 0.007561162153527413, "grad_norm": 2.8711136856591235, "kl": 0.037353515625, "learning_rate": 9.998593033922874e-07, "loss": 0.0015, "reward": 2.048377752304077, "reward_std": 0.10387282073497772, "rewards/accuracy_reward": 0.5621798634529114, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3111979365348816, "step": 781 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 397.875, "epoch": 0.00757084353912732, "grad_norm": 2.8338675230735495, "kl": 0.0400390625, "learning_rate": 9.998589424174726e-07, "loss": 0.0016, "reward": 1.6788758039474487, "reward_std": 0.1030362993478775, "rewards/accuracy_reward": 0.3109721541404724, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2262369841337204, "step": 782 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 351.5, "epoch": 0.007580524924727227, "grad_norm": 1.868989615226507, "kl": 0.04345703125, "learning_rate": 9.99858580980253e-07, "loss": 0.0017, "reward": 1.4293864965438843, "reward_std": 0.17374494671821594, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0793863981962204, "step": 783 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 423.79168701171875, "epoch": 0.007590206310327134, "grad_norm": 3.3330264340906077, "kl": 0.038330078125, "learning_rate": 9.99858219080629e-07, "loss": 0.0015, "reward": 2.092444896697998, "reward_std": 0.11222710460424423, "rewards/accuracy_reward": 0.5973438620567322, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3534342646598816, "step": 784 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 454.75, "epoch": 0.007599887695927041, "grad_norm": 2.2039924451026085, "kl": 0.0269775390625, "learning_rate": 9.998578567186011e-07, "loss": 0.0011, "reward": 1.8872708082199097, "reward_std": 0.28516727685928345, "rewards/accuracy_reward": 0.5599108934402466, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.210693359375, "step": 785 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 423.5833435058594, "epoch": 0.0076095690815269484, "grad_norm": 1.9232114215585054, "kl": 0.02490234375, "learning_rate": 9.998574938941695e-07, "loss": 0.001, "reward": 2.114518642425537, "reward_std": 0.29241421818733215, "rewards/accuracy_reward": 0.6580814123153687, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3064371943473816, "step": 786 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 340.79168701171875, "epoch": 0.007619250467126856, "grad_norm": 1.7032917398611072, "kl": 0.03515625, "learning_rate": 9.998571306073343e-07, "loss": 0.0014, "reward": 1.625483512878418, "reward_std": 0.06220651790499687, "rewards/accuracy_reward": 0.3659132421016693, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1845703125, "step": 787 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 484.79168701171875, "epoch": 0.007628931852726762, "grad_norm": 1.844302341792229, "kl": 0.0252685546875, "learning_rate": 9.998567668580963e-07, "loss": 0.001, "reward": 2.107208728790283, "reward_std": 0.3219144940376282, "rewards/accuracy_reward": 0.7589828372001648, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.264892578125, "step": 788 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 342.3333435058594, "epoch": 0.007638613238326669, "grad_norm": 1.4581506694771744, "kl": 0.03369140625, "learning_rate": 9.998564026464555e-07, "loss": 0.0013, "reward": 1.738134741783142, "reward_std": 0.30425113439559937, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1714681088924408, "step": 789 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 411.625, "epoch": 0.007648294623926576, "grad_norm": 1.7566572615221492, "kl": 0.03125, "learning_rate": 9.998560379724123e-07, "loss": 0.0013, "reward": 1.222472906112671, "reward_std": 0.02033352293074131, "rewards/accuracy_reward": 0.06693904101848602, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0888671875, "step": 790 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.79168701171875, "epoch": 0.007657976009526483, "grad_norm": 3.007618048175303, "kl": 0.035888671875, "learning_rate": 9.998556728359672e-07, "loss": 0.0014, "reward": 1.5660165548324585, "reward_std": 0.06685196608304977, "rewards/accuracy_reward": 0.2552255392074585, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.185791015625, "step": 791 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 426.66668701171875, "epoch": 0.00766765739512639, "grad_norm": 3.398157954613517, "kl": 0.033447265625, "learning_rate": 9.998553072371204e-07, "loss": 0.0013, "reward": 1.860994577407837, "reward_std": 0.3719562888145447, "rewards/accuracy_reward": 0.5375732183456421, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.215087890625, "step": 792 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 363.5, "epoch": 0.0076773387807262975, "grad_norm": 5.732696906581065, "kl": 0.042724609375, "learning_rate": 9.998549411758723e-07, "loss": 0.0017, "reward": 1.796480417251587, "reward_std": 0.12106259167194366, "rewards/accuracy_reward": 0.3451375365257263, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2930094599723816, "step": 793 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 374.2083435058594, "epoch": 0.007687020166326205, "grad_norm": 1.6735052641190051, "kl": 0.033203125, "learning_rate": 9.998545746522231e-07, "loss": 0.0013, "reward": 1.9642417430877686, "reward_std": 0.19662097096443176, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2225748747587204, "step": 794 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 380.625, "epoch": 0.007696701551926112, "grad_norm": 14.529915888605817, "kl": 0.042236328125, "learning_rate": 9.998542076661732e-07, "loss": 0.0017, "reward": 1.8877272605895996, "reward_std": 0.20091238617897034, "rewards/accuracy_reward": 0.521793007850647, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.249267578125, "step": 795 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 393.125, "epoch": 0.007706382937526019, "grad_norm": 2.061619730288169, "kl": 0.03173828125, "learning_rate": 9.99853840217723e-07, "loss": 0.0013, "reward": 2.2663636207580566, "reward_std": 0.31034475564956665, "rewards/accuracy_reward": 0.7668923139572144, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3328043818473816, "step": 796 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 337.41668701171875, "epoch": 0.007716064323125926, "grad_norm": 2.427257522460513, "kl": 0.034912109375, "learning_rate": 9.99853472306873e-07, "loss": 0.0014, "reward": 1.7269260883331299, "reward_std": 0.251926064491272, "rewards/accuracy_reward": 0.42981502413749695, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2054443359375, "step": 797 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 401.4583435058594, "epoch": 0.007725745708725833, "grad_norm": 3.9969832974995305, "kl": 0.032470703125, "learning_rate": 9.998531039336232e-07, "loss": 0.0013, "reward": 2.2033095359802246, "reward_std": 0.27785471081733704, "rewards/accuracy_reward": 0.8055555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2394205778837204, "step": 798 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 369.9583435058594, "epoch": 0.0077354270943257395, "grad_norm": 11.42806652342626, "kl": 0.031982421875, "learning_rate": 9.998527350979739e-07, "loss": 0.0013, "reward": 1.8744618892669678, "reward_std": 0.24790963530540466, "rewards/accuracy_reward": 0.5277495384216309, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.21337890625, "step": 799 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 392.125, "epoch": 0.007745108479925647, "grad_norm": 3.6447966001755923, "kl": 0.0341796875, "learning_rate": 9.99852365799926e-07, "loss": 0.0014, "reward": 1.6086599826812744, "reward_std": 0.03988007456064224, "rewards/accuracy_reward": 0.2722747325897217, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2113851010799408, "step": 800 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 398.75, "epoch": 0.007754789865525554, "grad_norm": 4.619646197822151, "kl": 0.0341796875, "learning_rate": 9.99851996039479e-07, "loss": 0.0014, "reward": 1.654954195022583, "reward_std": 0.32918083667755127, "rewards/accuracy_reward": 0.2847635746002197, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2368571013212204, "step": 801 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 416.625, "epoch": 0.007764471251125461, "grad_norm": 2.339119588254647, "kl": 0.037841796875, "learning_rate": 9.998516258166341e-07, "loss": 0.0015, "reward": 1.5658797025680542, "reward_std": 0.056413423269987106, "rewards/accuracy_reward": 0.21200594305992126, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2205403745174408, "step": 802 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 419.875, "epoch": 0.007774152636725368, "grad_norm": 3.795602072327284, "kl": 0.03564453125, "learning_rate": 9.998512551313912e-07, "loss": 0.0014, "reward": 1.9679985046386719, "reward_std": 0.3314206600189209, "rewards/accuracy_reward": 0.5975719690322876, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.228759765625, "step": 803 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 407.0, "epoch": 0.007783834022325275, "grad_norm": 2.342598754956176, "kl": 0.035888671875, "learning_rate": 9.998508839837508e-07, "loss": 0.0014, "reward": 1.8806947469711304, "reward_std": 0.30773788690567017, "rewards/accuracy_reward": 0.5431296825408936, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2125651091337204, "step": 804 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 380.9583435058594, "epoch": 0.007793515407925182, "grad_norm": 3.698189189686341, "kl": 0.0498046875, "learning_rate": 9.998505123737128e-07, "loss": 0.002, "reward": 1.9490965604782104, "reward_std": 0.11734070628881454, "rewards/accuracy_reward": 0.4722329080104828, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3268636167049408, "step": 805 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 351.25, "epoch": 0.0078031967935250895, "grad_norm": 2.178801128696244, "kl": 0.0390625, "learning_rate": 9.998501403012783e-07, "loss": 0.0016, "reward": 1.9774903059005737, "reward_std": 0.06939400732517242, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2191569060087204, "step": 806 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 364.625, "epoch": 0.007812878179124997, "grad_norm": 2.9266330656263886, "kl": 0.041015625, "learning_rate": 9.99849767766447e-07, "loss": 0.0016, "reward": 2.2749364376068115, "reward_std": 0.10068371891975403, "rewards/accuracy_reward": 0.8320082426071167, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2929280698299408, "step": 807 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 437.0833435058594, "epoch": 0.007822559564724904, "grad_norm": 1.7788554919842348, "kl": 0.024658203125, "learning_rate": 9.998493947692194e-07, "loss": 0.001, "reward": 1.7433247566223145, "reward_std": 0.3524053990840912, "rewards/accuracy_reward": 0.4447651505470276, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1902262419462204, "step": 808 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 368.16668701171875, "epoch": 0.007832240950324811, "grad_norm": 8.36254522950194, "kl": 0.049072265625, "learning_rate": 9.99849021309596e-07, "loss": 0.002, "reward": 1.979810118675232, "reward_std": 0.2491249144077301, "rewards/accuracy_reward": 0.624137818813324, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2306722104549408, "step": 809 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 305.625, "epoch": 0.007841922335924718, "grad_norm": 4.604986430104279, "kl": 0.04150390625, "learning_rate": 9.998486473875772e-07, "loss": 0.0017, "reward": 1.7781442403793335, "reward_std": 0.05985913798213005, "rewards/accuracy_reward": 0.5039417743682861, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2242024838924408, "step": 810 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 356.4583435058594, "epoch": 0.007851603721524625, "grad_norm": 2.374242214459072, "kl": 0.037109375, "learning_rate": 9.998482730031633e-07, "loss": 0.0015, "reward": 1.7944374084472656, "reward_std": 0.23169267177581787, "rewards/accuracy_reward": 0.459989070892334, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2261149138212204, "step": 811 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 392.5, "epoch": 0.007861285107124532, "grad_norm": 2.1478880247183363, "kl": 0.04052734375, "learning_rate": 9.998478981563543e-07, "loss": 0.0016, "reward": 1.719717025756836, "reward_std": 0.07338020205497742, "rewards/accuracy_reward": 0.4522120952606201, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1508382260799408, "step": 812 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 443.5, "epoch": 0.00787096649272444, "grad_norm": 4.028546849113153, "kl": 0.031494140625, "learning_rate": 9.998475228471509e-07, "loss": 0.0013, "reward": 1.9409303665161133, "reward_std": 0.3504556119441986, "rewards/accuracy_reward": 0.5385539531707764, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.26904296875, "step": 813 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 325.375, "epoch": 0.007880647878324347, "grad_norm": 1.8015929111567133, "kl": 0.0498046875, "learning_rate": 9.998471470755532e-07, "loss": 0.002, "reward": 1.3430784940719604, "reward_std": 0.03469037264585495, "rewards/accuracy_reward": 0.16260159015655518, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1138102263212204, "step": 814 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 364.7083435058594, "epoch": 0.007890329263924252, "grad_norm": 2.350444329398687, "kl": 0.039794921875, "learning_rate": 9.99846770841562e-07, "loss": 0.0016, "reward": 1.0432400703430176, "reward_std": 0.12230125069618225, "rewards/accuracy_reward": 0.0277777798473835, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.01546224020421505, "step": 815 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 393.2083435058594, "epoch": 0.007900010649524159, "grad_norm": 3.690847605688603, "kl": 0.03662109375, "learning_rate": 9.99846394145177e-07, "loss": 0.0015, "reward": 2.2251782417297363, "reward_std": 0.12289617955684662, "rewards/accuracy_reward": 0.7153064608573914, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3265380859375, "step": 816 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 421.4583435058594, "epoch": 0.007909692035124066, "grad_norm": 2.299782327622042, "kl": 0.034912109375, "learning_rate": 9.998460169863993e-07, "loss": 0.0014, "reward": 2.0783300399780273, "reward_std": 0.27431267499923706, "rewards/accuracy_reward": 0.6670424342155457, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2362874448299408, "step": 817 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 429.7083435058594, "epoch": 0.007919373420723973, "grad_norm": 2.3494386985915554, "kl": 0.041015625, "learning_rate": 9.998456393652285e-07, "loss": 0.0016, "reward": 2.0490944385528564, "reward_std": 0.06585288047790527, "rewards/accuracy_reward": 0.5784158706665039, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2790120542049408, "step": 818 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 407.5833435058594, "epoch": 0.00792905480632388, "grad_norm": 1.9033858995465642, "kl": 0.037841796875, "learning_rate": 9.998452612816657e-07, "loss": 0.0015, "reward": 1.6519206762313843, "reward_std": 0.1840202510356903, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.13525390625, "step": 819 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 400.79168701171875, "epoch": 0.007938736191923788, "grad_norm": 3.098605969297058, "kl": 0.037841796875, "learning_rate": 9.998448827357105e-07, "loss": 0.0015, "reward": 1.951995849609375, "reward_std": 0.045687057077884674, "rewards/accuracy_reward": 0.5207945704460144, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.231201171875, "step": 820 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 416.79168701171875, "epoch": 0.007948417577523695, "grad_norm": 2.82119479407923, "kl": 0.042236328125, "learning_rate": 9.998445037273638e-07, "loss": 0.0017, "reward": 1.573678970336914, "reward_std": 0.03918851166963577, "rewards/accuracy_reward": 0.23380272090435028, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.20654296875, "step": 821 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 369.25, "epoch": 0.007958098963123602, "grad_norm": 1.6438098899433242, "kl": 0.03662109375, "learning_rate": 9.998441242566255e-07, "loss": 0.0015, "reward": 1.6433675289154053, "reward_std": 0.26347899436950684, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1183675155043602, "step": 822 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 400.79168701171875, "epoch": 0.007967780348723509, "grad_norm": 2.387505804035668, "kl": 0.037353515625, "learning_rate": 9.998437443234963e-07, "loss": 0.0015, "reward": 1.5995025634765625, "reward_std": 0.36651504039764404, "rewards/accuracy_reward": 0.2555328905582428, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2356363981962204, "step": 823 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 387.75, "epoch": 0.007977461734323416, "grad_norm": 2.2542492281044386, "kl": 0.04296875, "learning_rate": 9.998433639279767e-07, "loss": 0.0017, "reward": 2.1697988510131836, "reward_std": 0.07918737083673477, "rewards/accuracy_reward": 0.7495717406272888, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2535603940486908, "step": 824 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 378.7083435058594, "epoch": 0.007987143119923323, "grad_norm": 1.8829101139978692, "kl": 0.042724609375, "learning_rate": 9.998429830700665e-07, "loss": 0.0017, "reward": 2.3858156204223633, "reward_std": 0.051942791789770126, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2108154296875, "step": 825 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 428.0, "epoch": 0.00799682450552323, "grad_norm": 5.298304586391659, "kl": 0.0308837890625, "learning_rate": 9.998426017497665e-07, "loss": 0.0012, "reward": 2.2605626583099365, "reward_std": 0.08754992485046387, "rewards/accuracy_reward": 0.7928704619407654, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.301025390625, "step": 826 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 528.1666870117188, "epoch": 0.008006505891123138, "grad_norm": 1.7151145492565107, "kl": 0.0255126953125, "learning_rate": 9.998422199670769e-07, "loss": 0.001, "reward": 1.6227067708969116, "reward_std": 0.22873321175575256, "rewards/accuracy_reward": 0.4663183093070984, "rewards/format_reward": 0.8333333730697632, "rewards/semantic_reward": 0.2147216796875, "step": 827 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 411.2083435058594, "epoch": 0.008016187276723045, "grad_norm": 5.206865130592051, "kl": 0.0380859375, "learning_rate": 9.99841837721998e-07, "loss": 0.0015, "reward": 2.0522890090942383, "reward_std": 0.1657688021659851, "rewards/accuracy_reward": 0.6256777048110962, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2349446713924408, "step": 828 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 386.875, "epoch": 0.008025868662322952, "grad_norm": 2.4483384487083817, "kl": 0.044677734375, "learning_rate": 9.998414550145303e-07, "loss": 0.0018, "reward": 1.229791283607483, "reward_std": 0.05293823778629303, "rewards/accuracy_reward": 0.07993774116039276, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.099853515625, "step": 829 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 353.75, "epoch": 0.008035550047922859, "grad_norm": 3.210238910153432, "kl": 0.041015625, "learning_rate": 9.99841071844674e-07, "loss": 0.0016, "reward": 1.7206276655197144, "reward_std": 0.06113608926534653, "rewards/accuracy_reward": 0.39574968814849854, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2498779296875, "step": 830 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 356.5, "epoch": 0.008045231433522766, "grad_norm": 8.208455804911239, "kl": 0.04150390625, "learning_rate": 9.998406882124295e-07, "loss": 0.0017, "reward": 1.6680452823638916, "reward_std": 0.09102940559387207, "rewards/accuracy_reward": 0.3693717420101166, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2236735075712204, "step": 831 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 378.3333435058594, "epoch": 0.008054912819122673, "grad_norm": 10.549536157371824, "kl": 0.033203125, "learning_rate": 9.998403041177971e-07, "loss": 0.0013, "reward": 1.8610265254974365, "reward_std": 0.5215307474136353, "rewards/accuracy_reward": 0.49392837285995483, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2337646484375, "step": 832 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 417.79168701171875, "epoch": 0.00806459420472258, "grad_norm": 1.8520375460374394, "kl": 0.03173828125, "learning_rate": 9.998399195607773e-07, "loss": 0.0013, "reward": 1.738425612449646, "reward_std": 0.039314042776823044, "rewards/accuracy_reward": 0.40664657950401306, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1984456479549408, "step": 833 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 387.91668701171875, "epoch": 0.008074275590322488, "grad_norm": 1.977874534413773, "kl": 0.027587890625, "learning_rate": 9.998395345413705e-07, "loss": 0.0011, "reward": 1.5774340629577637, "reward_std": 0.3200456500053406, "rewards/accuracy_reward": 0.31151604652404785, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.19091796875, "step": 834 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 386.875, "epoch": 0.008083956975922395, "grad_norm": 1.6588983855727781, "kl": 0.039306640625, "learning_rate": 9.99839149059577e-07, "loss": 0.0016, "reward": 1.5293740034103394, "reward_std": 0.062169529497623444, "rewards/accuracy_reward": 0.22684305906295776, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1775309294462204, "step": 835 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 348.0, "epoch": 0.008093638361522302, "grad_norm": 3.8577529775917108, "kl": 0.037109375, "learning_rate": 9.998387631153969e-07, "loss": 0.0015, "reward": 1.7507604360580444, "reward_std": 0.0925971269607544, "rewards/accuracy_reward": 0.4657750129699707, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1933186948299408, "step": 836 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 361.7083435058594, "epoch": 0.008103319747122207, "grad_norm": 4.2044715358060705, "kl": 0.0400390625, "learning_rate": 9.998383767088309e-07, "loss": 0.0016, "reward": 1.9177138805389404, "reward_std": 0.3423558175563812, "rewards/accuracy_reward": 0.5931674242019653, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1912129819393158, "step": 837 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 461.3333435058594, "epoch": 0.008113001132722114, "grad_norm": 1.3568575692960905, "kl": 0.026611328125, "learning_rate": 9.998379898398791e-07, "loss": 0.0011, "reward": 1.844824194908142, "reward_std": 0.16217803955078125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1531575620174408, "step": 838 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 427.7083435058594, "epoch": 0.008122682518322022, "grad_norm": 3.008490246478992, "kl": 0.0361328125, "learning_rate": 9.99837602508542e-07, "loss": 0.0014, "reward": 2.0844645500183105, "reward_std": 0.49687278270721436, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2011311948299408, "step": 839 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 369.875, "epoch": 0.008132363903921929, "grad_norm": 3.5204864862517566, "kl": 0.039794921875, "learning_rate": 9.9983721471482e-07, "loss": 0.0016, "reward": 1.7330994606018066, "reward_std": 0.024822279810905457, "rewards/accuracy_reward": 0.4440450668334961, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1557210385799408, "step": 840 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 352.66668701171875, "epoch": 0.008142045289521836, "grad_norm": 2.9746628304653595, "kl": 0.0341796875, "learning_rate": 9.998368264587134e-07, "loss": 0.0014, "reward": 1.7084176540374756, "reward_std": 0.07014314830303192, "rewards/accuracy_reward": 0.44761013984680176, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1608072966337204, "step": 841 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 354.0, "epoch": 0.008151726675121743, "grad_norm": 3.5493515287560657, "kl": 0.045654296875, "learning_rate": 9.998364377402226e-07, "loss": 0.0018, "reward": 2.3268041610717773, "reward_std": 0.08796411752700806, "rewards/accuracy_reward": 0.888888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2795817255973816, "step": 842 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 363.4583435058594, "epoch": 0.00816140806072165, "grad_norm": 2.4220735364111947, "kl": 0.041259765625, "learning_rate": 9.998360485593478e-07, "loss": 0.0016, "reward": 1.7027028799057007, "reward_std": 0.052627887576818466, "rewards/accuracy_reward": 0.4038584232330322, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1905110776424408, "step": 843 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 379.9583435058594, "epoch": 0.008171089446321557, "grad_norm": 1.1937007098832944, "kl": 0.037109375, "learning_rate": 9.998356589160895e-07, "loss": 0.0015, "reward": 1.507665991783142, "reward_std": 0.014402753673493862, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.107666015625, "step": 844 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 484.3333435058594, "epoch": 0.008180770831921464, "grad_norm": 1.3668520633161894, "kl": 0.01422119140625, "learning_rate": 9.99835268810448e-07, "loss": 0.0006, "reward": 1.5335774421691895, "reward_std": 0.7163833379745483, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.125244140625, "step": 845 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 362.79168701171875, "epoch": 0.008190452217521372, "grad_norm": 4.218310016014832, "kl": 0.0419921875, "learning_rate": 9.998348782424238e-07, "loss": 0.0017, "reward": 1.9588534832000732, "reward_std": 0.10303843021392822, "rewards/accuracy_reward": 0.4231846034526825, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3523356318473816, "step": 846 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 363.625, "epoch": 0.008200133603121279, "grad_norm": 5.109482571484323, "kl": 0.038818359375, "learning_rate": 9.99834487212017e-07, "loss": 0.0016, "reward": 2.20086669921875, "reward_std": 0.28909996151924133, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.20086669921875, "step": 847 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 523.7083740234375, "epoch": 0.008209814988721186, "grad_norm": 1.4480311577141853, "kl": 0.0194091796875, "learning_rate": 9.998340957192282e-07, "loss": 0.0008, "reward": 1.8403565883636475, "reward_std": 0.6296359896659851, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1986897885799408, "step": 848 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 360.29168701171875, "epoch": 0.008219496374321093, "grad_norm": 2.365664571741076, "kl": 0.03515625, "learning_rate": 9.998337037640577e-07, "loss": 0.0014, "reward": 1.7461459636688232, "reward_std": 0.4468775987625122, "rewards/accuracy_reward": 0.432417094707489, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1970621794462204, "step": 849 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 367.3333435058594, "epoch": 0.008229177759921, "grad_norm": 5.0531401574878885, "kl": 0.036376953125, "learning_rate": 9.998333113465057e-07, "loss": 0.0015, "reward": 2.2587928771972656, "reward_std": 0.07516690343618393, "rewards/accuracy_reward": 0.7938594818115234, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3065999448299408, "step": 850 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 366.3333435058594, "epoch": 0.008238859145520907, "grad_norm": 2.1769223469075785, "kl": 0.037841796875, "learning_rate": 9.998329184665728e-07, "loss": 0.0015, "reward": 1.7780656814575195, "reward_std": 0.049424272030591965, "rewards/accuracy_reward": 0.4649471044540405, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1964518278837204, "step": 851 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 370.66668701171875, "epoch": 0.008248540531120814, "grad_norm": 3.0515022139314225, "kl": 0.03466796875, "learning_rate": 9.998325251242592e-07, "loss": 0.0014, "reward": 2.048020839691162, "reward_std": 0.10518886148929596, "rewards/accuracy_reward": 0.5456118583679199, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3440755307674408, "step": 852 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 422.4583435058594, "epoch": 0.008258221916720722, "grad_norm": 2.013860406741419, "kl": 0.02001953125, "learning_rate": 9.998321313195655e-07, "loss": 0.0008, "reward": 1.4343913793563843, "reward_std": 0.5513026714324951, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1156412810087204, "step": 853 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 376.91668701171875, "epoch": 0.008267903302320629, "grad_norm": 4.5600564986289385, "kl": 0.038330078125, "learning_rate": 9.998317370524919e-07, "loss": 0.0015, "reward": 1.7795872688293457, "reward_std": 0.22421006858348846, "rewards/accuracy_reward": 0.49455317854881287, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1600341796875, "step": 854 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 410.4583435058594, "epoch": 0.008277584687920536, "grad_norm": 1.3534508170113189, "kl": 0.0296630859375, "learning_rate": 9.998313423230384e-07, "loss": 0.0012, "reward": 1.943497657775879, "reward_std": 0.1931588053703308, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2101643979549408, "step": 855 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 371.375, "epoch": 0.008287266073520443, "grad_norm": 5.8447567270501315, "kl": 0.03955078125, "learning_rate": 9.998309471312061e-07, "loss": 0.0016, "reward": 1.5185174942016602, "reward_std": 0.0503108873963356, "rewards/accuracy_reward": 0.24158066511154175, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1519368588924408, "step": 856 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 412.8333435058594, "epoch": 0.00829694745912035, "grad_norm": 1.8662053054735845, "kl": 0.0262451171875, "learning_rate": 9.998305514769949e-07, "loss": 0.0011, "reward": 1.991560935974121, "reward_std": 0.06372451037168503, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2248942106962204, "step": 857 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 393.9583435058594, "epoch": 0.008306628844720257, "grad_norm": 7.962492697012899, "kl": 0.0322265625, "learning_rate": 9.99830155360405e-07, "loss": 0.0013, "reward": 2.012012004852295, "reward_std": 0.04960092157125473, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2286783903837204, "step": 858 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 370.29168701171875, "epoch": 0.008316310230320163, "grad_norm": 1.474673455756832, "kl": 0.036376953125, "learning_rate": 9.998297587814373e-07, "loss": 0.0015, "reward": 1.13493013381958, "reward_std": 0.10702045261859894, "rewards/accuracy_reward": 0.0401231050491333, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0781402587890625, "step": 859 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 373.0, "epoch": 0.00832599161592007, "grad_norm": 3.8859941134270484, "kl": 0.0380859375, "learning_rate": 9.99829361740092e-07, "loss": 0.0015, "reward": 2.1683297157287598, "reward_std": 0.2972180247306824, "rewards/accuracy_reward": 0.7440377473831177, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2742919921875, "step": 860 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 373.4583435058594, "epoch": 0.008335673001519977, "grad_norm": 2.580811870404012, "kl": 0.04443359375, "learning_rate": 9.998289642363692e-07, "loss": 0.0018, "reward": 1.5777490139007568, "reward_std": 0.14005693793296814, "rewards/accuracy_reward": 0.17303697764873505, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2880452573299408, "step": 861 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 355.9583435058594, "epoch": 0.008345354387119884, "grad_norm": 3.3956237229160635, "kl": 0.03759765625, "learning_rate": 9.998285662702694e-07, "loss": 0.0015, "reward": 1.963411569595337, "reward_std": 0.06578509509563446, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.205078125, "step": 862 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 381.16668701171875, "epoch": 0.008355035772719791, "grad_norm": 7.834696605941973, "kl": 0.0390625, "learning_rate": 9.99828167841793e-07, "loss": 0.0016, "reward": 1.8878673315048218, "reward_std": 0.05453568696975708, "rewards/accuracy_reward": 0.38256940245628357, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3052978515625, "step": 863 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 399.91668701171875, "epoch": 0.008364717158319698, "grad_norm": 0.9500951097300241, "kl": 0.0303955078125, "learning_rate": 9.998277689509403e-07, "loss": 0.0012, "reward": 1.5140626430511475, "reward_std": 0.02659686468541622, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1223958358168602, "step": 864 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 368.5833435058594, "epoch": 0.008374398543919605, "grad_norm": 3.2926507938526717, "kl": 0.0458984375, "learning_rate": 9.998273695977118e-07, "loss": 0.0018, "reward": 1.5395698547363281, "reward_std": 0.1879763901233673, "rewards/accuracy_reward": 0.3923611342906952, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0972086638212204, "step": 865 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 438.8333435058594, "epoch": 0.008384079929519513, "grad_norm": 1.805523549743536, "kl": 0.0283203125, "learning_rate": 9.998269697821077e-07, "loss": 0.0011, "reward": 2.032881736755371, "reward_std": 0.20784644782543182, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.22454833984375, "step": 866 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.5, "epoch": 0.00839376131511942, "grad_norm": 2.313419173232839, "kl": 0.038818359375, "learning_rate": 9.998265695041287e-07, "loss": 0.0016, "reward": 2.5263266563415527, "reward_std": 0.08369453251361847, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3596598505973816, "step": 867 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 367.5, "epoch": 0.008403442700719327, "grad_norm": 3.9328774368413213, "kl": 0.0341796875, "learning_rate": 9.998261687637747e-07, "loss": 0.0014, "reward": 1.8791722059249878, "reward_std": 0.057864960283041, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2069498747587204, "step": 868 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 368.875, "epoch": 0.008413124086319234, "grad_norm": 1.1025315374325675, "kl": 0.0257568359375, "learning_rate": 9.998257675610464e-07, "loss": 0.001, "reward": 1.4748209714889526, "reward_std": 0.03273054212331772, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.108154296875, "step": 869 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 397.5833435058594, "epoch": 0.008422805471919141, "grad_norm": 2.4791765477444914, "kl": 0.042236328125, "learning_rate": 9.998253658959442e-07, "loss": 0.0017, "reward": 2.086930274963379, "reward_std": 0.24346062541007996, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1869303435087204, "step": 870 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 406.29168701171875, "epoch": 0.008432486857519048, "grad_norm": 2.6989184573145706, "kl": 0.04296875, "learning_rate": 9.99824963768468e-07, "loss": 0.0017, "reward": 1.6938247680664062, "reward_std": 0.10884582996368408, "rewards/accuracy_reward": 0.3165542185306549, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2439371794462204, "step": 871 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 356.8333435058594, "epoch": 0.008442168243118955, "grad_norm": 2.6887363370189736, "kl": 0.040283203125, "learning_rate": 9.99824561178619e-07, "loss": 0.0016, "reward": 2.3382434844970703, "reward_std": 0.0739373117685318, "rewards/accuracy_reward": 0.7777778506278992, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.393798828125, "step": 872 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 642.4166870117188, "epoch": 0.008451849628718863, "grad_norm": 1.3554981379821147, "kl": 0.0155029296875, "learning_rate": 9.99824158126397e-07, "loss": 0.0006, "reward": 0.5974935293197632, "reward_std": 0.33041849732398987, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.5416666865348816, "rewards/semantic_reward": 0.01416015625, "step": 873 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 384.54168701171875, "epoch": 0.00846153101431877, "grad_norm": 2.814939603556591, "kl": 0.042724609375, "learning_rate": 9.99823754611802e-07, "loss": 0.0017, "reward": 2.2812154293060303, "reward_std": 0.1213543564081192, "rewards/accuracy_reward": 0.7917214632034302, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3144938349723816, "step": 874 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 387.0833435058594, "epoch": 0.008471212399918677, "grad_norm": 2.573861167868894, "kl": 0.0380859375, "learning_rate": 9.998233506348353e-07, "loss": 0.0015, "reward": 2.106456756591797, "reward_std": 0.18510380387306213, "rewards/accuracy_reward": 0.6189159154891968, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3125407099723816, "step": 875 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 399.75, "epoch": 0.008480893785518584, "grad_norm": 2.30253088890665, "kl": 0.038330078125, "learning_rate": 9.998229461954969e-07, "loss": 0.0015, "reward": 1.88837468624115, "reward_std": 0.19630533456802368, "rewards/accuracy_reward": 0.4669959247112274, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2463785856962204, "step": 876 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 434.5, "epoch": 0.008490575171118491, "grad_norm": 18.640856855803555, "kl": 0.029541015625, "learning_rate": 9.998225412937867e-07, "loss": 0.0012, "reward": 1.7290058135986328, "reward_std": 0.17320847511291504, "rewards/accuracy_reward": 0.42269057035446167, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2229817807674408, "step": 877 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 415.625, "epoch": 0.008500256556718398, "grad_norm": 3.452207257395101, "kl": 0.043701171875, "learning_rate": 9.998221359297056e-07, "loss": 0.0017, "reward": 1.9354671239852905, "reward_std": 0.09484558552503586, "rewards/accuracy_reward": 0.5230239629745483, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2541097104549408, "step": 878 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 380.9583435058594, "epoch": 0.008509937942318305, "grad_norm": 3.8196154922545307, "kl": 0.04833984375, "learning_rate": 9.99821730103254e-07, "loss": 0.0019, "reward": 1.8555963039398193, "reward_std": 0.016431011259555817, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1667073667049408, "step": 879 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 416.91668701171875, "epoch": 0.00851961932791821, "grad_norm": 2.497789744580651, "kl": 0.038330078125, "learning_rate": 9.99821323814432e-07, "loss": 0.0015, "reward": 1.562793493270874, "reward_std": 0.41658490896224976, "rewards/accuracy_reward": 0.2629399597644806, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.224853515625, "step": 880 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 418.04168701171875, "epoch": 0.008529300713518118, "grad_norm": 3.594216006418971, "kl": 0.036376953125, "learning_rate": 9.9982091706324e-07, "loss": 0.0015, "reward": 2.0131566524505615, "reward_std": 0.3646790385246277, "rewards/accuracy_reward": 0.6805555820465088, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.249267578125, "step": 881 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 384.2083435058594, "epoch": 0.008538982099118025, "grad_norm": 2.4959997158469327, "kl": 0.044189453125, "learning_rate": 9.998205098496786e-07, "loss": 0.0018, "reward": 1.6783571243286133, "reward_std": 0.3111202120780945, "rewards/accuracy_reward": 0.396960586309433, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.181396484375, "step": 882 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 395.79168701171875, "epoch": 0.008548663484717932, "grad_norm": 3.3573199715068665, "kl": 0.05517578125, "learning_rate": 9.99820102173748e-07, "loss": 0.0022, "reward": 2.230917453765869, "reward_std": 0.26045724749565125, "rewards/accuracy_reward": 0.741466224193573, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.322784423828125, "step": 883 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 399.79168701171875, "epoch": 0.00855834487031784, "grad_norm": 2.215064707456407, "kl": 0.03076171875, "learning_rate": 9.998196940354486e-07, "loss": 0.0012, "reward": 1.998238444328308, "reward_std": 0.24228456616401672, "rewards/accuracy_reward": 0.6123864650726318, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2275187224149704, "step": 884 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 397.41668701171875, "epoch": 0.008568026255917746, "grad_norm": 4.965617519107648, "kl": 0.03466796875, "learning_rate": 9.998192854347809e-07, "loss": 0.0014, "reward": 2.1730360984802246, "reward_std": 0.28113532066345215, "rewards/accuracy_reward": 0.763888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2508138120174408, "step": 885 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 418.5, "epoch": 0.008577707641517654, "grad_norm": 5.999877362954369, "kl": 0.03564453125, "learning_rate": 9.998188763717452e-07, "loss": 0.0014, "reward": 1.8249189853668213, "reward_std": 0.06681324541568756, "rewards/accuracy_reward": 0.463464617729187, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2614542841911316, "step": 886 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 365.75, "epoch": 0.00858738902711756, "grad_norm": 3.220010204545296, "kl": 0.05126953125, "learning_rate": 9.998184668463416e-07, "loss": 0.0021, "reward": 1.8218210935592651, "reward_std": 0.35582131147384644, "rewards/accuracy_reward": 0.37400203943252563, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.30615234375, "step": 887 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 379.25, "epoch": 0.008597070412717468, "grad_norm": 2.5926177203888305, "kl": 0.0419921875, "learning_rate": 9.99818056858571e-07, "loss": 0.0017, "reward": 1.5150874853134155, "reward_std": 0.23492400348186493, "rewards/accuracy_reward": 0.2372635304927826, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1861572265625, "step": 888 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 452.5, "epoch": 0.008606751798317375, "grad_norm": 1.9304189281196376, "kl": 0.03125, "learning_rate": 9.998176464084334e-07, "loss": 0.0013, "reward": 1.6303510665893555, "reward_std": 0.19020716845989227, "rewards/accuracy_reward": 0.35608357191085815, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2076009213924408, "step": 889 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 386.9583435058594, "epoch": 0.008616433183917282, "grad_norm": 2.0782891391262734, "kl": 0.0283203125, "learning_rate": 9.998172354959294e-07, "loss": 0.0011, "reward": 1.6918619871139526, "reward_std": 0.7501479983329773, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1585286557674408, "step": 890 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 341.75, "epoch": 0.00862611456951719, "grad_norm": 3.1541164354161864, "kl": 0.04638671875, "learning_rate": 9.99816824121059e-07, "loss": 0.0019, "reward": 2.203340530395508, "reward_std": 0.2905160188674927, "rewards/accuracy_reward": 0.8606237769126892, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2427164763212204, "step": 891 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 385.29168701171875, "epoch": 0.008635795955117096, "grad_norm": 3.6431307046816532, "kl": 0.043212890625, "learning_rate": 9.998164122838231e-07, "loss": 0.0017, "reward": 2.084193229675293, "reward_std": 0.1423732340335846, "rewards/accuracy_reward": 0.6465710401535034, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2709554135799408, "step": 892 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 322.04168701171875, "epoch": 0.008645477340717004, "grad_norm": 1.6678327582326185, "kl": 0.056396484375, "learning_rate": 9.998159999842216e-07, "loss": 0.0023, "reward": 1.4955079555511475, "reward_std": 0.03998367488384247, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1038411483168602, "step": 893 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 376.54168701171875, "epoch": 0.00865515872631691, "grad_norm": 2.592384145015811, "kl": 0.0361328125, "learning_rate": 9.998155872222554e-07, "loss": 0.0014, "reward": 1.9371418952941895, "reward_std": 0.07428654283285141, "rewards/accuracy_reward": 0.6087970733642578, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2200113981962204, "step": 894 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 361.54168701171875, "epoch": 0.008664840111916818, "grad_norm": 3.518781051779782, "kl": 0.038818359375, "learning_rate": 9.998151739979245e-07, "loss": 0.0016, "reward": 1.8077280521392822, "reward_std": 0.2775077819824219, "rewards/accuracy_reward": 0.4164356589317322, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2412923276424408, "step": 895 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 357.16668701171875, "epoch": 0.008674521497516725, "grad_norm": 2.7428249015626065, "kl": 0.03564453125, "learning_rate": 9.998147603112295e-07, "loss": 0.0014, "reward": 1.570855975151062, "reward_std": 0.4348742663860321, "rewards/accuracy_reward": 0.34461891651153564, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1429036557674408, "step": 896 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 342.29168701171875, "epoch": 0.008684202883116632, "grad_norm": 2.1453918460069983, "kl": 0.03564453125, "learning_rate": 9.998143461621705e-07, "loss": 0.0014, "reward": 2.429205894470215, "reward_std": 0.103570356965065, "rewards/accuracy_reward": 0.9734848737716675, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3223876953125, "step": 897 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 359.7083435058594, "epoch": 0.00869388426871654, "grad_norm": 3.3140650673500835, "kl": 0.03271484375, "learning_rate": 9.99813931550748e-07, "loss": 0.0013, "reward": 1.7577238082885742, "reward_std": 0.5594600439071655, "rewards/accuracy_reward": 0.41505616903305054, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2343343198299408, "step": 898 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 345.5833435058594, "epoch": 0.008703565654316446, "grad_norm": 5.4546624574842015, "kl": 0.037353515625, "learning_rate": 9.998135164769626e-07, "loss": 0.0015, "reward": 1.860215425491333, "reward_std": 0.3000255823135376, "rewards/accuracy_reward": 0.5517436265945435, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2251383513212204, "step": 899 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 334.9583435058594, "epoch": 0.008713247039916354, "grad_norm": 2.054048485044637, "kl": 0.035888671875, "learning_rate": 9.998131009408143e-07, "loss": 0.0014, "reward": 1.7826385498046875, "reward_std": 0.26200395822525024, "rewards/accuracy_reward": 0.569023609161377, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1636149138212204, "step": 900 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 350.91668701171875, "epoch": 0.00872292842551626, "grad_norm": 4.621949564752077, "kl": 0.037841796875, "learning_rate": 9.998126849423038e-07, "loss": 0.0015, "reward": 1.436264991760254, "reward_std": 0.28237634897232056, "rewards/accuracy_reward": 0.21995624899864197, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1746419370174408, "step": 901 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 411.3333435058594, "epoch": 0.008732609811116166, "grad_norm": 3.416163749491046, "kl": 0.0400390625, "learning_rate": 9.998122684814315e-07, "loss": 0.0016, "reward": 1.7203497886657715, "reward_std": 0.3751642107963562, "rewards/accuracy_reward": 0.43768367171287537, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.232666015625, "step": 902 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 362.8333435058594, "epoch": 0.008742291196716073, "grad_norm": 4.158183923356755, "kl": 0.03515625, "learning_rate": 9.998118515581975e-07, "loss": 0.0014, "reward": 2.0386242866516113, "reward_std": 0.1089138388633728, "rewards/accuracy_reward": 0.6011242866516113, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2708333432674408, "step": 903 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 353.375, "epoch": 0.00875197258231598, "grad_norm": 2.2278964972391395, "kl": 0.037109375, "learning_rate": 9.998114341726026e-07, "loss": 0.0015, "reward": 1.9223525524139404, "reward_std": 0.06569363176822662, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2584635615348816, "step": 904 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 370.5833435058594, "epoch": 0.008761653967915888, "grad_norm": 12.670799275907694, "kl": 0.031005859375, "learning_rate": 9.998110163246466e-07, "loss": 0.0012, "reward": 2.0248665809631348, "reward_std": 0.08056473731994629, "rewards/accuracy_reward": 0.5319710969924927, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3012288510799408, "step": 905 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 348.91668701171875, "epoch": 0.008771335353515795, "grad_norm": 4.018884210535958, "kl": 0.041259765625, "learning_rate": 9.998105980143304e-07, "loss": 0.0017, "reward": 2.306483268737793, "reward_std": 0.07337071746587753, "rewards/accuracy_reward": 0.888888955116272, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2842610776424408, "step": 906 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 380.125, "epoch": 0.008781016739115702, "grad_norm": 3.2052045630333117, "kl": 0.030029296875, "learning_rate": 9.998101792416542e-07, "loss": 0.0012, "reward": 2.3859703540802, "reward_std": 0.09168269485235214, "rewards/accuracy_reward": 0.8911787271499634, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.328125, "step": 907 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 333.875, "epoch": 0.008790698124715609, "grad_norm": 2.0415367539321068, "kl": 0.033203125, "learning_rate": 9.998097600066185e-07, "loss": 0.0013, "reward": 1.5291016101837158, "reward_std": 0.18905645608901978, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1207682341337204, "step": 908 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 372.9583435058594, "epoch": 0.008800379510315516, "grad_norm": 2.391968524089246, "kl": 0.037841796875, "learning_rate": 9.998093403092233e-07, "loss": 0.0015, "reward": 1.7618215084075928, "reward_std": 0.216415137052536, "rewards/accuracy_reward": 0.47148117423057556, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1820068359375, "step": 909 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 353.25, "epoch": 0.008810060895915423, "grad_norm": 3.189356445789948, "kl": 0.041259765625, "learning_rate": 9.998089201494697e-07, "loss": 0.0017, "reward": 1.9650052785873413, "reward_std": 0.3307151794433594, "rewards/accuracy_reward": 0.5786608457565308, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2530110776424408, "step": 910 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 361.41668701171875, "epoch": 0.00881974228151533, "grad_norm": 2.854024452931709, "kl": 0.039306640625, "learning_rate": 9.998084995273574e-07, "loss": 0.0016, "reward": 2.0294904708862305, "reward_std": 0.2544848918914795, "rewards/accuracy_reward": 0.6425926089286804, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.278564453125, "step": 911 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 384.0833435058594, "epoch": 0.008829423667115237, "grad_norm": 2.111943829484718, "kl": 0.03466796875, "learning_rate": 9.99808078442887e-07, "loss": 0.0014, "reward": 1.6830005645751953, "reward_std": 0.07843232154846191, "rewards/accuracy_reward": 0.3642424941062927, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2270914763212204, "step": 912 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 385.8333435058594, "epoch": 0.008839105052715145, "grad_norm": 2.2933934412635257, "kl": 0.040771484375, "learning_rate": 9.998076568960592e-07, "loss": 0.0016, "reward": 1.8020648956298828, "reward_std": 0.3022570312023163, "rewards/accuracy_reward": 0.4428851008415222, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2425130307674408, "step": 913 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 438.79168701171875, "epoch": 0.008848786438315052, "grad_norm": 2.79940676392292, "kl": 0.034912109375, "learning_rate": 9.99807234886874e-07, "loss": 0.0014, "reward": 1.21156907081604, "reward_std": 0.14720739424228668, "rewards/accuracy_reward": 0.10425296425819397, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.0823160856962204, "step": 914 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 401.0833435058594, "epoch": 0.008858467823914959, "grad_norm": 2.574838694068105, "kl": 0.038818359375, "learning_rate": 9.998068124153319e-07, "loss": 0.0016, "reward": 1.900955080986023, "reward_std": 0.058811675757169724, "rewards/accuracy_reward": 0.45282667875289917, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2481282651424408, "step": 915 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 355.16668701171875, "epoch": 0.008868149209514866, "grad_norm": 2.661738844835555, "kl": 0.041259765625, "learning_rate": 9.998063894814333e-07, "loss": 0.0016, "reward": 2.3628029823303223, "reward_std": 0.10983140766620636, "rewards/accuracy_reward": 0.9032325148582458, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3095703125, "step": 916 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 385.5833435058594, "epoch": 0.008877830595114773, "grad_norm": 3.9966603354538814, "kl": 0.039794921875, "learning_rate": 9.998059660851785e-07, "loss": 0.0016, "reward": 1.678840160369873, "reward_std": 0.08574768155813217, "rewards/accuracy_reward": 0.39557191729545593, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1832682341337204, "step": 917 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 444.7083435058594, "epoch": 0.00888751198071468, "grad_norm": 10.877948693357062, "kl": 0.035888671875, "learning_rate": 9.998055422265681e-07, "loss": 0.0014, "reward": 1.9753177165985107, "reward_std": 0.3516260087490082, "rewards/accuracy_reward": 0.7080975770950317, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.1922200620174408, "step": 918 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 408.4583435058594, "epoch": 0.008897193366314587, "grad_norm": 2.8346011893264564, "kl": 0.037109375, "learning_rate": 9.998051179056025e-07, "loss": 0.0015, "reward": 2.079036235809326, "reward_std": 0.0668669044971466, "rewards/accuracy_reward": 0.6054278612136841, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2819417417049408, "step": 919 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 456.66668701171875, "epoch": 0.008906874751914495, "grad_norm": 2.4187213804991807, "kl": 0.02880859375, "learning_rate": 9.998046931222818e-07, "loss": 0.0012, "reward": 1.8714098930358887, "reward_std": 0.5019924640655518, "rewards/accuracy_reward": 0.5611071586608887, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2686360776424408, "step": 920 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 431.75, "epoch": 0.008916556137514402, "grad_norm": 3.569037823871858, "kl": 0.044189453125, "learning_rate": 9.998042678766067e-07, "loss": 0.0018, "reward": 1.7450757026672363, "reward_std": 0.08717718720436096, "rewards/accuracy_reward": 0.3141675591468811, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2642415463924408, "step": 921 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 372.5833435058594, "epoch": 0.008926237523114309, "grad_norm": 2.357394100815414, "kl": 0.049072265625, "learning_rate": 9.998038421685775e-07, "loss": 0.002, "reward": 1.5711110830307007, "reward_std": 0.40854400396347046, "rewards/accuracy_reward": 0.3202158212661743, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1675618588924408, "step": 922 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 376.25, "epoch": 0.008935918908714216, "grad_norm": 1.943540799326009, "kl": 0.030517578125, "learning_rate": 9.998034159981946e-07, "loss": 0.0012, "reward": 1.8425509929656982, "reward_std": 0.3757389485836029, "rewards/accuracy_reward": 0.5087943077087402, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2254231870174408, "step": 923 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 401.5, "epoch": 0.008945600294314121, "grad_norm": 3.343456131297455, "kl": 0.037353515625, "learning_rate": 9.998029893654582e-07, "loss": 0.0015, "reward": 2.270510673522949, "reward_std": 0.10547284781932831, "rewards/accuracy_reward": 0.7656115293502808, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3548991084098816, "step": 924 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 452.29168701171875, "epoch": 0.008955281679914029, "grad_norm": 3.0010064797144382, "kl": 0.0306396484375, "learning_rate": 9.998025622703688e-07, "loss": 0.0012, "reward": 1.5733197927474976, "reward_std": 0.36914730072021484, "rewards/accuracy_reward": 0.35640883445739746, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1669108122587204, "step": 925 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 480.7083435058594, "epoch": 0.008964963065513936, "grad_norm": 1.6929155541255565, "kl": 0.02197265625, "learning_rate": 9.998021347129271e-07, "loss": 0.0009, "reward": 1.188867211341858, "reward_std": 0.40408024191856384, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0472005233168602, "step": 926 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 418.8333435058594, "epoch": 0.008974644451113843, "grad_norm": 6.321980925868353, "kl": 0.04345703125, "learning_rate": 9.99801706693133e-07, "loss": 0.0017, "reward": 2.213196277618408, "reward_std": 0.0881272479891777, "rewards/accuracy_reward": 0.7670701742172241, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2794596552848816, "step": 927 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 410.7083435058594, "epoch": 0.00898432583671375, "grad_norm": 3.012636547737593, "kl": 0.048095703125, "learning_rate": 9.998012782109874e-07, "loss": 0.0019, "reward": 2.1128363609313965, "reward_std": 0.33557313680648804, "rewards/accuracy_reward": 0.7139267921447754, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2572428584098816, "step": 928 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 427.04168701171875, "epoch": 0.008994007222313657, "grad_norm": 7.070012127851026, "kl": 0.03955078125, "learning_rate": 9.998008492664903e-07, "loss": 0.0016, "reward": 1.729219675064087, "reward_std": 0.22356432676315308, "rewards/accuracy_reward": 0.37889406085014343, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2503255307674408, "step": 929 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.79168701171875, "epoch": 0.009003688607913564, "grad_norm": 6.074673333247969, "kl": 0.03759765625, "learning_rate": 9.998004198596422e-07, "loss": 0.0015, "reward": 2.5291993618011475, "reward_std": 0.1017666906118393, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3875325620174408, "step": 930 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 367.3333435058594, "epoch": 0.009013369993513471, "grad_norm": 2.274160658048065, "kl": 0.053466796875, "learning_rate": 9.997999899904435e-07, "loss": 0.0021, "reward": 1.962744116783142, "reward_std": 0.051901645958423615, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.187744140625, "step": 931 }, { "all_correct": 0.0, "all_wrong": 1.0, "completion_length": 395.4583435058594, "epoch": 0.009023051379113379, "grad_norm": 0.14555124790886229, "kl": 0.047607421875, "learning_rate": 9.997995596588946e-07, "loss": 0.0019, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.0, "step": 932 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 385.0, "epoch": 0.009032732764713286, "grad_norm": 1.8637371620264285, "kl": 0.037841796875, "learning_rate": 9.99799128864996e-07, "loss": 0.0015, "reward": 1.4870498180389404, "reward_std": 0.23815767467021942, "rewards/accuracy_reward": 0.3055555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1148274764418602, "step": 933 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 522.7916870117188, "epoch": 0.009042414150313193, "grad_norm": 2.9672464372109117, "kl": 0.031494140625, "learning_rate": 9.99798697608748e-07, "loss": 0.0013, "reward": 1.8007469177246094, "reward_std": 0.34980309009552, "rewards/accuracy_reward": 0.5083557963371277, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2423909604549408, "step": 934 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 428.625, "epoch": 0.0090520955359131, "grad_norm": 5.6352357358225795, "kl": 0.04638671875, "learning_rate": 9.99798265890151e-07, "loss": 0.0019, "reward": 1.955549716949463, "reward_std": 0.10027619451284409, "rewards/accuracy_reward": 0.4883866012096405, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3004964292049408, "step": 935 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 421.66668701171875, "epoch": 0.009061776921513007, "grad_norm": 2.5540904614602877, "kl": 0.03173828125, "learning_rate": 9.997978337092055e-07, "loss": 0.0013, "reward": 2.115462303161621, "reward_std": 0.5326488018035889, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2237955778837204, "step": 936 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 408.125, "epoch": 0.009071458307112914, "grad_norm": 2.521772231563201, "kl": 0.0322265625, "learning_rate": 9.997974010659118e-07, "loss": 0.0013, "reward": 2.1050689220428467, "reward_std": 0.14901301264762878, "rewards/accuracy_reward": 0.6898751258850098, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2818603515625, "step": 937 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 417.375, "epoch": 0.009081139692712821, "grad_norm": 2.684672925198129, "kl": 0.033203125, "learning_rate": 9.997969679602703e-07, "loss": 0.0013, "reward": 1.575656771659851, "reward_std": 0.3787345886230469, "rewards/accuracy_reward": 0.24502524733543396, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2306315153837204, "step": 938 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 358.41668701171875, "epoch": 0.009090821078312728, "grad_norm": 1.9020627458819959, "kl": 0.047607421875, "learning_rate": 9.997965343922814e-07, "loss": 0.0019, "reward": 1.9450196027755737, "reward_std": 0.2096073478460312, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2283528745174408, "step": 939 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 418.125, "epoch": 0.009100502463912636, "grad_norm": 29.781552147327197, "kl": 0.041259765625, "learning_rate": 9.997961003619454e-07, "loss": 0.0016, "reward": 1.947382926940918, "reward_std": 0.04340456426143646, "rewards/accuracy_reward": 0.5291415452957153, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2182413786649704, "step": 940 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 416.3333435058594, "epoch": 0.009110183849512543, "grad_norm": 1.9512659534204804, "kl": 0.027099609375, "learning_rate": 9.997956658692633e-07, "loss": 0.0011, "reward": 1.8047784566879272, "reward_std": 0.2190152406692505, "rewards/accuracy_reward": 0.4666029214859009, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2131754606962204, "step": 941 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 390.125, "epoch": 0.00911986523511245, "grad_norm": 6.568495770940537, "kl": 0.035888671875, "learning_rate": 9.997952309142346e-07, "loss": 0.0014, "reward": 2.2416775226593018, "reward_std": 0.08640927076339722, "rewards/accuracy_reward": 0.7001897692680359, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.358154296875, "step": 942 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 350.625, "epoch": 0.009129546620712357, "grad_norm": 1.6713300846419974, "kl": 0.04638671875, "learning_rate": 9.997947954968602e-07, "loss": 0.0019, "reward": 1.6907227039337158, "reward_std": 0.28845521807670593, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1573893278837204, "step": 943 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 475.125, "epoch": 0.009139228006312264, "grad_norm": 4.4387427632732415, "kl": 0.033203125, "learning_rate": 9.997943596171405e-07, "loss": 0.0013, "reward": 1.9617650508880615, "reward_std": 0.42828863859176636, "rewards/accuracy_reward": 0.7222222685813904, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.2395426481962204, "step": 944 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 407.0833435058594, "epoch": 0.009148909391912171, "grad_norm": 3.2830927034882635, "kl": 0.04638671875, "learning_rate": 9.997939232750759e-07, "loss": 0.0019, "reward": 1.7808046340942383, "reward_std": 0.10026721656322479, "rewards/accuracy_reward": 0.37335842847824097, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2574462890625, "step": 945 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 351.66668701171875, "epoch": 0.009158590777512077, "grad_norm": 1.8987914116291618, "kl": 0.0419921875, "learning_rate": 9.997934864706666e-07, "loss": 0.0017, "reward": 1.8937010765075684, "reward_std": 0.5033270716667175, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2103678435087204, "step": 946 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 429.25, "epoch": 0.009168272163111984, "grad_norm": 2.883946093688434, "kl": 0.0380859375, "learning_rate": 9.99793049203913e-07, "loss": 0.0015, "reward": 1.7898566722869873, "reward_std": 0.08332045376300812, "rewards/accuracy_reward": 0.33284157514572144, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.273681640625, "step": 947 }, { "all_correct": 0.0, "all_wrong": 0.6666666666666666, "completion_length": 413.75, "epoch": 0.009177953548711891, "grad_norm": 2.157426282308408, "kl": 0.041015625, "learning_rate": 9.997926114748159e-07, "loss": 0.0016, "reward": 1.2658171653747559, "reward_std": 0.02447371929883957, "rewards/accuracy_reward": 0.1110156700015068, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.088134765625, "step": 948 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 380.41668701171875, "epoch": 0.009187634934311798, "grad_norm": 1.689715673888824, "kl": 0.0419921875, "learning_rate": 9.997921732833753e-07, "loss": 0.0017, "reward": 1.7659019231796265, "reward_std": 0.05876173824071884, "rewards/accuracy_reward": 0.43033072352409363, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2105712890625, "step": 949 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 348.5833435058594, "epoch": 0.009197316319911705, "grad_norm": 4.283568952082578, "kl": 0.04736328125, "learning_rate": 9.997917346295918e-07, "loss": 0.0019, "reward": 2.105259656906128, "reward_std": 0.08708351105451584, "rewards/accuracy_reward": 0.6396833658218384, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2989095151424408, "step": 950 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 374.8333435058594, "epoch": 0.009206997705511612, "grad_norm": 5.971305637460061, "kl": 0.05126953125, "learning_rate": 9.997912955134659e-07, "loss": 0.0021, "reward": 2.079435348510742, "reward_std": 0.28008192777633667, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2127685546875, "step": 951 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 371.16668701171875, "epoch": 0.00921667909111152, "grad_norm": 1.6726936758574609, "kl": 0.04833984375, "learning_rate": 9.997908559349979e-07, "loss": 0.0019, "reward": 1.6068925857543945, "reward_std": 0.19697873294353485, "rewards/accuracy_reward": 0.37938594818115234, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1525065153837204, "step": 952 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 470.2083435058594, "epoch": 0.009226360476711427, "grad_norm": 3.3348956989672245, "kl": 0.03369140625, "learning_rate": 9.99790415894188e-07, "loss": 0.0013, "reward": 1.9828613996505737, "reward_std": 0.36793312430381775, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2245279997587204, "step": 953 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 351.625, "epoch": 0.009236041862311334, "grad_norm": 2.0487628105539426, "kl": 0.0458984375, "learning_rate": 9.997899753910369e-07, "loss": 0.0018, "reward": 1.720963478088379, "reward_std": 0.2842136025428772, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.154296875, "step": 954 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 380.7083435058594, "epoch": 0.009245723247911241, "grad_norm": 5.989510953431908, "kl": 0.046630859375, "learning_rate": 9.997895344255448e-07, "loss": 0.0019, "reward": 1.5997509956359863, "reward_std": 0.05671258270740509, "rewards/accuracy_reward": 0.2748892307281494, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1915283203125, "step": 955 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 496.54168701171875, "epoch": 0.009255404633511148, "grad_norm": 5.212969118576818, "kl": 0.03173828125, "learning_rate": 9.997890929977122e-07, "loss": 0.0013, "reward": 1.5320909023284912, "reward_std": 0.38178691267967224, "rewards/accuracy_reward": 0.36981067061424255, "rewards/format_reward": 0.875, "rewards/semantic_reward": 0.1706136167049408, "step": 956 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 373.0, "epoch": 0.009265086019111055, "grad_norm": 2.5176662546040265, "kl": 0.038330078125, "learning_rate": 9.997886511075395e-07, "loss": 0.0015, "reward": 1.8160731792449951, "reward_std": 0.3836252689361572, "rewards/accuracy_reward": 0.5076989531517029, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2083740234375, "step": 957 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 404.2083435058594, "epoch": 0.009274767404710962, "grad_norm": 2.283085723876674, "kl": 0.034912109375, "learning_rate": 9.997882087550271e-07, "loss": 0.0014, "reward": 1.560039758682251, "reward_std": 0.2871273458003998, "rewards/accuracy_reward": 0.2810845971107483, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.178955078125, "step": 958 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 402.375, "epoch": 0.00928444879031087, "grad_norm": 3.651206531863228, "kl": 0.0478515625, "learning_rate": 9.997877659401756e-07, "loss": 0.0019, "reward": 1.7366937398910522, "reward_std": 0.07025589793920517, "rewards/accuracy_reward": 0.4463617205619812, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1736653745174408, "step": 959 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 440.875, "epoch": 0.009294130175910777, "grad_norm": 1.857722844816389, "kl": 0.032470703125, "learning_rate": 9.99787322662985e-07, "loss": 0.0013, "reward": 1.731510043144226, "reward_std": 0.33382365107536316, "rewards/accuracy_reward": 0.49029093980789185, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.2078857421875, "step": 960 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 387.5833435058594, "epoch": 0.009303811561510684, "grad_norm": 2.6790316677152064, "kl": 0.04833984375, "learning_rate": 9.99786878923456e-07, "loss": 0.0019, "reward": 2.315002202987671, "reward_std": 0.23259298503398895, "rewards/accuracy_reward": 0.7698279619216919, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3868408203125, "step": 961 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 334.875, "epoch": 0.009313492947110591, "grad_norm": 4.267475776554868, "kl": 0.046875, "learning_rate": 9.997864347215891e-07, "loss": 0.0019, "reward": 1.643322229385376, "reward_std": 0.06285049021244049, "rewards/accuracy_reward": 0.4128492474555969, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1721394956111908, "step": 962 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 399.79168701171875, "epoch": 0.009323174332710498, "grad_norm": 2.1469266539711405, "kl": 0.05078125, "learning_rate": 9.997859900573845e-07, "loss": 0.002, "reward": 1.8772515058517456, "reward_std": 0.04298488423228264, "rewards/accuracy_reward": 0.5555555820465088, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1966959685087204, "step": 963 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 445.16668701171875, "epoch": 0.009332855718310405, "grad_norm": 3.8129973820536067, "kl": 0.040283203125, "learning_rate": 9.997855449308429e-07, "loss": 0.0016, "reward": 1.8356618881225586, "reward_std": 0.07862619310617447, "rewards/accuracy_reward": 0.326099693775177, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3428955078125, "step": 964 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 425.54168701171875, "epoch": 0.009342537103910312, "grad_norm": 2.0034460770071196, "kl": 0.0380859375, "learning_rate": 9.997850993419641e-07, "loss": 0.0015, "reward": 2.060753583908081, "reward_std": 0.3160852789878845, "rewards/accuracy_reward": 0.632774829864502, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2613118588924408, "step": 965 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 440.8333435058594, "epoch": 0.00935221848951022, "grad_norm": 2.613011118271196, "kl": 0.038818359375, "learning_rate": 9.99784653290749e-07, "loss": 0.0016, "reward": 1.9894421100616455, "reward_std": 0.31065165996551514, "rewards/accuracy_reward": 0.613815426826477, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2256266325712204, "step": 966 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 448.16668701171875, "epoch": 0.009361899875110125, "grad_norm": 1.987792815447964, "kl": 0.033447265625, "learning_rate": 9.99784206777198e-07, "loss": 0.0013, "reward": 1.5562989711761475, "reward_std": 0.19066809117794037, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1229654997587204, "step": 967 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 422.75, "epoch": 0.009371581260710032, "grad_norm": 1.93599021242813, "kl": 0.04443359375, "learning_rate": 9.997837598013114e-07, "loss": 0.0018, "reward": 1.8530428409576416, "reward_std": 0.2572658061981201, "rewards/accuracy_reward": 0.4628327786922455, "rewards/format_reward": 0.9583333730697632, "rewards/semantic_reward": 0.2568766474723816, "step": 968 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 436.5833435058594, "epoch": 0.00938126264630994, "grad_norm": 1.9795564868603657, "kl": 0.03759765625, "learning_rate": 9.997833123630897e-07, "loss": 0.0015, "reward": 1.9794210195541382, "reward_std": 0.3300279378890991, "rewards/accuracy_reward": 0.556121826171875, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2566325068473816, "step": 969 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 417.75, "epoch": 0.009390944031909846, "grad_norm": 3.9060855689689573, "kl": 0.038818359375, "learning_rate": 9.99782864462533e-07, "loss": 0.0016, "reward": 2.0304527282714844, "reward_std": 0.06243254989385605, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2554524838924408, "step": 970 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 396.79168701171875, "epoch": 0.009400625417509753, "grad_norm": 3.0392490836233836, "kl": 0.04345703125, "learning_rate": 9.99782416099642e-07, "loss": 0.0017, "reward": 2.0075268745422363, "reward_std": 0.07055248320102692, "rewards/accuracy_reward": 0.5356597900390625, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2885335385799408, "step": 971 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 412.8333435058594, "epoch": 0.00941030680310966, "grad_norm": 2.1859190848196777, "kl": 0.03955078125, "learning_rate": 9.997819672744174e-07, "loss": 0.0016, "reward": 2.09552264213562, "reward_std": 0.07349695265293121, "rewards/accuracy_reward": 0.5490705966949463, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3631184995174408, "step": 972 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 441.4583435058594, "epoch": 0.009419988188709568, "grad_norm": 5.0393581063522745, "kl": 0.041259765625, "learning_rate": 9.997815179868592e-07, "loss": 0.0017, "reward": 1.81057870388031, "reward_std": 0.24851304292678833, "rewards/accuracy_reward": 0.4706779420375824, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2315673828125, "step": 973 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 387.5833435058594, "epoch": 0.009429669574309475, "grad_norm": 2.4909535655542836, "kl": 0.049072265625, "learning_rate": 9.99781068236968e-07, "loss": 0.002, "reward": 1.3432598114013672, "reward_std": 0.2750215530395508, "rewards/accuracy_reward": 0.18805158138275146, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1302083432674408, "step": 974 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 411.25, "epoch": 0.009439350959909382, "grad_norm": 6.235611377940446, "kl": 0.046630859375, "learning_rate": 9.997806180247439e-07, "loss": 0.0019, "reward": 1.6530817747116089, "reward_std": 0.03620843589305878, "rewards/accuracy_reward": 0.34608301520347595, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1736653745174408, "step": 975 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 425.2083435058594, "epoch": 0.00944903234550929, "grad_norm": 3.4733937996916406, "kl": 0.041748046875, "learning_rate": 9.997801673501874e-07, "loss": 0.0017, "reward": 1.9594587087631226, "reward_std": 0.058403126895427704, "rewards/accuracy_reward": 0.5169904232025146, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2591349482536316, "step": 976 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 411.8333435058594, "epoch": 0.009458713731109196, "grad_norm": 1.7159346264114927, "kl": 0.0361328125, "learning_rate": 9.997797162132995e-07, "loss": 0.0014, "reward": 1.814770221710205, "reward_std": 0.1954699456691742, "rewards/accuracy_reward": 0.4654945731163025, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2159423828125, "step": 977 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 392.875, "epoch": 0.009468395116709103, "grad_norm": 2.8578154019429576, "kl": 0.0537109375, "learning_rate": 9.9977926461408e-07, "loss": 0.0021, "reward": 2.0216450691223145, "reward_std": 0.09773099422454834, "rewards/accuracy_reward": 0.4951638877391815, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3514811396598816, "step": 978 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 379.79168701171875, "epoch": 0.00947807650230901, "grad_norm": 3.149216388619217, "kl": 0.050537109375, "learning_rate": 9.997788125525295e-07, "loss": 0.002, "reward": 2.0894532203674316, "reward_std": 0.08597581088542938, "rewards/accuracy_reward": 0.6436606049537659, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2707926630973816, "step": 979 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.6666666666666666, "completion_length": 346.75, "epoch": 0.009487757887908918, "grad_norm": 2.0696970951422413, "kl": 0.052978515625, "learning_rate": 9.997783600286484e-07, "loss": 0.0021, "reward": 1.488313913345337, "reward_std": 0.04637616500258446, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.10498046875, "step": 980 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 397.79168701171875, "epoch": 0.009497439273508825, "grad_norm": 1.8918282530565296, "kl": 0.03662109375, "learning_rate": 9.99777907042437e-07, "loss": 0.0015, "reward": 1.6658529043197632, "reward_std": 0.452862024307251, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1658528745174408, "step": 981 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 380.125, "epoch": 0.009507120659108732, "grad_norm": 3.4091895419683955, "kl": 0.0322265625, "learning_rate": 9.99777453593896e-07, "loss": 0.0013, "reward": 2.1194405555725098, "reward_std": 0.2650601863861084, "rewards/accuracy_reward": 0.6707016825675964, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2904052734375, "step": 982 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 450.625, "epoch": 0.009516802044708639, "grad_norm": 2.364210088564988, "kl": 0.035888671875, "learning_rate": 9.997769996830258e-07, "loss": 0.0014, "reward": 1.68663489818573, "reward_std": 0.21388956904411316, "rewards/accuracy_reward": 0.46035706996917725, "rewards/format_reward": 0.9166666865348816, "rewards/semantic_reward": 0.1846110075712204, "step": 983 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.3333333333333333, "completion_length": 390.5, "epoch": 0.009526483430308546, "grad_norm": 2.860724194931765, "kl": 0.03515625, "learning_rate": 9.997765453098265e-07, "loss": 0.0014, "reward": 2.0063884258270264, "reward_std": 0.038253944367170334, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2147216796875, "step": 984 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 384.0, "epoch": 0.009536164815908453, "grad_norm": 1.7507804209507176, "kl": 0.03955078125, "learning_rate": 9.997760904742989e-07, "loss": 0.0016, "reward": 1.9671783447265625, "reward_std": 0.2841596305370331, "rewards/accuracy_reward": 0.5764719247817993, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2657063901424408, "step": 985 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 414.66668701171875, "epoch": 0.00954584620150836, "grad_norm": 8.497618952696895, "kl": 0.0361328125, "learning_rate": 9.99775635176443e-07, "loss": 0.0014, "reward": 1.6790037155151367, "reward_std": 0.07991504669189453, "rewards/accuracy_reward": 0.41048163175582886, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1935221403837204, "step": 986 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 449.79168701171875, "epoch": 0.009555527587108268, "grad_norm": 1.584405338577175, "kl": 0.039794921875, "learning_rate": 9.997751794162595e-07, "loss": 0.0016, "reward": 1.4148097038269043, "reward_std": 0.05662819743156433, "rewards/accuracy_reward": 0.13231047987937927, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.16583251953125, "step": 987 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 379.25, "epoch": 0.009565208972708175, "grad_norm": 2.8164933642005954, "kl": 0.038330078125, "learning_rate": 9.997747231937489e-07, "loss": 0.0015, "reward": 1.6957666873931885, "reward_std": 0.051186077296733856, "rewards/accuracy_reward": 0.4124739170074463, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1666259765625, "step": 988 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 392.2083435058594, "epoch": 0.00957489035830808, "grad_norm": 2.4624283355253938, "kl": 0.0419921875, "learning_rate": 9.997742665089116e-07, "loss": 0.0017, "reward": 2.4670655727386475, "reward_std": 0.07193697988986969, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2837321162223816, "step": 989 }, { "all_correct": 0.6666666666666666, "all_wrong": 0.0, "completion_length": 376.5, "epoch": 0.009584571743907987, "grad_norm": 4.547566296574692, "kl": 0.050048828125, "learning_rate": 9.997738093617476e-07, "loss": 0.002, "reward": 1.982313871383667, "reward_std": 0.09433700889348984, "rewards/accuracy_reward": 0.5781227946281433, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2958577573299408, "step": 990 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 418.2083435058594, "epoch": 0.009594253129507894, "grad_norm": 1.196858958080209, "kl": 0.027099609375, "learning_rate": 9.99773351752258e-07, "loss": 0.0011, "reward": 1.783186912536621, "reward_std": 0.2946337163448334, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1831868588924408, "step": 991 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 393.625, "epoch": 0.009603934515107802, "grad_norm": 2.709155457061636, "kl": 0.051025390625, "learning_rate": 9.997728936804425e-07, "loss": 0.002, "reward": 1.8427371978759766, "reward_std": 0.13703112304210663, "rewards/accuracy_reward": 0.4496626555919647, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2347412109375, "step": 992 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 372.8333435058594, "epoch": 0.009613615900707709, "grad_norm": 2.7022999326423096, "kl": 0.048095703125, "learning_rate": 9.99772435146302e-07, "loss": 0.0019, "reward": 2.0914177894592285, "reward_std": 0.04357539862394333, "rewards/accuracy_reward": 0.6081331968307495, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2832845151424408, "step": 993 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.3333333333333333, "completion_length": 414.16668701171875, "epoch": 0.009623297286307616, "grad_norm": 13.96995074966927, "kl": 0.043701171875, "learning_rate": 9.99771976149837e-07, "loss": 0.0017, "reward": 1.7097347974777222, "reward_std": 0.06658726185560226, "rewards/accuracy_reward": 0.32658863067626953, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2664794921875, "step": 994 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 381.3333435058594, "epoch": 0.009632978671907523, "grad_norm": 7.30006038705351, "kl": 0.05029296875, "learning_rate": 9.997715166910475e-07, "loss": 0.002, "reward": 1.9419893026351929, "reward_std": 0.12115972489118576, "rewards/accuracy_reward": 0.5396698117256165, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2939860224723816, "step": 995 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 389.5, "epoch": 0.00964266005750743, "grad_norm": 1.783621224289053, "kl": 0.038330078125, "learning_rate": 9.997710567699345e-07, "loss": 0.0015, "reward": 1.5663142204284668, "reward_std": 0.2808592915534973, "rewards/accuracy_reward": 0.3188532590866089, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1474609375, "step": 996 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 400.16668701171875, "epoch": 0.009652341443107337, "grad_norm": 1.852509140634956, "kl": 0.038818359375, "learning_rate": 9.99770596386498e-07, "loss": 0.0015, "reward": 1.43021559715271, "reward_std": 0.21448907256126404, "rewards/accuracy_reward": 0.22449442744255066, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1140543669462204, "step": 997 }, { "all_correct": 0.0, "all_wrong": 0.3333333333333333, "completion_length": 342.0, "epoch": 0.009662022828707244, "grad_norm": 2.0254409928425687, "kl": 0.0478515625, "learning_rate": 9.997701355407381e-07, "loss": 0.0019, "reward": 1.3957475423812866, "reward_std": 0.28440195322036743, "rewards/accuracy_reward": 0.2203974574804306, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.1086832731962204, "step": 998 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 408.54168701171875, "epoch": 0.009671704214307152, "grad_norm": 1.8168681350749403, "kl": 0.03857421875, "learning_rate": 9.99769674232656e-07, "loss": 0.0015, "reward": 2.1519343852996826, "reward_std": 0.31413784623146057, "rewards/accuracy_reward": 0.7076147794723511, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.3026530146598816, "step": 999 }, { "all_correct": 0.3333333333333333, "all_wrong": 0.0, "completion_length": 410.9583435058594, "epoch": 0.009681385599907059, "grad_norm": 2.909246938686589, "kl": 0.04443359375, "learning_rate": 9.997692124622518e-07, "loss": 0.0018, "reward": 1.895688772201538, "reward_std": 0.08162929862737656, "rewards/accuracy_reward": 0.46718114614486694, "rewards/format_reward": 1.0, "rewards/semantic_reward": 0.2618408203125, "step": 1000 } ], "logging_steps": 1.0, "max_steps": 103291, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }