{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.48, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 257.03907012939453, "epoch": 0.0010666666666666667, "grad_norm": 0.16667956716604723, "kl": 0.0, "learning_rate": 3.7037037037037036e-07, "loss": -0.0, "reward": 0.02083333395421505, "reward_std": 0.053838133811950684, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.02083333395421505, "step": 2 }, { "completion_length": 260.90625762939453, "epoch": 0.0021333333333333334, "grad_norm": 0.2666116145187317, "kl": 0.0005308389663696289, "learning_rate": 7.407407407407407e-07, "loss": 0.0, "reward": 0.03385416744276881, "reward_std": 0.09575404319912195, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.03385416744276881, "step": 4 }, { "completion_length": 274.26042556762695, "epoch": 0.0032, "grad_norm": 0.16478154469097706, "kl": 0.0006362199783325195, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "reward": 0.02864583395421505, "reward_std": 0.07084779068827629, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.02864583395421505, "step": 6 }, { "completion_length": 273.91407108306885, "epoch": 0.004266666666666667, "grad_norm": 0.29685789652942896, "kl": 0.0008223056793212891, "learning_rate": 1.4814814814814815e-06, "loss": 0.0, "reward": 0.0572916679084301, "reward_std": 0.16204530373215675, "rewards/equation_reward_func": 0.0026041667442768812, "rewards/format_reward_func": 0.05468750116415322, "step": 8 }, { "completion_length": 210.3020887374878, "epoch": 0.005333333333333333, "grad_norm": 1.4208031353553083, "kl": 0.006844520568847656, "learning_rate": 1.8518518518518519e-06, "loss": 0.0, "reward": 0.19270833698101342, "reward_std": 0.3526948341168463, "rewards/equation_reward_func": 0.0052083334885537624, "rewards/format_reward_func": 0.18750000302679837, "step": 10 }, { "completion_length": 143.4218807220459, "epoch": 0.0064, "grad_norm": 0.382802360237772, "kl": 0.0333251953125, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "reward": 0.5468750167638063, "reward_std": 0.46607458777725697, "rewards/equation_reward_func": 0.007812500232830644, "rewards/format_reward_func": 0.5390625167638063, "step": 12 }, { "completion_length": 93.60156536102295, "epoch": 0.007466666666666667, "grad_norm": 0.3929464752272363, "kl": 0.081390380859375, "learning_rate": 2.5925925925925925e-06, "loss": 0.0001, "reward": 0.7812500186264515, "reward_std": 0.4070173464715481, "rewards/equation_reward_func": 0.007812500232830644, "rewards/format_reward_func": 0.7734375186264515, "step": 14 }, { "completion_length": 58.67448091506958, "epoch": 0.008533333333333334, "grad_norm": 0.19679118997240852, "kl": 0.112762451171875, "learning_rate": 2.962962962962963e-06, "loss": 0.0001, "reward": 0.9713541939854622, "reward_std": 0.1053980034776032, "rewards/equation_reward_func": 0.0052083334885537624, "rewards/format_reward_func": 0.9661458507180214, "step": 16 }, { "completion_length": 45.17708492279053, "epoch": 0.0096, "grad_norm": 0.1009675402276491, "kl": 0.18695068359375, "learning_rate": 3.3333333333333333e-06, "loss": 0.0002, "reward": 1.0078125149011612, "reward_std": 0.03174104681238532, "rewards/equation_reward_func": 0.010416666977107525, "rewards/format_reward_func": 0.9973958358168602, "step": 18 }, { "completion_length": 41.58073043823242, "epoch": 0.010666666666666666, "grad_norm": 0.18166331794796514, "kl": 0.243408203125, "learning_rate": 3.7037037037037037e-06, "loss": 0.0002, "reward": 1.0338541939854622, "reward_std": 0.08049174910411239, "rewards/equation_reward_func": 0.03645833418704569, "rewards/format_reward_func": 0.9973958358168602, "step": 20 }, { "completion_length": 38.39323043823242, "epoch": 0.011733333333333333, "grad_norm": 0.23006140707579034, "kl": 0.321533203125, "learning_rate": 4.074074074074074e-06, "loss": 0.0003, "reward": 1.0442708656191826, "reward_std": 0.10828368039801717, "rewards/equation_reward_func": 0.04687500139698386, "rewards/format_reward_func": 0.9973958358168602, "step": 22 }, { "completion_length": 37.91927218437195, "epoch": 0.0128, "grad_norm": 0.21888284569395813, "kl": 0.4520263671875, "learning_rate": 4.444444444444444e-06, "loss": 0.0005, "reward": 1.0390625298023224, "reward_std": 0.13415095582604408, "rewards/equation_reward_func": 0.05208333465270698, "rewards/format_reward_func": 0.986979179084301, "step": 24 }, { "completion_length": 40.960939168930054, "epoch": 0.013866666666666666, "grad_norm": 0.16551293898285133, "kl": 0.470703125, "learning_rate": 4.814814814814815e-06, "loss": 0.0005, "reward": 1.1171875223517418, "reward_std": 0.09757464285939932, "rewards/equation_reward_func": 0.11718750465661287, "rewards/format_reward_func": 1.0, "step": 26 }, { "completion_length": 45.05208468437195, "epoch": 0.014933333333333333, "grad_norm": 0.2136799478318177, "kl": 0.30328369140625, "learning_rate": 4.9999838124619495e-06, "loss": 0.0003, "reward": 1.1197916939854622, "reward_std": 0.14553900947794318, "rewards/equation_reward_func": 0.11979166977107525, "rewards/format_reward_func": 1.0, "step": 28 }, { "completion_length": 42.15364742279053, "epoch": 0.016, "grad_norm": 0.1512857212247234, "kl": 0.330078125, "learning_rate": 4.999854313415309e-06, "loss": 0.0003, "reward": 1.1067708507180214, "reward_std": 0.11984641291201115, "rewards/equation_reward_func": 0.10937500256113708, "rewards/format_reward_func": 0.9973958358168602, "step": 30 }, { "completion_length": 40.08593940734863, "epoch": 0.017066666666666667, "grad_norm": 0.11784741952605589, "kl": 0.392333984375, "learning_rate": 4.999595322030074e-06, "loss": 0.0004, "reward": 1.0651041865348816, "reward_std": 0.06803698698058724, "rewards/equation_reward_func": 0.07031250256113708, "rewards/format_reward_func": 0.9947916716337204, "step": 32 }, { "completion_length": 39.947918176651, "epoch": 0.018133333333333335, "grad_norm": 0.13733806131383555, "kl": 0.388916015625, "learning_rate": 4.999206851721985e-06, "loss": 0.0004, "reward": 1.1119792014360428, "reward_std": 0.058393027167767286, "rewards/equation_reward_func": 0.11197917070239782, "rewards/format_reward_func": 1.0, "step": 34 }, { "completion_length": 39.63541793823242, "epoch": 0.0192, "grad_norm": 0.10794547268697512, "kl": 0.3826904296875, "learning_rate": 4.998688922613788e-06, "loss": 0.0004, "reward": 1.2083333656191826, "reward_std": 0.055330058094114065, "rewards/equation_reward_func": 0.21093750628642738, "rewards/format_reward_func": 0.9973958358168602, "step": 36 }, { "completion_length": 39.62760543823242, "epoch": 0.020266666666666665, "grad_norm": 0.07717579011062131, "kl": 0.3720703125, "learning_rate": 4.9980415615341844e-06, "loss": 0.0004, "reward": 1.1223958507180214, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.12239583767950535, "rewards/format_reward_func": 1.0, "step": 38 }, { "completion_length": 39.43229293823242, "epoch": 0.021333333333333333, "grad_norm": 0.004475441139681152, "kl": 0.426025390625, "learning_rate": 4.997264802016451e-06, "loss": 0.0004, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 40 }, { "completion_length": 40.41927194595337, "epoch": 0.0224, "grad_norm": 0.07600011997887009, "kl": 0.382080078125, "learning_rate": 4.9963586842966925e-06, "loss": 0.0004, "reward": 1.1666666865348816, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.16666667046956718, "rewards/format_reward_func": 1.0, "step": 42 }, { "completion_length": 39.54427194595337, "epoch": 0.023466666666666667, "grad_norm": 0.13650399282040712, "kl": 0.5513916015625, "learning_rate": 4.995323255311768e-06, "loss": 0.0006, "reward": 1.1197916865348816, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.1197916716337204, "rewards/format_reward_func": 1.0, "step": 44 }, { "completion_length": 39.74479293823242, "epoch": 0.024533333333333334, "grad_norm": 0.030454015411769154, "kl": 0.460693359375, "learning_rate": 4.994158568696849e-06, "loss": 0.0005, "reward": 1.1223958507180214, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9973958358168602, "step": 46 }, { "completion_length": 39.94010591506958, "epoch": 0.0256, "grad_norm": 0.003356232253432759, "kl": 0.37353515625, "learning_rate": 4.9928646847826494e-06, "loss": 0.0004, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 48 }, { "completion_length": 40.773438453674316, "epoch": 0.02666666666666667, "grad_norm": 0.06621469115063311, "kl": 0.3673095703125, "learning_rate": 4.991441670592297e-06, "loss": 0.0004, "reward": 1.1354166939854622, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.14062500558793545, "rewards/format_reward_func": 0.9947916716337204, "step": 50 }, { "completion_length": 39.757813692092896, "epoch": 0.027733333333333332, "grad_norm": 0.10224335905392065, "kl": 0.40576171875, "learning_rate": 4.989889599837861e-06, "loss": 0.0004, "reward": 1.1562500149011612, "reward_std": 0.019287919625639915, "rewards/equation_reward_func": 0.1562500037252903, "rewards/format_reward_func": 1.0, "step": 52 }, { "completion_length": 40.093751668930054, "epoch": 0.0288, "grad_norm": 0.001198027095161411, "kl": 0.417724609375, "learning_rate": 4.988208552916535e-06, "loss": 0.0004, "reward": 1.104166679084301, "reward_std": 0.0, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 54 }, { "completion_length": 39.703125953674316, "epoch": 0.029866666666666666, "grad_norm": 0.002934931774677214, "kl": 0.4046630859375, "learning_rate": 4.986398616906474e-06, "loss": 0.0004, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 56 }, { "completion_length": 39.54427170753479, "epoch": 0.030933333333333334, "grad_norm": 0.0008224222821494128, "kl": 0.4183349609375, "learning_rate": 4.984459885562277e-06, "loss": 0.0004, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 58 }, { "completion_length": 39.23177194595337, "epoch": 0.032, "grad_norm": 0.0008905449388522471, "kl": 0.59033203125, "learning_rate": 4.982392459310142e-06, "loss": 0.0006, "reward": 1.2500000298023224, "reward_std": 0.0, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 1.0, "step": 60 }, { "completion_length": 39.171876430511475, "epoch": 0.03306666666666667, "grad_norm": 0.03745122502578824, "kl": 0.6116943359375, "learning_rate": 4.980196445242651e-06, "loss": 0.0006, "reward": 1.2317708656191826, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.23177084024064243, "rewards/format_reward_func": 1.0, "step": 62 }, { "completion_length": 39.68229341506958, "epoch": 0.034133333333333335, "grad_norm": 0.0010746207327056937, "kl": 0.546630859375, "learning_rate": 4.977871957113233e-06, "loss": 0.0005, "reward": 1.1432291865348816, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1432291716337204, "rewards/format_reward_func": 1.0, "step": 64 }, { "completion_length": 39.37760591506958, "epoch": 0.0352, "grad_norm": 0.0049245133708245075, "kl": 0.58203125, "learning_rate": 4.975419115330267e-06, "loss": 0.0006, "reward": 1.0833333432674408, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.08593750186264515, "rewards/format_reward_func": 0.9973958358168602, "step": 66 }, { "completion_length": 40.17708492279053, "epoch": 0.03626666666666667, "grad_norm": 0.0013418804445475946, "kl": 0.556640625, "learning_rate": 4.972838046950844e-06, "loss": 0.0006, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 68 }, { "completion_length": 38.984376430511475, "epoch": 0.037333333333333336, "grad_norm": 0.09348519326051212, "kl": 0.605224609375, "learning_rate": 4.970128885674187e-06, "loss": 0.0006, "reward": 1.1223958507180214, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.9973958358168602, "step": 70 }, { "completion_length": 39.15364718437195, "epoch": 0.0384, "grad_norm": 0.07787056887937681, "kl": 0.627685546875, "learning_rate": 4.967291771834727e-06, "loss": 0.0006, "reward": 1.0833333507180214, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.08333333651535213, "rewards/format_reward_func": 1.0, "step": 72 }, { "completion_length": 39.25520920753479, "epoch": 0.039466666666666664, "grad_norm": 0.004271359696057742, "kl": 0.603515625, "learning_rate": 4.96432685239483e-06, "loss": 0.0006, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 74 }, { "completion_length": 39.72916793823242, "epoch": 0.04053333333333333, "grad_norm": 0.0918627518953965, "kl": 0.578857421875, "learning_rate": 4.961234280937186e-06, "loss": 0.0006, "reward": 1.0390625074505806, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.03906250186264515, "rewards/format_reward_func": 1.0, "step": 76 }, { "completion_length": 39.77864718437195, "epoch": 0.0416, "grad_norm": 0.002745593495479229, "kl": 0.54833984375, "learning_rate": 4.958014217656855e-06, "loss": 0.0005, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 78 }, { "completion_length": 40.30729293823242, "epoch": 0.042666666666666665, "grad_norm": 0.002715027986479562, "kl": 0.5364990234375, "learning_rate": 4.954666829352966e-06, "loss": 0.0005, "reward": 1.2291666939854622, "reward_std": 0.0, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 1.0, "step": 80 }, { "completion_length": 39.507813930511475, "epoch": 0.04373333333333333, "grad_norm": 0.010725358478554874, "kl": 0.626953125, "learning_rate": 4.951192289420082e-06, "loss": 0.0006, "reward": 1.1666666865348816, "reward_std": 0.0, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.0, "step": 82 }, { "completion_length": 39.09375047683716, "epoch": 0.0448, "grad_norm": 0.007005180569909533, "kl": 0.628173828125, "learning_rate": 4.9475907778392095e-06, "loss": 0.0006, "reward": 1.0208333358168602, "reward_std": 0.0, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 1.0, "step": 84 }, { "completion_length": 39.81510519981384, "epoch": 0.04586666666666667, "grad_norm": 0.0041960123506723955, "kl": 0.5367431640625, "learning_rate": 4.943862481168484e-06, "loss": 0.0005, "reward": 1.0208333358168602, "reward_std": 0.0, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 1.0, "step": 86 }, { "completion_length": 39.29948091506958, "epoch": 0.046933333333333334, "grad_norm": 0.15448586546910006, "kl": 0.5098876953125, "learning_rate": 4.940007592533501e-06, "loss": 0.0005, "reward": 1.0963541828095913, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.1015625037252903, "rewards/format_reward_func": 0.9947916679084301, "step": 88 }, { "completion_length": 39.73958492279053, "epoch": 0.048, "grad_norm": 0.10049556303386913, "kl": 0.520751953125, "learning_rate": 4.936026311617316e-06, "loss": 0.0005, "reward": 1.1432291865348816, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.14583333767950535, "rewards/format_reward_func": 0.9973958358168602, "step": 90 }, { "completion_length": 38.924480676651, "epoch": 0.04906666666666667, "grad_norm": 0.03603422286832018, "kl": 0.515380859375, "learning_rate": 4.931918844650096e-06, "loss": 0.0005, "reward": 1.104166679084301, "reward_std": 0.0, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 92 }, { "completion_length": 39.60416793823242, "epoch": 0.050133333333333335, "grad_norm": 0.0017376563849351675, "kl": 0.4222412109375, "learning_rate": 4.927685404398442e-06, "loss": 0.0004, "reward": 1.1666666865348816, "reward_std": 0.0, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.0, "step": 94 }, { "completion_length": 39.27083492279053, "epoch": 0.0512, "grad_norm": 0.0018084406399118448, "kl": 0.3944091796875, "learning_rate": 4.923326210154364e-06, "loss": 0.0004, "reward": 1.0416666716337204, "reward_std": 0.0, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.0, "step": 96 }, { "completion_length": 39.335938930511475, "epoch": 0.05226666666666667, "grad_norm": 0.0020596671344712916, "kl": 0.413330078125, "learning_rate": 4.918841487723926e-06, "loss": 0.0004, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 98 }, { "completion_length": 39.77083468437195, "epoch": 0.05333333333333334, "grad_norm": 0.0023153702711157433, "kl": 0.3978271484375, "learning_rate": 4.91423146941554e-06, "loss": 0.0004, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 100 }, { "completion_length": 40.04427194595337, "epoch": 0.0544, "grad_norm": 0.02991985269562785, "kl": 0.4010009765625, "learning_rate": 4.909496394027945e-06, "loss": 0.0004, "reward": 1.0182291716337204, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 0.9973958358168602, "step": 102 }, { "completion_length": 39.95573043823242, "epoch": 0.055466666666666664, "grad_norm": 0.00700827127434529, "kl": 0.450927734375, "learning_rate": 4.904636506837829e-06, "loss": 0.0005, "reward": 1.104166679084301, "reward_std": 0.0, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 104 }, { "completion_length": 41.34635543823242, "epoch": 0.05653333333333333, "grad_norm": 0.0021462281731712985, "kl": 0.39599609375, "learning_rate": 4.899652059587123e-06, "loss": 0.0004, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 106 }, { "completion_length": 42.47135543823242, "epoch": 0.0576, "grad_norm": 0.1306198917515026, "kl": 0.6517333984375, "learning_rate": 4.894543310469968e-06, "loss": 0.0007, "reward": 1.080729179084301, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.08072916977107525, "rewards/format_reward_func": 1.0, "step": 108 }, { "completion_length": 45.671876430511475, "epoch": 0.058666666666666666, "grad_norm": 0.05674666774672507, "kl": 0.419189453125, "learning_rate": 4.889310524119332e-06, "loss": 0.0004, "reward": 1.080729179084301, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.08072916977107525, "rewards/format_reward_func": 1.0, "step": 110 }, { "completion_length": 48.210938692092896, "epoch": 0.05973333333333333, "grad_norm": 0.0024093189967069406, "kl": 0.3663330078125, "learning_rate": 4.883953971593308e-06, "loss": 0.0004, "reward": 1.104166679084301, "reward_std": 0.0, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 112 }, { "completion_length": 49.726563692092896, "epoch": 0.0608, "grad_norm": 0.10306486059128735, "kl": 0.4376220703125, "learning_rate": 4.8784739303610716e-06, "loss": 0.0004, "reward": 1.2187500298023224, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.2187500074505806, "rewards/format_reward_func": 1.0, "step": 114 }, { "completion_length": 54.01302218437195, "epoch": 0.06186666666666667, "grad_norm": 0.08566641495228024, "kl": 0.48388671875, "learning_rate": 4.872870684288506e-06, "loss": 0.0005, "reward": 1.1432291716337204, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.1432291679084301, "rewards/format_reward_func": 1.0, "step": 116 }, { "completion_length": 53.916668176651, "epoch": 0.06293333333333333, "grad_norm": 0.0033970370223610534, "kl": 0.3331298828125, "learning_rate": 4.8671445236234995e-06, "loss": 0.0003, "reward": 1.1380208507180214, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.1406250037252903, "rewards/format_reward_func": 0.9973958358168602, "step": 118 }, { "completion_length": 47.361980676651, "epoch": 0.064, "grad_norm": 0.002422725370002285, "kl": 0.328369140625, "learning_rate": 4.861295744980914e-06, "loss": 0.0003, "reward": 1.0859375149011612, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.08593750256113708, "rewards/format_reward_func": 1.0, "step": 120 }, { "completion_length": 45.085938453674316, "epoch": 0.06506666666666666, "grad_norm": 0.09144539681757236, "kl": 0.3819580078125, "learning_rate": 4.855324651327212e-06, "loss": 0.0004, "reward": 1.0833333507180214, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.08333333651535213, "rewards/format_reward_func": 1.0, "step": 122 }, { "completion_length": 44.476564168930054, "epoch": 0.06613333333333334, "grad_norm": 0.06930119025478021, "kl": 0.34716796875, "learning_rate": 4.849231551964771e-06, "loss": 0.0003, "reward": 1.0546875074505806, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.05468750186264515, "rewards/format_reward_func": 1.0, "step": 124 }, { "completion_length": 44.11198019981384, "epoch": 0.0672, "grad_norm": 0.07653320037698823, "kl": 0.330322265625, "learning_rate": 4.84301676251586e-06, "loss": 0.0003, "reward": 1.0364583507180214, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.03906250256113708, "rewards/format_reward_func": 0.9973958358168602, "step": 126 }, { "completion_length": 42.25260519981384, "epoch": 0.06826666666666667, "grad_norm": 0.08224101454691443, "kl": 0.3515625, "learning_rate": 4.836680604906284e-06, "loss": 0.0004, "reward": 1.0833333507180214, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.08333333535119891, "rewards/format_reward_func": 1.0, "step": 128 }, { "completion_length": 43.04166793823242, "epoch": 0.06933333333333333, "grad_norm": 0.11911380254992376, "kl": 0.3729248046875, "learning_rate": 4.830223407348719e-06, "loss": 0.0004, "reward": 1.1093750298023224, "reward_std": 0.044194173999130726, "rewards/equation_reward_func": 0.1093750074505806, "rewards/format_reward_func": 1.0, "step": 130 }, { "completion_length": 43.031251192092896, "epoch": 0.0704, "grad_norm": 0.06503473037393004, "kl": 0.326171875, "learning_rate": 4.823645504325699e-06, "loss": 0.0003, "reward": 1.0625000074505806, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.06250000139698386, "rewards/format_reward_func": 1.0, "step": 132 }, { "completion_length": 42.945313453674316, "epoch": 0.07146666666666666, "grad_norm": 0.05776150309528316, "kl": 0.3314208984375, "learning_rate": 4.816947236572301e-06, "loss": 0.0003, "reward": 1.104166679084301, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 134 }, { "completion_length": 44.83854269981384, "epoch": 0.07253333333333334, "grad_norm": 0.0025761779857380747, "kl": 0.33929443359375, "learning_rate": 4.810128951058485e-06, "loss": 0.0003, "reward": 1.1432291865348816, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1432291716337204, "rewards/format_reward_func": 1.0, "step": 136 }, { "completion_length": 45.93229293823242, "epoch": 0.0736, "grad_norm": 0.0029902222204831875, "kl": 0.31256103515625, "learning_rate": 4.803191000971128e-06, "loss": 0.0003, "reward": 1.0208333358168602, "reward_std": 0.0, "rewards/equation_reward_func": 0.02083333395421505, "rewards/format_reward_func": 1.0, "step": 138 }, { "completion_length": 47.15364694595337, "epoch": 0.07466666666666667, "grad_norm": 0.13051464295325377, "kl": 0.341552734375, "learning_rate": 4.796133745695725e-06, "loss": 0.0003, "reward": 1.0911458507180214, "reward_std": 0.01850158115848899, "rewards/equation_reward_func": 0.09114583674818277, "rewards/format_reward_func": 1.0, "step": 140 }, { "completion_length": 49.24739742279053, "epoch": 0.07573333333333333, "grad_norm": 0.04410494742405578, "kl": 0.40576171875, "learning_rate": 4.788957550797778e-06, "loss": 0.0004, "reward": 1.1770833507180214, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.17708333767950535, "rewards/format_reward_func": 1.0, "step": 142 }, { "completion_length": 48.343750953674316, "epoch": 0.0768, "grad_norm": 0.07349435518183887, "kl": 0.3116455078125, "learning_rate": 4.781662788003851e-06, "loss": 0.0003, "reward": 1.1744791939854622, "reward_std": 0.040245057083666325, "rewards/equation_reward_func": 0.17447917233221233, "rewards/format_reward_func": 1.0, "step": 144 }, { "completion_length": 50.382813692092896, "epoch": 0.07786666666666667, "grad_norm": 0.056971661658506964, "kl": 0.3365478515625, "learning_rate": 4.774249835182321e-06, "loss": 0.0003, "reward": 1.0859375074505806, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.08593750186264515, "rewards/format_reward_func": 1.0, "step": 146 }, { "completion_length": 52.835938692092896, "epoch": 0.07893333333333333, "grad_norm": 0.048126955815888425, "kl": 0.3348388671875, "learning_rate": 4.766719076323804e-06, "loss": 0.0003, "reward": 1.1588541865348816, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.1588541716337204, "rewards/format_reward_func": 1.0, "step": 148 }, { "completion_length": 57.02083468437195, "epoch": 0.08, "grad_norm": 0.002053907688626172, "kl": 0.32958984375, "learning_rate": 4.759070901521264e-06, "loss": 0.0003, "reward": 1.1015625149011612, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1015625037252903, "rewards/format_reward_func": 1.0, "step": 150 }, { "completion_length": 59.41406321525574, "epoch": 0.08106666666666666, "grad_norm": 0.0018461288900427132, "kl": 0.36865234375, "learning_rate": 4.751305706949803e-06, "loss": 0.0004, "reward": 1.1067708507180214, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.10677083651535213, "rewards/format_reward_func": 1.0, "step": 152 }, { "completion_length": 63.35156512260437, "epoch": 0.08213333333333334, "grad_norm": 0.0034329725309404507, "kl": 0.3702392578125, "learning_rate": 4.743423894846144e-06, "loss": 0.0004, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 154 }, { "completion_length": 65.22656440734863, "epoch": 0.0832, "grad_norm": 0.10347017950061639, "kl": 0.374267578125, "learning_rate": 4.735425873487791e-06, "loss": 0.0004, "reward": 1.1171875223517418, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.11718750558793545, "rewards/format_reward_func": 1.0, "step": 156 }, { "completion_length": 70.645836353302, "epoch": 0.08426666666666667, "grad_norm": 0.0019201008169428577, "kl": 0.3751220703125, "learning_rate": 4.727312057171885e-06, "loss": 0.0004, "reward": 1.1380208507180214, "reward_std": 0.052520891185849905, "rewards/equation_reward_func": 0.1432291716337204, "rewards/format_reward_func": 0.9947916716337204, "step": 158 }, { "completion_length": 65.62500190734863, "epoch": 0.08533333333333333, "grad_norm": 0.07815871418569881, "kl": 0.423095703125, "learning_rate": 4.719082866193736e-06, "loss": 0.0004, "reward": 1.0546875223517418, "reward_std": 0.033232972491532564, "rewards/equation_reward_func": 0.05468750232830644, "rewards/format_reward_func": 1.0, "step": 160 }, { "completion_length": 62.937501668930054, "epoch": 0.0864, "grad_norm": 0.07996688240084189, "kl": 0.3814697265625, "learning_rate": 4.710738726825059e-06, "loss": 0.0004, "reward": 1.2187500149011612, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.2187500037252903, "rewards/format_reward_func": 1.0, "step": 162 }, { "completion_length": 59.27864742279053, "epoch": 0.08746666666666666, "grad_norm": 0.05361679335053332, "kl": 0.3466796875, "learning_rate": 4.702280071291892e-06, "loss": 0.0003, "reward": 1.1171875149011612, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.1171875037252903, "rewards/format_reward_func": 1.0, "step": 164 }, { "completion_length": 54.197918176651, "epoch": 0.08853333333333334, "grad_norm": 0.07111079844029923, "kl": 0.3455810546875, "learning_rate": 4.693707337752201e-06, "loss": 0.0003, "reward": 1.1145833507180214, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.11458333767950535, "rewards/format_reward_func": 1.0, "step": 166 }, { "completion_length": 51.851563930511475, "epoch": 0.0896, "grad_norm": 0.04708958771989249, "kl": 0.3311767578125, "learning_rate": 4.68502097027319e-06, "loss": 0.0003, "reward": 1.1484375223517418, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.14843750442378223, "rewards/format_reward_func": 1.0, "step": 168 }, { "completion_length": 49.72395992279053, "epoch": 0.09066666666666667, "grad_norm": 0.0018024580260479306, "kl": 0.3154296875, "learning_rate": 4.676221418808295e-06, "loss": 0.0003, "reward": 1.0989583507180214, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.09895833767950535, "rewards/format_reward_func": 1.0, "step": 170 }, { "completion_length": 48.937501192092896, "epoch": 0.09173333333333333, "grad_norm": 0.002984263788533099, "kl": 0.3480224609375, "learning_rate": 4.667309139173879e-06, "loss": 0.0003, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 172 }, { "completion_length": 48.494793176651, "epoch": 0.0928, "grad_norm": 0.0014747458915803722, "kl": 0.34033203125, "learning_rate": 4.658284593025617e-06, "loss": 0.0003, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 174 }, { "completion_length": 48.398438930511475, "epoch": 0.09386666666666667, "grad_norm": 0.05681792101029396, "kl": 0.31201171875, "learning_rate": 4.6491482478345836e-06, "loss": 0.0003, "reward": 1.1223958507180214, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.12239583767950535, "rewards/format_reward_func": 1.0, "step": 176 }, { "completion_length": 49.062501192092896, "epoch": 0.09493333333333333, "grad_norm": 0.0016466184211370732, "kl": 0.3016357421875, "learning_rate": 4.6399005768630425e-06, "loss": 0.0003, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 178 }, { "completion_length": 47.05989742279053, "epoch": 0.096, "grad_norm": 0.0028282549670083116, "kl": 0.327880859375, "learning_rate": 4.630542059139923e-06, "loss": 0.0003, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 180 }, { "completion_length": 46.96614718437195, "epoch": 0.09706666666666666, "grad_norm": 20330.95377551581, "kl": 38784.28649902344, "learning_rate": 4.621073179436015e-06, "loss": 38.8858, "reward": 1.1015625074505806, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.10156250186264515, "rewards/format_reward_func": 1.0, "step": 182 }, { "completion_length": 46.01302242279053, "epoch": 0.09813333333333334, "grad_norm": 0.0026288677823797482, "kl": 0.3526611328125, "learning_rate": 4.611494428238851e-06, "loss": 0.0004, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 184 }, { "completion_length": 46.55208420753479, "epoch": 0.0992, "grad_norm": 0.0018890793009976863, "kl": 0.3076171875, "learning_rate": 4.601806301727303e-06, "loss": 0.0003, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 186 }, { "completion_length": 45.718751192092896, "epoch": 0.10026666666666667, "grad_norm": 0.001705003845255798, "kl": 0.3077392578125, "learning_rate": 4.592009301745879e-06, "loss": 0.0003, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 188 }, { "completion_length": 46.500001430511475, "epoch": 0.10133333333333333, "grad_norm": 0.0018901714393887129, "kl": 0.302001953125, "learning_rate": 4.582103935778728e-06, "loss": 0.0003, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 190 }, { "completion_length": 46.44270944595337, "epoch": 0.1024, "grad_norm": 0.0027744469044849508, "kl": 0.3707275390625, "learning_rate": 4.572090716923354e-06, "loss": 0.0004, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 192 }, { "completion_length": 47.45052218437195, "epoch": 0.10346666666666667, "grad_norm": 0.0017930832653982255, "kl": 0.3076171875, "learning_rate": 4.561970163864031e-06, "loss": 0.0003, "reward": 1.104166679084301, "reward_std": 0.0, "rewards/equation_reward_func": 0.10416666977107525, "rewards/format_reward_func": 1.0, "step": 194 }, { "completion_length": 47.179689168930054, "epoch": 0.10453333333333334, "grad_norm": 0.043886529178630944, "kl": 0.3211669921875, "learning_rate": 4.5517428008449435e-06, "loss": 0.0003, "reward": 1.080729179084301, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.9973958358168602, "step": 196 }, { "completion_length": 45.908855676651, "epoch": 0.1056, "grad_norm": 0.0016311471325742954, "kl": 0.4903564453125, "learning_rate": 4.541409157643027e-06, "loss": 0.0005, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 198 }, { "completion_length": 48.289063453674316, "epoch": 0.10666666666666667, "grad_norm": 0.1406312782184638, "kl": 0.9571533203125, "learning_rate": 4.530969769540525e-06, "loss": 0.001, "reward": 1.0833333432674408, "reward_std": 0.0, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.0, "step": 200 }, { "completion_length": 48.70573019981384, "epoch": 0.10773333333333333, "grad_norm": 0.002094273750756197, "kl": 0.2982177734375, "learning_rate": 4.5204251772972596e-06, "loss": 0.0003, "reward": 1.0625000074505806, "reward_std": 0.0, "rewards/equation_reward_func": 0.06250000186264515, "rewards/format_reward_func": 1.0, "step": 202 }, { "completion_length": 49.73177242279053, "epoch": 0.1088, "grad_norm": 0.002034042170668089, "kl": 0.29437255859375, "learning_rate": 4.509775927122626e-06, "loss": 0.0003, "reward": 1.161458358168602, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.16145833837799728, "rewards/format_reward_func": 1.0, "step": 204 }, { "completion_length": 52.69270944595337, "epoch": 0.10986666666666667, "grad_norm": 0.07554276875742744, "kl": 0.32891845703125, "learning_rate": 4.499022570647292e-06, "loss": 0.0003, "reward": 1.0338541716337204, "reward_std": 0.026653615292161703, "rewards/equation_reward_func": 0.033854166977107525, "rewards/format_reward_func": 1.0, "step": 206 }, { "completion_length": 57.23177218437195, "epoch": 0.11093333333333333, "grad_norm": 0.07366971404963872, "kl": 0.27972412109375, "learning_rate": 4.488165664894632e-06, "loss": 0.0003, "reward": 1.0937500074505806, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.09375000186264515, "rewards/format_reward_func": 1.0, "step": 208 }, { "completion_length": 62.41927218437195, "epoch": 0.112, "grad_norm": 0.08312861851514726, "kl": 0.2943115234375, "learning_rate": 4.477205772251865e-06, "loss": 0.0003, "reward": 1.0494791865348816, "reward_std": 0.04138500662520528, "rewards/equation_reward_func": 0.049479167675599456, "rewards/format_reward_func": 1.0, "step": 210 }, { "completion_length": 66.20312738418579, "epoch": 0.11306666666666666, "grad_norm": 0.07208386359811411, "kl": 0.2977294921875, "learning_rate": 4.466143460440924e-06, "loss": 0.0003, "reward": 1.0937500298023224, "reward_std": 0.0814511370845139, "rewards/equation_reward_func": 0.09375000395812094, "rewards/format_reward_func": 1.0, "step": 212 }, { "completion_length": 65.04687714576721, "epoch": 0.11413333333333334, "grad_norm": 0.1018091484539782, "kl": 0.30206298828125, "learning_rate": 4.454979302489053e-06, "loss": 0.0003, "reward": 1.0807291939854622, "reward_std": 0.10187737224623561, "rewards/equation_reward_func": 0.08072916860692203, "rewards/format_reward_func": 1.0, "step": 214 }, { "completion_length": 61.60677242279053, "epoch": 0.1152, "grad_norm": 0.1827519017211342, "kl": 0.27130126953125, "learning_rate": 4.443713876699124e-06, "loss": 0.0003, "reward": 1.0807292014360428, "reward_std": 0.08540025493130088, "rewards/equation_reward_func": 0.08072917046956718, "rewards/format_reward_func": 1.0, "step": 216 }, { "completion_length": 61.187501430511475, "epoch": 0.11626666666666667, "grad_norm": 0.31137525938223165, "kl": 0.29693603515625, "learning_rate": 4.432347766619672e-06, "loss": 0.0003, "reward": 1.156250037252903, "reward_std": 0.17447089031338692, "rewards/equation_reward_func": 0.15625000279396772, "rewards/format_reward_func": 1.0, "step": 218 }, { "completion_length": 58.31770968437195, "epoch": 0.11733333333333333, "grad_norm": 0.08813120770560309, "kl": 0.3250732421875, "learning_rate": 4.42088156101468e-06, "loss": 0.0003, "reward": 1.0703125149011612, "reward_std": 0.08793068304657936, "rewards/equation_reward_func": 0.07031250093132257, "rewards/format_reward_func": 1.0, "step": 220 }, { "completion_length": 52.695313930511475, "epoch": 0.1184, "grad_norm": 0.11887187061910455, "kl": 0.2825927734375, "learning_rate": 4.409315853833068e-06, "loss": 0.0003, "reward": 1.1328125223517418, "reward_std": 0.14002047991380095, "rewards/equation_reward_func": 0.13281250302679837, "rewards/format_reward_func": 1.0, "step": 222 }, { "completion_length": 49.16145944595337, "epoch": 0.11946666666666667, "grad_norm": 0.09162669463895588, "kl": 0.2979736328125, "learning_rate": 4.397651244177939e-06, "loss": 0.0003, "reward": 1.177083358168602, "reward_std": 0.1649224916473031, "rewards/equation_reward_func": 0.17968750419095159, "rewards/format_reward_func": 0.9973958358168602, "step": 224 }, { "completion_length": 47.359376430511475, "epoch": 0.12053333333333334, "grad_norm": 0.10881964423253246, "kl": 0.26776123046875, "learning_rate": 4.385888336275538e-06, "loss": 0.0003, "reward": 1.083333358168602, "reward_std": 0.12230360461398959, "rewards/equation_reward_func": 0.08593750209547579, "rewards/format_reward_func": 0.9973958358168602, "step": 226 }, { "completion_length": 46.234376430511475, "epoch": 0.1216, "grad_norm": 0.09601658954970464, "kl": 0.3140869140625, "learning_rate": 4.374027739443953e-06, "loss": 0.0003, "reward": 1.1276042088866234, "reward_std": 0.13440475752577186, "rewards/equation_reward_func": 0.12760417140088975, "rewards/format_reward_func": 1.0, "step": 228 }, { "completion_length": 44.61458468437195, "epoch": 0.12266666666666666, "grad_norm": 0.0810341552877944, "kl": 0.3056640625, "learning_rate": 4.362070068061553e-06, "loss": 0.0003, "reward": 1.1119791865348816, "reward_std": 0.10380202671512961, "rewards/equation_reward_func": 0.11197916907258332, "rewards/format_reward_func": 1.0, "step": 230 }, { "completion_length": 44.27864694595337, "epoch": 0.12373333333333333, "grad_norm": 0.0632426004395647, "kl": 0.3031005859375, "learning_rate": 4.35001594153517e-06, "loss": 0.0003, "reward": 1.1666666939854622, "reward_std": 0.12650488875806332, "rewards/equation_reward_func": 0.1666666711680591, "rewards/format_reward_func": 1.0, "step": 232 }, { "completion_length": 45.15364718437195, "epoch": 0.1248, "grad_norm": 0.09667195525260472, "kl": 0.323486328125, "learning_rate": 4.337865984268002e-06, "loss": 0.0003, "reward": 1.138020858168602, "reward_std": 0.1345778051763773, "rewards/equation_reward_func": 0.13802083861082792, "rewards/format_reward_func": 1.0, "step": 234 }, { "completion_length": 45.55208468437195, "epoch": 0.12586666666666665, "grad_norm": 0.0551554728284556, "kl": 0.2943115234375, "learning_rate": 4.325620825627277e-06, "loss": 0.0003, "reward": 1.1510416939854622, "reward_std": 0.1128385765478015, "rewards/equation_reward_func": 0.15104167256504297, "rewards/format_reward_func": 1.0, "step": 236 }, { "completion_length": 46.72656440734863, "epoch": 0.12693333333333334, "grad_norm": 0.056037581108344996, "kl": 0.3001708984375, "learning_rate": 4.313281099911651e-06, "loss": 0.0003, "reward": 1.0937500223517418, "reward_std": 0.06181124225258827, "rewards/equation_reward_func": 0.09375000349245965, "rewards/format_reward_func": 1.0, "step": 238 }, { "completion_length": 44.070313692092896, "epoch": 0.128, "grad_norm": 0.04897805918105704, "kl": 0.3748779296875, "learning_rate": 4.3008474463183505e-06, "loss": 0.0004, "reward": 1.164062537252903, "reward_std": 0.046472438145428896, "rewards/equation_reward_func": 0.1666666737291962, "rewards/format_reward_func": 0.9973958358168602, "step": 240 }, { "completion_length": 46.11979269981384, "epoch": 0.12906666666666666, "grad_norm": 0.05861551115208267, "kl": 0.317626953125, "learning_rate": 4.288320508910058e-06, "loss": 0.0003, "reward": 1.1197917014360428, "reward_std": 0.09372697165235877, "rewards/equation_reward_func": 0.11979167186655104, "rewards/format_reward_func": 1.0, "step": 242 }, { "completion_length": 45.69270968437195, "epoch": 0.13013333333333332, "grad_norm": 0.10621871838409014, "kl": 0.329345703125, "learning_rate": 4.275700936581557e-06, "loss": 0.0003, "reward": 1.1197916939854622, "reward_std": 0.10757221281528473, "rewards/equation_reward_func": 0.1197916716337204, "rewards/format_reward_func": 1.0, "step": 244 }, { "completion_length": 45.23698043823242, "epoch": 0.1312, "grad_norm": 0.07197049946857388, "kl": 0.3629150390625, "learning_rate": 4.262989383026115e-06, "loss": 0.0004, "reward": 1.1015625149011612, "reward_std": 0.1054728776216507, "rewards/equation_reward_func": 0.10156250232830644, "rewards/format_reward_func": 1.0, "step": 246 }, { "completion_length": 50.210938692092896, "epoch": 0.13226666666666667, "grad_norm": 0.08629207099770442, "kl": 0.4293212890625, "learning_rate": 4.25018650670162e-06, "loss": 0.0004, "reward": 1.153645858168602, "reward_std": 0.0888168322853744, "rewards/equation_reward_func": 0.1562500037252903, "rewards/format_reward_func": 0.9973958358168602, "step": 248 }, { "completion_length": 47.13541793823242, "epoch": 0.13333333333333333, "grad_norm": 0.07872159558644704, "kl": 0.5299072265625, "learning_rate": 4.23729297079648e-06, "loss": 0.0005, "reward": 1.1640625223517418, "reward_std": 0.10678587714210153, "rewards/equation_reward_func": 0.16406250349245965, "rewards/format_reward_func": 1.0, "step": 250 }, { "completion_length": 45.76041793823242, "epoch": 0.1344, "grad_norm": 0.0861462860147158, "kl": 0.591552734375, "learning_rate": 4.224309443195261e-06, "loss": 0.0006, "reward": 1.1666667014360428, "reward_std": 0.11336686601862311, "rewards/equation_reward_func": 0.16666667186655104, "rewards/format_reward_func": 1.0, "step": 252 }, { "completion_length": 48.835939168930054, "epoch": 0.13546666666666668, "grad_norm": 0.07905779829816866, "kl": 0.644775390625, "learning_rate": 4.211236596444097e-06, "loss": 0.0006, "reward": 1.1197917014360428, "reward_std": 0.07540268264710903, "rewards/equation_reward_func": 0.1223958374466747, "rewards/format_reward_func": 0.9973958358168602, "step": 254 }, { "completion_length": 50.36198019981384, "epoch": 0.13653333333333334, "grad_norm": 0.05805525649082006, "kl": 0.71630859375, "learning_rate": 4.198075107715849e-06, "loss": 0.0007, "reward": 1.1901041865348816, "reward_std": 0.0907414872199297, "rewards/equation_reward_func": 0.1927083390764892, "rewards/format_reward_func": 0.9973958358168602, "step": 256 }, { "completion_length": 63.192710161209106, "epoch": 0.1376, "grad_norm": 0.11266596946901891, "kl": 0.860595703125, "learning_rate": 4.184825658775027e-06, "loss": 0.0009, "reward": 1.1432292200624943, "reward_std": 0.1915695248171687, "rewards/equation_reward_func": 0.15885417046956718, "rewards/format_reward_func": 0.9843750111758709, "step": 258 }, { "completion_length": 50.47916841506958, "epoch": 0.13866666666666666, "grad_norm": 0.08511878573204953, "kl": 0.755126953125, "learning_rate": 4.17148893594248e-06, "loss": 0.0008, "reward": 1.1562500223517418, "reward_std": 0.12092530494555831, "rewards/equation_reward_func": 0.15885417256504297, "rewards/format_reward_func": 0.9973958358168602, "step": 260 }, { "completion_length": 47.22916793823242, "epoch": 0.13973333333333332, "grad_norm": 0.09937826534016155, "kl": 0.80322265625, "learning_rate": 4.158065630059838e-06, "loss": 0.0008, "reward": 1.1354166865348816, "reward_std": 0.08767851442098618, "rewards/equation_reward_func": 0.13541667256504297, "rewards/format_reward_func": 1.0, "step": 262 }, { "completion_length": 46.843751430511475, "epoch": 0.1408, "grad_norm": 0.08803845957934865, "kl": 0.826416015625, "learning_rate": 4.144556436453727e-06, "loss": 0.0008, "reward": 1.161458358168602, "reward_std": 0.07013632403686643, "rewards/equation_reward_func": 0.16145833861082792, "rewards/format_reward_func": 1.0, "step": 264 }, { "completion_length": 47.11458468437195, "epoch": 0.14186666666666667, "grad_norm": 0.11747054375698676, "kl": 0.8515625, "learning_rate": 4.130962054899756e-06, "loss": 0.0009, "reward": 1.2578125149011612, "reward_std": 0.09618416707962751, "rewards/equation_reward_func": 0.257812506519258, "rewards/format_reward_func": 1.0, "step": 266 }, { "completion_length": 49.20052194595337, "epoch": 0.14293333333333333, "grad_norm": 0.08739632969014637, "kl": 1.005859375, "learning_rate": 4.117283189586266e-06, "loss": 0.001, "reward": 1.1640625149011612, "reward_std": 0.06803698698058724, "rewards/equation_reward_func": 0.1640625037252903, "rewards/format_reward_func": 1.0, "step": 268 }, { "completion_length": 63.40364718437195, "epoch": 0.144, "grad_norm": 0.12835446561536284, "kl": 1.095703125, "learning_rate": 4.1035205490778505e-06, "loss": 0.0011, "reward": 1.1328125335276127, "reward_std": 0.13660651305690408, "rewards/equation_reward_func": 0.14843750651925802, "rewards/format_reward_func": 0.9843750111758709, "step": 270 }, { "completion_length": 65.80208563804626, "epoch": 0.14506666666666668, "grad_norm": 0.0826270243396572, "kl": 1.17822265625, "learning_rate": 4.0896748462786565e-06, "loss": 0.0012, "reward": 1.0807292014360428, "reward_std": 0.13301100628450513, "rewards/equation_reward_func": 0.09375000395812094, "rewards/format_reward_func": 0.986979179084301, "step": 272 }, { "completion_length": 46.85677242279053, "epoch": 0.14613333333333334, "grad_norm": 0.003531358069143052, "kl": 1.119140625, "learning_rate": 4.075746798395452e-06, "loss": 0.0011, "reward": 1.1015625149011612, "reward_std": 0.021918159909546375, "rewards/equation_reward_func": 0.10156250279396772, "rewards/format_reward_func": 1.0, "step": 274 }, { "completion_length": 48.16145944595337, "epoch": 0.1472, "grad_norm": 0.0532767760750055, "kl": 1.0732421875, "learning_rate": 4.061737126900479e-06, "loss": 0.0011, "reward": 1.1692708656191826, "reward_std": 0.0888168322853744, "rewards/equation_reward_func": 0.16927083884365857, "rewards/format_reward_func": 1.0, "step": 276 }, { "completion_length": 47.273438930511475, "epoch": 0.14826666666666666, "grad_norm": 0.08587954791502528, "kl": 1.099853515625, "learning_rate": 4.047646557494076e-06, "loss": 0.0011, "reward": 1.1458333656191826, "reward_std": 0.06557979574427009, "rewards/equation_reward_func": 0.14583333930931985, "rewards/format_reward_func": 1.0, "step": 278 }, { "completion_length": 48.96354293823242, "epoch": 0.14933333333333335, "grad_norm": 0.04847297045942927, "kl": 1.12841796875, "learning_rate": 4.033475820067091e-06, "loss": 0.0011, "reward": 1.1953125298023224, "reward_std": 0.06216485192999244, "rewards/equation_reward_func": 0.19531250558793545, "rewards/format_reward_func": 1.0, "step": 280 }, { "completion_length": 48.062500953674316, "epoch": 0.1504, "grad_norm": 0.1008526819813902, "kl": 1.03271484375, "learning_rate": 4.019225648663073e-06, "loss": 0.001, "reward": 1.286458358168602, "reward_std": 0.0621632169932127, "rewards/equation_reward_func": 0.28645834024064243, "rewards/format_reward_func": 1.0, "step": 282 }, { "completion_length": 50.210938692092896, "epoch": 0.15146666666666667, "grad_norm": 0.08137421195984214, "kl": 1.025146484375, "learning_rate": 4.004896781440244e-06, "loss": 0.001, "reward": 1.1588541865348816, "reward_std": 0.028930244501680136, "rewards/equation_reward_func": 0.1588541716337204, "rewards/format_reward_func": 1.0, "step": 284 }, { "completion_length": 51.11979269981384, "epoch": 0.15253333333333333, "grad_norm": 0.0329162794136622, "kl": 1.053955078125, "learning_rate": 3.990489960633271e-06, "loss": 0.0011, "reward": 1.1953125223517418, "reward_std": 0.044801585376262665, "rewards/equation_reward_func": 0.1953125058207661, "rewards/format_reward_func": 1.0, "step": 286 }, { "completion_length": 50.98177218437195, "epoch": 0.1536, "grad_norm": 0.11109481476714739, "kl": 1.033203125, "learning_rate": 3.976005932514807e-06, "loss": 0.001, "reward": 1.2526041865348816, "reward_std": 0.06970523158088326, "rewards/equation_reward_func": 0.252604172565043, "rewards/format_reward_func": 1.0, "step": 288 }, { "completion_length": 47.09635543823242, "epoch": 0.15466666666666667, "grad_norm": 0.10467692461476164, "kl": 1.087158203125, "learning_rate": 3.961445447356844e-06, "loss": 0.0011, "reward": 1.2552083730697632, "reward_std": 0.05216728104278445, "rewards/equation_reward_func": 0.2552083416376263, "rewards/format_reward_func": 1.0, "step": 290 }, { "completion_length": 46.20052218437195, "epoch": 0.15573333333333333, "grad_norm": 0.08554298248685681, "kl": 1.19091796875, "learning_rate": 3.946809259391846e-06, "loss": 0.0012, "reward": 1.208333358168602, "reward_std": 0.03910674247890711, "rewards/equation_reward_func": 0.2083333390764892, "rewards/format_reward_func": 1.0, "step": 292 }, { "completion_length": 45.83333492279053, "epoch": 0.1568, "grad_norm": 0.04983199249728075, "kl": 1.10546875, "learning_rate": 3.932098126773675e-06, "loss": 0.0011, "reward": 1.0937500074505806, "reward_std": 0.04971172474324703, "rewards/equation_reward_func": 0.09375000279396772, "rewards/format_reward_func": 1.0, "step": 294 }, { "completion_length": 47.31510543823242, "epoch": 0.15786666666666666, "grad_norm": 0.07476817116518496, "kl": 1.206787109375, "learning_rate": 3.917312811538326e-06, "loss": 0.0012, "reward": 1.169270858168602, "reward_std": 0.037789500784128904, "rewards/equation_reward_func": 0.1692708374466747, "rewards/format_reward_func": 1.0, "step": 296 }, { "completion_length": 46.94791793823242, "epoch": 0.15893333333333334, "grad_norm": 0.04603483813614701, "kl": 1.3662109375, "learning_rate": 3.902454079564447e-06, "loss": 0.0014, "reward": 1.1119791939854622, "reward_std": 0.040245057083666325, "rewards/equation_reward_func": 0.11197917233221233, "rewards/format_reward_func": 1.0, "step": 298 }, { "completion_length": 48.28645920753479, "epoch": 0.16, "grad_norm": 0.11474964967614432, "kl": 1.095703125, "learning_rate": 3.887522700533675e-06, "loss": 0.0011, "reward": 1.1276041939854622, "reward_std": 0.04988901689648628, "rewards/equation_reward_func": 0.12760417233221233, "rewards/format_reward_func": 1.0, "step": 300 }, { "completion_length": 48.58073019981384, "epoch": 0.16106666666666666, "grad_norm": 0.08748036183499752, "kl": 1.076171875, "learning_rate": 3.872519447890756e-06, "loss": 0.0011, "reward": 1.1510416939854622, "reward_std": 0.058567711152136326, "rewards/equation_reward_func": 0.15104167023673654, "rewards/format_reward_func": 1.0, "step": 302 }, { "completion_length": 48.265625953674316, "epoch": 0.16213333333333332, "grad_norm": 0.05330672359247684, "kl": 1.1533203125, "learning_rate": 3.8574450988034874e-06, "loss": 0.0012, "reward": 1.1223958507180214, "reward_std": 0.055937470868229866, "rewards/equation_reward_func": 0.12239583511836827, "rewards/format_reward_func": 1.0, "step": 304 }, { "completion_length": 48.43489694595337, "epoch": 0.1632, "grad_norm": 0.05752159507291927, "kl": 1.028076171875, "learning_rate": 3.84230043412246e-06, "loss": 0.001, "reward": 1.2291666939854622, "reward_std": 0.019287919625639915, "rewards/equation_reward_func": 0.22916667303070426, "rewards/format_reward_func": 1.0, "step": 306 }, { "completion_length": 46.84635591506958, "epoch": 0.16426666666666667, "grad_norm": 0.06802192895095636, "kl": 0.97119140625, "learning_rate": 3.8270862383406085e-06, "loss": 0.001, "reward": 1.145833358168602, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.14583333721384406, "rewards/format_reward_func": 1.0, "step": 308 }, { "completion_length": 45.570313453674316, "epoch": 0.16533333333333333, "grad_norm": 0.056616891684726535, "kl": 0.9365234375, "learning_rate": 3.811803299552575e-06, "loss": 0.0009, "reward": 1.1067708507180214, "reward_std": 0.021918159909546375, "rewards/equation_reward_func": 0.10677083674818277, "rewards/format_reward_func": 1.0, "step": 310 }, { "completion_length": 44.890625953674316, "epoch": 0.1664, "grad_norm": 0.1201361909505056, "kl": 0.87939453125, "learning_rate": 3.796452409413887e-06, "loss": 0.0009, "reward": 1.1875000298023224, "reward_std": 0.09732247563079, "rewards/equation_reward_func": 0.18750000628642738, "rewards/format_reward_func": 1.0, "step": 312 }, { "completion_length": 46.81510543823242, "epoch": 0.16746666666666668, "grad_norm": 0.15475824812584402, "kl": 0.875732421875, "learning_rate": 3.781034363099949e-06, "loss": 0.0009, "reward": 1.0963541865348816, "reward_std": 0.06785805989056826, "rewards/equation_reward_func": 0.09895833558402956, "rewards/format_reward_func": 0.9973958358168602, "step": 314 }, { "completion_length": 45.52083444595337, "epoch": 0.16853333333333334, "grad_norm": 0.11133793430533863, "kl": 0.84521484375, "learning_rate": 3.7655499592648514e-06, "loss": 0.0008, "reward": 1.1276042014360428, "reward_std": 0.06611233251169324, "rewards/equation_reward_func": 0.1302083390764892, "rewards/format_reward_func": 0.9973958358168602, "step": 316 }, { "completion_length": 46.101563692092896, "epoch": 0.1696, "grad_norm": 0.14231952655893862, "kl": 0.76171875, "learning_rate": 3.7500000000000005e-06, "loss": 0.0008, "reward": 1.1848958507180214, "reward_std": 0.09504421381279826, "rewards/equation_reward_func": 0.19010417046956718, "rewards/format_reward_func": 0.9947916716337204, "step": 318 }, { "completion_length": 48.25260543823242, "epoch": 0.17066666666666666, "grad_norm": 0.09375566429352347, "kl": 0.767578125, "learning_rate": 3.7343852907925734e-06, "loss": 0.0008, "reward": 1.1953125298023224, "reward_std": 0.08540025353431702, "rewards/equation_reward_func": 0.20052083814516664, "rewards/format_reward_func": 0.9947916716337204, "step": 320 }, { "completion_length": 45.351563692092896, "epoch": 0.17173333333333332, "grad_norm": 0.04419342373186932, "kl": 0.7607421875, "learning_rate": 3.7187066404837892e-06, "loss": 0.0008, "reward": 1.1380208507180214, "reward_std": 0.03682847833260894, "rewards/equation_reward_func": 0.14062500558793545, "rewards/format_reward_func": 0.9973958358168602, "step": 322 }, { "completion_length": 45.37239742279053, "epoch": 0.1728, "grad_norm": 0.06455285487511006, "kl": 0.917724609375, "learning_rate": 3.702964861227013e-06, "loss": 0.0009, "reward": 1.1197916865348816, "reward_std": 0.03629594016820192, "rewards/equation_reward_func": 0.1197916716337204, "rewards/format_reward_func": 1.0, "step": 324 }, { "completion_length": 48.289063930511475, "epoch": 0.17386666666666667, "grad_norm": 0.07212276737740847, "kl": 0.838134765625, "learning_rate": 3.6871607684456884e-06, "loss": 0.0008, "reward": 1.0833333507180214, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.08333333535119891, "rewards/format_reward_func": 1.0, "step": 326 }, { "completion_length": 46.41666841506958, "epoch": 0.17493333333333333, "grad_norm": 0.07344186263607759, "kl": 0.86865234375, "learning_rate": 3.6712951807910942e-06, "loss": 0.0009, "reward": 1.1484375223517418, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.14843750558793545, "rewards/format_reward_func": 1.0, "step": 328 }, { "completion_length": 46.757813453674316, "epoch": 0.176, "grad_norm": 0.11402292693483036, "kl": 0.821044921875, "learning_rate": 3.655368920099943e-06, "loss": 0.0008, "reward": 1.153645858168602, "reward_std": 0.06251682806760073, "rewards/equation_reward_func": 0.1536458374466747, "rewards/format_reward_func": 1.0, "step": 330 }, { "completion_length": 45.375001430511475, "epoch": 0.17706666666666668, "grad_norm": 0.050851734182512064, "kl": 0.917724609375, "learning_rate": 3.6393828113518065e-06, "loss": 0.0009, "reward": 1.0859375149011612, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.08593750256113708, "rewards/format_reward_func": 1.0, "step": 332 }, { "completion_length": 45.65885519981384, "epoch": 0.17813333333333334, "grad_norm": 0.15494400741248782, "kl": 0.819091796875, "learning_rate": 3.623337682626383e-06, "loss": 0.0008, "reward": 1.1119791865348816, "reward_std": 0.06013875314965844, "rewards/equation_reward_func": 0.11197917000390589, "rewards/format_reward_func": 1.0, "step": 334 }, { "completion_length": 46.70833396911621, "epoch": 0.1792, "grad_norm": 0.10083785138579399, "kl": 0.848876953125, "learning_rate": 3.6072343650606044e-06, "loss": 0.0008, "reward": 1.1770833730697632, "reward_std": 0.07821348635479808, "rewards/equation_reward_func": 0.17708333977498114, "rewards/format_reward_func": 1.0, "step": 336 }, { "completion_length": 46.72395920753479, "epoch": 0.18026666666666666, "grad_norm": 0.06577677701365688, "kl": 0.734375, "learning_rate": 3.5910736928055814e-06, "loss": 0.0007, "reward": 1.145833358168602, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.145833337912336, "rewards/format_reward_func": 1.0, "step": 338 }, { "completion_length": 47.065104961395264, "epoch": 0.18133333333333335, "grad_norm": 0.11945834183328498, "kl": 0.7041015625, "learning_rate": 3.574856502983392e-06, "loss": 0.0007, "reward": 1.1197917014360428, "reward_std": 0.09127402352169156, "rewards/equation_reward_func": 0.11979167209938169, "rewards/format_reward_func": 1.0, "step": 340 }, { "completion_length": 45.328125953674316, "epoch": 0.1824, "grad_norm": 0.14189639396994833, "kl": 0.665283203125, "learning_rate": 3.5585836356437266e-06, "loss": 0.0007, "reward": 1.1223958507180214, "reward_std": 0.07408544234931469, "rewards/equation_reward_func": 0.12239583535119891, "rewards/format_reward_func": 1.0, "step": 342 }, { "completion_length": 47.52083492279053, "epoch": 0.18346666666666667, "grad_norm": 0.09861554940896551, "kl": 0.684814453125, "learning_rate": 3.542255933720363e-06, "loss": 0.0007, "reward": 1.1875000298023224, "reward_std": 0.10977136204019189, "rewards/equation_reward_func": 0.1875000037252903, "rewards/format_reward_func": 1.0, "step": 344 }, { "completion_length": 50.190105676651, "epoch": 0.18453333333333333, "grad_norm": 0.10466423598229567, "kl": 0.747314453125, "learning_rate": 3.5258742429875137e-06, "loss": 0.0007, "reward": 1.106770858168602, "reward_std": 0.046472438145428896, "rewards/equation_reward_func": 0.10677083861082792, "rewards/format_reward_func": 1.0, "step": 346 }, { "completion_length": 51.804688453674316, "epoch": 0.1856, "grad_norm": 0.07539317603710921, "kl": 0.875244140625, "learning_rate": 3.5094394120160047e-06, "loss": 0.0009, "reward": 1.0703125149011612, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.07291666977107525, "rewards/format_reward_func": 0.9973958358168602, "step": 348 }, { "completion_length": 48.572918176651, "epoch": 0.18666666666666668, "grad_norm": 0.06386551709212408, "kl": 0.939697265625, "learning_rate": 3.4929522921293246e-06, "loss": 0.0009, "reward": 1.2161458730697632, "reward_std": 0.05479752132669091, "rewards/equation_reward_func": 0.21614584233611822, "rewards/format_reward_func": 1.0, "step": 350 }, { "completion_length": 48.75520968437195, "epoch": 0.18773333333333334, "grad_norm": 0.11246311442332814, "kl": 1.00244140625, "learning_rate": 3.4764137373595274e-06, "loss": 0.001, "reward": 1.161458358168602, "reward_std": 0.04515519551932812, "rewards/equation_reward_func": 0.16145834140479565, "rewards/format_reward_func": 1.0, "step": 352 }, { "completion_length": 48.398438930511475, "epoch": 0.1888, "grad_norm": 0.07890096191201901, "kl": 1.093017578125, "learning_rate": 3.459824604402991e-06, "loss": 0.0011, "reward": 1.1197916939854622, "reward_std": 0.035511236637830734, "rewards/equation_reward_func": 0.11979167093522847, "rewards/format_reward_func": 1.0, "step": 354 }, { "completion_length": 47.835938692092896, "epoch": 0.18986666666666666, "grad_norm": 0.06423226939830316, "kl": 1.077880859375, "learning_rate": 3.4431857525760386e-06, "loss": 0.0011, "reward": 1.1640625223517418, "reward_std": 0.03664955263957381, "rewards/equation_reward_func": 0.16406250465661287, "rewards/format_reward_func": 1.0, "step": 356 }, { "completion_length": 54.43489742279053, "epoch": 0.19093333333333334, "grad_norm": 0.06797874242778387, "kl": 1.06640625, "learning_rate": 3.426498043770432e-06, "loss": 0.0011, "reward": 1.1276041716337204, "reward_std": 0.044801585376262665, "rewards/equation_reward_func": 0.13281250186264515, "rewards/format_reward_func": 0.9947916679084301, "step": 358 }, { "completion_length": 48.24479293823242, "epoch": 0.192, "grad_norm": 0.13835664845190732, "kl": 1.509521484375, "learning_rate": 3.4097623424087196e-06, "loss": 0.0015, "reward": 1.1276041865348816, "reward_std": 0.03174104681238532, "rewards/equation_reward_func": 0.1276041711680591, "rewards/format_reward_func": 1.0, "step": 360 }, { "completion_length": 53.822918176651, "epoch": 0.19306666666666666, "grad_norm": 0.11967318300321697, "kl": 1.095703125, "learning_rate": 3.3929795153994627e-06, "loss": 0.0011, "reward": 1.1171875223517418, "reward_std": 0.06652782764285803, "rewards/equation_reward_func": 0.12239583651535213, "rewards/format_reward_func": 0.9947916716337204, "step": 362 }, { "completion_length": 49.307293176651, "epoch": 0.19413333333333332, "grad_norm": 0.11912470005927893, "kl": 1.06884765625, "learning_rate": 3.376150432092332e-06, "loss": 0.0011, "reward": 1.2213542014360428, "reward_std": 0.03515762556344271, "rewards/equation_reward_func": 0.22135417419485748, "rewards/format_reward_func": 1.0, "step": 364 }, { "completion_length": 54.33854341506958, "epoch": 0.1952, "grad_norm": 0.06482729759607722, "kl": 1.0517578125, "learning_rate": 3.3592759642330664e-06, "loss": 0.0011, "reward": 1.1666666865348816, "reward_std": 0.06318034511059523, "rewards/equation_reward_func": 0.17187500442378223, "rewards/format_reward_func": 0.9947916716337204, "step": 366 }, { "completion_length": 58.28385519981384, "epoch": 0.19626666666666667, "grad_norm": 0.06890758730782987, "kl": 1.041015625, "learning_rate": 3.3423569859183282e-06, "loss": 0.001, "reward": 1.2135416939854622, "reward_std": 0.058746638242155313, "rewards/equation_reward_func": 0.21875000838190317, "rewards/format_reward_func": 0.9947916716337204, "step": 368 }, { "completion_length": 57.500001192092896, "epoch": 0.19733333333333333, "grad_norm": 0.06854056424765637, "kl": 0.99658203125, "learning_rate": 3.325394373550416e-06, "loss": 0.001, "reward": 1.1015625223517418, "reward_std": 0.06652782764285803, "rewards/equation_reward_func": 0.10677083698101342, "rewards/format_reward_func": 0.9947916716337204, "step": 370 }, { "completion_length": 55.35416841506958, "epoch": 0.1984, "grad_norm": 0.11377627086258121, "kl": 1.08837890625, "learning_rate": 3.308389005791872e-06, "loss": 0.0011, "reward": 1.1406250223517418, "reward_std": 0.07654263218864799, "rewards/equation_reward_func": 0.14322917303070426, "rewards/format_reward_func": 0.9973958358168602, "step": 372 }, { "completion_length": 51.531251430511475, "epoch": 0.19946666666666665, "grad_norm": 0.087534558237461, "kl": 1.03076171875, "learning_rate": 3.2913417635199627e-06, "loss": 0.001, "reward": 1.1744791939854622, "reward_std": 0.06558143068104982, "rewards/equation_reward_func": 0.17447917209938169, "rewards/format_reward_func": 1.0, "step": 374 }, { "completion_length": 53.91145968437195, "epoch": 0.20053333333333334, "grad_norm": 0.09656096364285013, "kl": 1.001708984375, "learning_rate": 3.2742535297810576e-06, "loss": 0.001, "reward": 1.1328125149011612, "reward_std": 0.03857420431450009, "rewards/equation_reward_func": 0.13541666977107525, "rewards/format_reward_func": 0.9973958358168602, "step": 376 }, { "completion_length": 50.97395968437195, "epoch": 0.2016, "grad_norm": 0.0028318880559913286, "kl": 0.95751953125, "learning_rate": 3.257125189744877e-06, "loss": 0.001, "reward": 1.1067708432674408, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1067708358168602, "rewards/format_reward_func": 1.0, "step": 378 }, { "completion_length": 49.671876192092896, "epoch": 0.20266666666666666, "grad_norm": 0.03328256715848928, "kl": 0.921875, "learning_rate": 3.2399576306586496e-06, "loss": 0.0009, "reward": 1.1484375223517418, "reward_std": 0.046293511521071196, "rewards/equation_reward_func": 0.14843750465661287, "rewards/format_reward_func": 1.0, "step": 380 }, { "completion_length": 47.86718940734863, "epoch": 0.20373333333333332, "grad_norm": 0.004439679112135068, "kl": 0.908935546875, "learning_rate": 3.2227517418011463e-06, "loss": 0.0009, "reward": 1.1250000149011612, "reward_std": 0.0, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.0, "step": 382 }, { "completion_length": 48.19010591506958, "epoch": 0.2048, "grad_norm": 0.04976536544477015, "kl": 0.8896484375, "learning_rate": 3.2055084144366195e-06, "loss": 0.0009, "reward": 1.1718750223517418, "reward_std": 0.042523321229964495, "rewards/equation_reward_func": 0.17447917046956718, "rewards/format_reward_func": 0.9973958358168602, "step": 384 }, { "completion_length": 47.54166841506958, "epoch": 0.20586666666666667, "grad_norm": 0.0980752458345384, "kl": 0.85888671875, "learning_rate": 3.1882285417686353e-06, "loss": 0.0009, "reward": 1.1145833507180214, "reward_std": 0.07373183127492666, "rewards/equation_reward_func": 0.11718750256113708, "rewards/format_reward_func": 0.9973958358168602, "step": 386 }, { "completion_length": 47.34895968437195, "epoch": 0.20693333333333333, "grad_norm": 0.09867925876177443, "kl": 0.778076171875, "learning_rate": 3.1709130188938044e-06, "loss": 0.0008, "reward": 1.1979166865348816, "reward_std": 0.07206097710877657, "rewards/equation_reward_func": 0.1979166711680591, "rewards/format_reward_func": 1.0, "step": 388 }, { "completion_length": 47.47135519981384, "epoch": 0.208, "grad_norm": 0.08800570789980847, "kl": 0.789794921875, "learning_rate": 3.1535627427554144e-06, "loss": 0.0008, "reward": 1.2317708656191826, "reward_std": 0.055937470868229866, "rewards/equation_reward_func": 0.23177084000781178, "rewards/format_reward_func": 1.0, "step": 390 }, { "completion_length": 48.65104269981384, "epoch": 0.20906666666666668, "grad_norm": 0.027123570773499573, "kl": 0.8271484375, "learning_rate": 3.1361786120969735e-06, "loss": 0.0008, "reward": 1.0677083432674408, "reward_std": 0.031208508647978306, "rewards/equation_reward_func": 0.06770833535119891, "rewards/format_reward_func": 1.0, "step": 392 }, { "completion_length": 47.16145920753479, "epoch": 0.21013333333333334, "grad_norm": 0.10793612747614911, "kl": 0.826904296875, "learning_rate": 3.1187615274156513e-06, "loss": 0.0008, "reward": 1.2239583656191826, "reward_std": 0.0976744550280273, "rewards/equation_reward_func": 0.22656250838190317, "rewards/format_reward_func": 0.9973958358168602, "step": 394 }, { "completion_length": 49.22135543823242, "epoch": 0.2112, "grad_norm": 0.06373438709255157, "kl": 0.77783203125, "learning_rate": 3.1013123909156347e-06, "loss": 0.0008, "reward": 1.1979166865348816, "reward_std": 0.03287936141714454, "rewards/equation_reward_func": 0.19791667233221233, "rewards/format_reward_func": 1.0, "step": 396 }, { "completion_length": 47.226563930511475, "epoch": 0.21226666666666666, "grad_norm": 0.10287713344495177, "kl": 0.80908203125, "learning_rate": 3.0838321064613914e-06, "loss": 0.0008, "reward": 1.2239583656191826, "reward_std": 0.07654263218864799, "rewards/equation_reward_func": 0.22395833977498114, "rewards/format_reward_func": 1.0, "step": 398 }, { "completion_length": 49.10677242279053, "epoch": 0.21333333333333335, "grad_norm": 0.09564921006082781, "kl": 0.762451171875, "learning_rate": 3.0663215795308536e-06, "loss": 0.0008, "reward": 1.1822916865348816, "reward_std": 0.09020894719287753, "rewards/equation_reward_func": 0.1822916716337204, "rewards/format_reward_func": 1.0, "step": 400 }, { "completion_length": 49.59895992279053, "epoch": 0.2144, "grad_norm": 0.040922653114638094, "kl": 0.720947265625, "learning_rate": 3.048781717168513e-06, "loss": 0.0007, "reward": 1.1093750223517418, "reward_std": 0.05839466489851475, "rewards/equation_reward_func": 0.10937500442378223, "rewards/format_reward_func": 1.0, "step": 402 }, { "completion_length": 49.609376192092896, "epoch": 0.21546666666666667, "grad_norm": 0.10279002828949411, "kl": 0.6087646484375, "learning_rate": 3.031213427938432e-06, "loss": 0.0006, "reward": 1.1822916939854622, "reward_std": 0.09810718381777406, "rewards/equation_reward_func": 0.18229167233221233, "rewards/format_reward_func": 1.0, "step": 404 }, { "completion_length": 50.213542461395264, "epoch": 0.21653333333333333, "grad_norm": 0.06354596859795375, "kl": 0.5797119140625, "learning_rate": 3.013617621877188e-06, "loss": 0.0006, "reward": 1.127604179084301, "reward_std": 0.05085003934800625, "rewards/equation_reward_func": 0.13020833604969084, "rewards/format_reward_func": 0.9973958358168602, "step": 406 }, { "completion_length": 51.205729722976685, "epoch": 0.2176, "grad_norm": 0.10572767956540034, "kl": 0.5369873046875, "learning_rate": 2.9959952104467247e-06, "loss": 0.0005, "reward": 1.138020858168602, "reward_std": 0.08294469770044088, "rewards/equation_reward_func": 0.1380208362825215, "rewards/format_reward_func": 1.0, "step": 408 }, { "completion_length": 52.343751430511475, "epoch": 0.21866666666666668, "grad_norm": 0.08016091763460871, "kl": 0.5113525390625, "learning_rate": 2.978347106487146e-06, "loss": 0.0005, "reward": 1.1119791865348816, "reward_std": 0.044801585376262665, "rewards/equation_reward_func": 0.11197917000390589, "rewards/format_reward_func": 1.0, "step": 410 }, { "completion_length": 52.66927218437195, "epoch": 0.21973333333333334, "grad_norm": 0.08944092001462081, "kl": 0.43505859375, "learning_rate": 2.960674224169427e-06, "loss": 0.0004, "reward": 1.145833358168602, "reward_std": 0.06497401930391788, "rewards/equation_reward_func": 0.1458333353511989, "rewards/format_reward_func": 1.0, "step": 412 }, { "completion_length": 53.367188930511475, "epoch": 0.2208, "grad_norm": 0.16114935883839165, "kl": 0.49755859375, "learning_rate": 2.9429774789480576e-06, "loss": 0.0005, "reward": 1.2213542014360428, "reward_std": 0.14211818668991327, "rewards/equation_reward_func": 0.22135417303070426, "rewards/format_reward_func": 1.0, "step": 414 }, { "completion_length": 53.73958420753479, "epoch": 0.22186666666666666, "grad_norm": 0.10073464939760426, "kl": 0.4027099609375, "learning_rate": 2.9252577875136277e-06, "loss": 0.0004, "reward": 1.1458333656191826, "reward_std": 0.11283857375383377, "rewards/equation_reward_func": 0.1458333374466747, "rewards/format_reward_func": 1.0, "step": 416 }, { "completion_length": 55.32552242279053, "epoch": 0.22293333333333334, "grad_norm": 0.09818722889684572, "kl": 0.407958984375, "learning_rate": 2.9075160677453416e-06, "loss": 0.0004, "reward": 1.1822916865348816, "reward_std": 0.1137995976023376, "rewards/equation_reward_func": 0.18229167023673654, "rewards/format_reward_func": 1.0, "step": 418 }, { "completion_length": 54.58333492279053, "epoch": 0.224, "grad_norm": 0.08950810256956081, "kl": 0.451416015625, "learning_rate": 2.8897532386634663e-06, "loss": 0.0005, "reward": 1.1744792014360428, "reward_std": 0.1268584975041449, "rewards/equation_reward_func": 0.17447917466051877, "rewards/format_reward_func": 1.0, "step": 420 }, { "completion_length": 54.22395920753479, "epoch": 0.22506666666666666, "grad_norm": 0.08929457708172545, "kl": 0.515625, "learning_rate": 2.8719702203817334e-06, "loss": 0.0005, "reward": 1.1822917014360428, "reward_std": 0.0754026840440929, "rewards/equation_reward_func": 0.18229167466051877, "rewards/format_reward_func": 1.0, "step": 422 }, { "completion_length": 65.64843916893005, "epoch": 0.22613333333333333, "grad_norm": 0.060303658707177785, "kl": 0.5272216796875, "learning_rate": 2.8541679340596723e-06, "loss": 0.0005, "reward": 1.1666666939854622, "reward_std": 0.08259108755737543, "rewards/equation_reward_func": 0.16666667209938169, "rewards/format_reward_func": 1.0, "step": 424 }, { "completion_length": 126.20052576065063, "epoch": 0.2272, "grad_norm": 0.07480725251270064, "kl": 0.444580078125, "learning_rate": 2.836347301854897e-06, "loss": 0.0004, "reward": 1.1953125298023224, "reward_std": 0.09013571171090007, "rewards/equation_reward_func": 0.20052083861082792, "rewards/format_reward_func": 0.9947916716337204, "step": 426 }, { "completion_length": 104.59896183013916, "epoch": 0.22826666666666667, "grad_norm": 0.0813220595012641, "kl": 0.4412841796875, "learning_rate": 2.818509246875337e-06, "loss": 0.0004, "reward": 1.2213542014360428, "reward_std": 0.1228361432440579, "rewards/equation_reward_func": 0.22656250791624188, "rewards/format_reward_func": 0.9947916716337204, "step": 428 }, { "completion_length": 76.18750190734863, "epoch": 0.22933333333333333, "grad_norm": 0.047955628123028324, "kl": 0.507568359375, "learning_rate": 2.8006546931314197e-06, "loss": 0.0005, "reward": 1.1953125298023224, "reward_std": 0.0950442161411047, "rewards/equation_reward_func": 0.19531250675208867, "rewards/format_reward_func": 1.0, "step": 430 }, { "completion_length": 69.1197943687439, "epoch": 0.2304, "grad_norm": 0.13722505166284227, "kl": 0.51806640625, "learning_rate": 2.7827845654882112e-06, "loss": 0.0005, "reward": 1.1223958507180214, "reward_std": 0.07882089773193002, "rewards/equation_reward_func": 0.12239583814516664, "rewards/format_reward_func": 1.0, "step": 432 }, { "completion_length": 54.53385519981384, "epoch": 0.23146666666666665, "grad_norm": 0.10393252857101508, "kl": 0.4356689453125, "learning_rate": 2.7648997896175005e-06, "loss": 0.0004, "reward": 1.1901042014360428, "reward_std": 0.09846079349517822, "rewards/equation_reward_func": 0.1901041774544865, "rewards/format_reward_func": 1.0, "step": 434 }, { "completion_length": 54.312501430511475, "epoch": 0.23253333333333334, "grad_norm": 0.07726928035755769, "kl": 0.4515380859375, "learning_rate": 2.7470012919498567e-06, "loss": 0.0005, "reward": 1.1744792014360428, "reward_std": 0.05953297670930624, "rewards/equation_reward_func": 0.1744791737291962, "rewards/format_reward_func": 1.0, "step": 436 }, { "completion_length": 55.58854293823242, "epoch": 0.2336, "grad_norm": 0.04592833614282811, "kl": 0.4368896484375, "learning_rate": 2.729089999626637e-06, "loss": 0.0004, "reward": 1.2031250223517418, "reward_std": 0.05173455365002155, "rewards/equation_reward_func": 0.20312500349245965, "rewards/format_reward_func": 1.0, "step": 438 }, { "completion_length": 54.799480676651, "epoch": 0.23466666666666666, "grad_norm": 0.07084176314499496, "kl": 0.449951171875, "learning_rate": 2.7111668404519602e-06, "loss": 0.0005, "reward": 1.1302083432674408, "reward_std": 0.035511236637830734, "rewards/equation_reward_func": 0.13020833465270698, "rewards/format_reward_func": 1.0, "step": 440 }, { "completion_length": 56.242189168930054, "epoch": 0.23573333333333332, "grad_norm": 0.11692973907664804, "kl": 0.5537109375, "learning_rate": 2.6932327428446493e-06, "loss": 0.0006, "reward": 1.1718750298023224, "reward_std": 0.06330316653475165, "rewards/equation_reward_func": 0.17447917233221233, "rewards/format_reward_func": 0.9973958358168602, "step": 442 }, { "completion_length": 55.63541793823242, "epoch": 0.2368, "grad_norm": 0.09182446461115165, "kl": 0.4761962890625, "learning_rate": 2.675288635790135e-06, "loss": 0.0005, "reward": 1.2578125298023224, "reward_std": 0.07689624466001987, "rewards/equation_reward_func": 0.2604166741948575, "rewards/format_reward_func": 0.9973958358168602, "step": 444 }, { "completion_length": 58.875001192092896, "epoch": 0.23786666666666667, "grad_norm": 0.09045825573176301, "kl": 0.4979248046875, "learning_rate": 2.6573354487923402e-06, "loss": 0.0005, "reward": 1.174479179084301, "reward_std": 0.04366163583472371, "rewards/equation_reward_func": 0.1744791716337204, "rewards/format_reward_func": 1.0, "step": 446 }, { "completion_length": 57.67708420753479, "epoch": 0.23893333333333333, "grad_norm": 0.09647404437373373, "kl": 0.52587890625, "learning_rate": 2.639374111825526e-06, "loss": 0.0005, "reward": 1.1328125298023224, "reward_std": 0.061024902388453484, "rewards/equation_reward_func": 0.1328125053551048, "rewards/format_reward_func": 1.0, "step": 448 }, { "completion_length": 58.60677194595337, "epoch": 0.24, "grad_norm": 0.12020087676837644, "kl": 0.498779296875, "learning_rate": 2.6214055552861213e-06, "loss": 0.0005, "reward": 1.1901041939854622, "reward_std": 0.11319218622520566, "rewards/equation_reward_func": 0.1901041711680591, "rewards/format_reward_func": 1.0, "step": 450 }, { "completion_length": 56.00520944595337, "epoch": 0.24106666666666668, "grad_norm": 0.10771665572982514, "kl": 0.463623046875, "learning_rate": 2.6034307099445292e-06, "loss": 0.0005, "reward": 1.2473958805203438, "reward_std": 0.09390426380559802, "rewards/equation_reward_func": 0.24739584140479565, "rewards/format_reward_func": 1.0, "step": 452 }, { "completion_length": 55.16145992279053, "epoch": 0.24213333333333334, "grad_norm": 0.10777922280809354, "kl": 0.42578125, "learning_rate": 2.585450506896915e-06, "loss": 0.0004, "reward": 1.343750037252903, "reward_std": 0.09162600105628371, "rewards/equation_reward_func": 0.3437500102445483, "rewards/format_reward_func": 1.0, "step": 454 }, { "completion_length": 54.61979293823242, "epoch": 0.2432, "grad_norm": 0.12411467410005365, "kl": 0.4093017578125, "learning_rate": 2.567465877516968e-06, "loss": 0.0004, "reward": 1.1718750223517418, "reward_std": 0.03743588970974088, "rewards/equation_reward_func": 0.17187500442378223, "rewards/format_reward_func": 1.0, "step": 456 }, { "completion_length": 56.671876430511475, "epoch": 0.24426666666666666, "grad_norm": 0.07334127959616951, "kl": 0.426513671875, "learning_rate": 2.5494777534076647e-06, "loss": 0.0004, "reward": 1.2239583507180214, "reward_std": 0.08259108662605286, "rewards/equation_reward_func": 0.22395833651535213, "rewards/format_reward_func": 1.0, "step": 458 }, { "completion_length": 55.960938692092896, "epoch": 0.24533333333333332, "grad_norm": 0.12366324669628964, "kl": 0.395263671875, "learning_rate": 2.5314870663530083e-06, "loss": 0.0004, "reward": 1.3333333656191826, "reward_std": 0.07654263358563185, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.0, "step": 460 }, { "completion_length": 54.09114670753479, "epoch": 0.2464, "grad_norm": 0.035603617122373005, "kl": 0.4075927734375, "learning_rate": 2.5134947482697615e-06, "loss": 0.0004, "reward": 1.239583358168602, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.23958334140479565, "rewards/format_reward_func": 1.0, "step": 462 }, { "completion_length": 54.072918176651, "epoch": 0.24746666666666667, "grad_norm": 0.07643038420514706, "kl": 0.4041748046875, "learning_rate": 2.495501731159174e-06, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.031208508647978306, "rewards/equation_reward_func": 0.2916666748933494, "rewards/format_reward_func": 1.0, "step": 464 }, { "completion_length": 53.90885591506958, "epoch": 0.24853333333333333, "grad_norm": 0.14200853375202732, "kl": 0.3963623046875, "learning_rate": 2.4775089470587057e-06, "loss": 0.0004, "reward": 1.210937537252903, "reward_std": 0.03664955124258995, "rewards/equation_reward_func": 0.21093750721774995, "rewards/format_reward_func": 1.0, "step": 466 }, { "completion_length": 54.367188453674316, "epoch": 0.2496, "grad_norm": 0.19083812752236776, "kl": 0.375244140625, "learning_rate": 2.4595173279937464e-06, "loss": 0.0004, "reward": 1.2526042088866234, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.2526041748933494, "rewards/format_reward_func": 1.0, "step": 468 }, { "completion_length": 52.429688692092896, "epoch": 0.25066666666666665, "grad_norm": 0.10654234829100602, "kl": 0.443115234375, "learning_rate": 2.441527805929338e-06, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.27083334140479565, "rewards/format_reward_func": 1.0, "step": 470 }, { "completion_length": 53.601563930511475, "epoch": 0.2517333333333333, "grad_norm": 0.15307419576429313, "kl": 0.418212890625, "learning_rate": 2.423541312721896e-06, "loss": 0.0004, "reward": 1.2630208656191826, "reward_std": 0.03515762556344271, "rewards/equation_reward_func": 0.26302084140479565, "rewards/format_reward_func": 1.0, "step": 472 }, { "completion_length": 53.125001192092896, "epoch": 0.2528, "grad_norm": 0.32551456625063296, "kl": 1.44970703125, "learning_rate": 2.4055587800709423e-06, "loss": 0.0014, "reward": 1.1640625223517418, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.16406250558793545, "rewards/format_reward_func": 1.0, "step": 474 }, { "completion_length": 53.078126192092896, "epoch": 0.2538666666666667, "grad_norm": 0.07709090937740397, "kl": 0.407470703125, "learning_rate": 2.3875811394708433e-06, "loss": 0.0004, "reward": 1.1354166865348816, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.1354166716337204, "rewards/format_reward_func": 1.0, "step": 476 }, { "completion_length": 54.476563453674316, "epoch": 0.25493333333333335, "grad_norm": 0.2387216691645432, "kl": 0.3582763671875, "learning_rate": 2.369609322162553e-06, "loss": 0.0004, "reward": 1.216145858168602, "reward_std": 0.04629351105540991, "rewards/equation_reward_func": 0.21614584024064243, "rewards/format_reward_func": 1.0, "step": 478 }, { "completion_length": 53.890626430511475, "epoch": 0.256, "grad_norm": 0.1565674333814937, "kl": 0.399658203125, "learning_rate": 2.351644259085387e-06, "loss": 0.0004, "reward": 1.164062537252903, "reward_std": 0.05910024931654334, "rewards/equation_reward_func": 0.16406250512227416, "rewards/format_reward_func": 1.0, "step": 480 }, { "completion_length": 52.515626668930054, "epoch": 0.25706666666666667, "grad_norm": 0.08281822153807049, "kl": 0.39306640625, "learning_rate": 2.3336868808287845e-06, "loss": 0.0004, "reward": 1.2109375298023224, "reward_std": 0.03515762556344271, "rewards/equation_reward_func": 0.21093750814907253, "rewards/format_reward_func": 1.0, "step": 482 }, { "completion_length": 52.28385591506958, "epoch": 0.2581333333333333, "grad_norm": 0.17569671531962108, "kl": 0.4095458984375, "learning_rate": 2.3157381175841146e-06, "loss": 0.0004, "reward": 1.2447917014360428, "reward_std": 0.06864439835771918, "rewards/equation_reward_func": 0.24479167675599456, "rewards/format_reward_func": 1.0, "step": 484 }, { "completion_length": 53.41666793823242, "epoch": 0.2592, "grad_norm": 0.13780548705369053, "kl": 0.8272705078125, "learning_rate": 2.29779889909649e-06, "loss": 0.0008, "reward": 1.2760417014360428, "reward_std": 0.0640878714621067, "rewards/equation_reward_func": 0.2760416748933494, "rewards/format_reward_func": 1.0, "step": 486 }, { "completion_length": 53.34895968437195, "epoch": 0.26026666666666665, "grad_norm": 0.07148811780445115, "kl": 0.4310302734375, "learning_rate": 2.2798701546166026e-06, "loss": 0.0004, "reward": 1.1875000223517418, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.18750000512227416, "rewards/format_reward_func": 1.0, "step": 488 }, { "completion_length": 52.867188930511475, "epoch": 0.2613333333333333, "grad_norm": 0.06423982216937062, "kl": 0.3638916015625, "learning_rate": 2.261952812852594e-06, "loss": 0.0004, "reward": 1.281250037252903, "reward_std": 0.06181124225258827, "rewards/equation_reward_func": 0.2812500079162419, "rewards/format_reward_func": 1.0, "step": 490 }, { "completion_length": 52.908855676651, "epoch": 0.2624, "grad_norm": 0.0846045137631974, "kl": 0.404052734375, "learning_rate": 2.244047801921944e-06, "loss": 0.0004, "reward": 1.2109375223517418, "reward_std": 0.05444554518908262, "rewards/equation_reward_func": 0.21093750651925802, "rewards/format_reward_func": 1.0, "step": 492 }, { "completion_length": 53.40885519981384, "epoch": 0.2634666666666667, "grad_norm": 0.1067400740620474, "kl": 0.3988037109375, "learning_rate": 2.2261560493033963e-06, "loss": 0.0004, "reward": 1.2526042088866234, "reward_std": 0.08347559906542301, "rewards/equation_reward_func": 0.2526041753590107, "rewards/format_reward_func": 1.0, "step": 494 }, { "completion_length": 54.218751192092896, "epoch": 0.26453333333333334, "grad_norm": 0.07256791160443991, "kl": 0.3349609375, "learning_rate": 2.208278481788915e-06, "loss": 0.0003, "reward": 1.1796875223517418, "reward_std": 0.03857420431450009, "rewards/equation_reward_func": 0.17968750558793545, "rewards/format_reward_func": 1.0, "step": 496 }, { "completion_length": 54.88802218437195, "epoch": 0.2656, "grad_norm": 0.07112085855461625, "kl": 0.3714599609375, "learning_rate": 2.190416025435675e-06, "loss": 0.0004, "reward": 1.216145858168602, "reward_std": 0.036649550311267376, "rewards/equation_reward_func": 0.21614584024064243, "rewards/format_reward_func": 1.0, "step": 498 }, { "completion_length": 53.61979293823242, "epoch": 0.26666666666666666, "grad_norm": 0.10498184046893184, "kl": 0.534912109375, "learning_rate": 2.172569605518096e-06, "loss": 0.0005, "reward": 1.2656250298023224, "reward_std": 0.058746636379510164, "rewards/equation_reward_func": 0.265625006519258, "rewards/format_reward_func": 1.0, "step": 500 }, { "completion_length": 54.88541793823242, "epoch": 0.2677333333333333, "grad_norm": 0.04571174588948423, "kl": 0.380859375, "learning_rate": 2.15474014647991e-06, "loss": 0.0004, "reward": 1.2291666865348816, "reward_std": 0.03910674247890711, "rewards/equation_reward_func": 0.2291666711680591, "rewards/format_reward_func": 1.0, "step": 502 }, { "completion_length": 54.31510543823242, "epoch": 0.2688, "grad_norm": 0.05576172706581537, "kl": 0.3726806640625, "learning_rate": 2.136928571886275e-06, "loss": 0.0004, "reward": 1.1562500149011612, "reward_std": 0.02779193129390478, "rewards/equation_reward_func": 0.1562500037252903, "rewards/format_reward_func": 1.0, "step": 504 }, { "completion_length": 55.773439168930054, "epoch": 0.26986666666666664, "grad_norm": 0.059589196325192564, "kl": 0.3621826171875, "learning_rate": 2.119135804375937e-06, "loss": 0.0004, "reward": 1.1536458507180214, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.15364583767950535, "rewards/format_reward_func": 1.0, "step": 506 }, { "completion_length": 53.98958420753479, "epoch": 0.27093333333333336, "grad_norm": 0.05913338645068504, "kl": 0.3856201171875, "learning_rate": 2.101362765613436e-06, "loss": 0.0004, "reward": 1.200520858168602, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.2005208395421505, "rewards/format_reward_func": 1.0, "step": 508 }, { "completion_length": 55.265625953674316, "epoch": 0.272, "grad_norm": 0.08212586361582647, "kl": 0.387451171875, "learning_rate": 2.083610376241364e-06, "loss": 0.0004, "reward": 1.3072916939854622, "reward_std": 0.05725471256300807, "rewards/equation_reward_func": 0.3072916741948575, "rewards/format_reward_func": 1.0, "step": 510 }, { "completion_length": 55.82291769981384, "epoch": 0.2730666666666667, "grad_norm": 0.0014411808806308035, "kl": 0.39892578125, "learning_rate": 2.0658795558326745e-06, "loss": 0.0004, "reward": 1.2682291939854622, "reward_std": 0.037789500784128904, "rewards/equation_reward_func": 0.26822917349636555, "rewards/format_reward_func": 1.0, "step": 512 }, { "completion_length": 56.078126430511475, "epoch": 0.27413333333333334, "grad_norm": 0.12638894673333173, "kl": 0.3673095703125, "learning_rate": 2.0481712228430495e-06, "loss": 0.0004, "reward": 1.3541666865348816, "reward_std": 0.07408707588911057, "rewards/equation_reward_func": 0.3541666748933494, "rewards/format_reward_func": 1.0, "step": 514 }, { "completion_length": 55.395835399627686, "epoch": 0.2752, "grad_norm": 0.003652563697235084, "kl": 0.40380859375, "learning_rate": 2.030486294563325e-06, "loss": 0.0004, "reward": 1.200520858168602, "reward_std": 0.020426234230399132, "rewards/equation_reward_func": 0.2005208390764892, "rewards/format_reward_func": 1.0, "step": 516 }, { "completion_length": 55.265626668930054, "epoch": 0.27626666666666666, "grad_norm": 0.07875827788354772, "kl": 0.39208984375, "learning_rate": 2.012825687071974e-06, "loss": 0.0004, "reward": 1.325520858168602, "reward_std": 0.046472438145428896, "rewards/equation_reward_func": 0.32552084024064243, "rewards/format_reward_func": 1.0, "step": 518 }, { "completion_length": 55.127604961395264, "epoch": 0.2773333333333333, "grad_norm": 0.06637272951113936, "kl": 0.3955078125, "learning_rate": 1.9951903151876518e-06, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.035511236637830734, "rewards/equation_reward_func": 0.2916666744276881, "rewards/format_reward_func": 1.0, "step": 520 }, { "completion_length": 56.11198019981384, "epoch": 0.2784, "grad_norm": 0.11358097124817731, "kl": 0.4456787109375, "learning_rate": 1.9775810924218126e-06, "loss": 0.0004, "reward": 1.3750000447034836, "reward_std": 0.06689867237582803, "rewards/equation_reward_func": 0.3750000100117177, "rewards/format_reward_func": 1.0, "step": 522 }, { "completion_length": 55.33333468437195, "epoch": 0.27946666666666664, "grad_norm": 0.06819570200631479, "kl": 0.3876953125, "learning_rate": 1.9599989309313873e-06, "loss": 0.0004, "reward": 1.2187500223517418, "reward_std": 0.019287919625639915, "rewards/equation_reward_func": 0.2187500074505806, "rewards/format_reward_func": 1.0, "step": 524 }, { "completion_length": 55.50520944595337, "epoch": 0.28053333333333336, "grad_norm": 0.056558280597430614, "kl": 0.3763427734375, "learning_rate": 1.9424447414715325e-06, "loss": 0.0004, "reward": 1.1848958507180214, "reward_std": 0.03664955124258995, "rewards/equation_reward_func": 0.18489583861082792, "rewards/format_reward_func": 1.0, "step": 526 }, { "completion_length": 55.86458468437195, "epoch": 0.2816, "grad_norm": 0.001365563594823176, "kl": 0.3681640625, "learning_rate": 1.9249194333484567e-06, "loss": 0.0004, "reward": 1.2161458656191826, "reward_std": 0.03629757510498166, "rewards/equation_reward_func": 0.21614584024064243, "rewards/format_reward_func": 1.0, "step": 528 }, { "completion_length": 54.986980676651, "epoch": 0.2826666666666667, "grad_norm": 0.057505800002901244, "kl": 0.3658447265625, "learning_rate": 1.9074239143723145e-06, "loss": 0.0004, "reward": 1.3619792088866234, "reward_std": 0.041206078603863716, "rewards/equation_reward_func": 0.361979179084301, "rewards/format_reward_func": 1.0, "step": 530 }, { "completion_length": 57.031251430511475, "epoch": 0.28373333333333334, "grad_norm": 0.07122803954502509, "kl": 0.4356689453125, "learning_rate": 1.8899590908101853e-06, "loss": 0.0004, "reward": 1.3020833730697632, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.3020833421032876, "rewards/format_reward_func": 1.0, "step": 532 }, { "completion_length": 56.40364742279053, "epoch": 0.2848, "grad_norm": 0.06846058767995586, "kl": 0.4139404296875, "learning_rate": 1.8725258673391283e-06, "loss": 0.0004, "reward": 1.2916666939854622, "reward_std": 0.05216728104278445, "rewards/equation_reward_func": 0.2916666760575026, "rewards/format_reward_func": 1.0, "step": 534 }, { "completion_length": 55.19270944595337, "epoch": 0.28586666666666666, "grad_norm": 0.059791080229339565, "kl": 0.3824462890625, "learning_rate": 1.8551251469993176e-06, "loss": 0.0004, "reward": 1.2994792014360428, "reward_std": 0.0470782145857811, "rewards/equation_reward_func": 0.2994791753590107, "rewards/format_reward_func": 1.0, "step": 536 }, { "completion_length": 57.77083420753479, "epoch": 0.2869333333333333, "grad_norm": 0.0015334438320037363, "kl": 0.37548828125, "learning_rate": 1.8377578311472686e-06, "loss": 0.0004, "reward": 1.2500000298023224, "reward_std": 0.0, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 1.0, "step": 538 }, { "completion_length": 55.90885519981384, "epoch": 0.288, "grad_norm": 0.12956499332010468, "kl": 0.3951416015625, "learning_rate": 1.8204248194091429e-06, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.08451573923230171, "rewards/equation_reward_func": 0.2708333400078118, "rewards/format_reward_func": 1.0, "step": 540 }, { "completion_length": 57.234376668930054, "epoch": 0.2890666666666667, "grad_norm": 0.12892523943061418, "kl": 0.439453125, "learning_rate": 1.8031270096341536e-06, "loss": 0.0004, "reward": 1.239583358168602, "reward_std": 0.051988353952765465, "rewards/equation_reward_func": 0.23958334000781178, "rewards/format_reward_func": 1.0, "step": 542 }, { "completion_length": 59.83854341506958, "epoch": 0.29013333333333335, "grad_norm": 0.07701166996588489, "kl": 0.4437255859375, "learning_rate": 1.7858652978480516e-06, "loss": 0.0004, "reward": 1.2447916939854622, "reward_std": 0.08294306229799986, "rewards/equation_reward_func": 0.24739584140479565, "rewards/format_reward_func": 0.9973958358168602, "step": 544 }, { "completion_length": 57.585938692092896, "epoch": 0.2912, "grad_norm": 0.00354578778226613, "kl": 0.4229736328125, "learning_rate": 1.7686405782067151e-06, "loss": 0.0004, "reward": 1.278645858168602, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.2786458395421505, "rewards/format_reward_func": 1.0, "step": 546 }, { "completion_length": 57.986980676651, "epoch": 0.2922666666666667, "grad_norm": 0.08702632973020263, "kl": 0.4168701171875, "learning_rate": 1.7514537429498297e-06, "loss": 0.0004, "reward": 1.208333358168602, "reward_std": 0.060671291314065456, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 1.0, "step": 548 }, { "completion_length": 57.367189168930054, "epoch": 0.29333333333333333, "grad_norm": 0.06994705023202123, "kl": 0.4080810546875, "learning_rate": 1.7343056823546725e-06, "loss": 0.0004, "reward": 1.2734375298023224, "reward_std": 0.06874420773237944, "rewards/equation_reward_func": 0.273437506519258, "rewards/format_reward_func": 1.0, "step": 550 }, { "completion_length": 57.992189168930054, "epoch": 0.2944, "grad_norm": 0.03713201969436462, "kl": 0.3992919921875, "learning_rate": 1.7171972846899942e-06, "loss": 0.0004, "reward": 1.2239583656191826, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.22395834140479565, "rewards/format_reward_func": 1.0, "step": 552 }, { "completion_length": 56.710938453674316, "epoch": 0.29546666666666666, "grad_norm": 0.0031854510686083436, "kl": 0.49462890625, "learning_rate": 1.700129436170008e-06, "loss": 0.0005, "reward": 1.3567708730697632, "reward_std": 0.020426234230399132, "rewards/equation_reward_func": 0.3567708432674408, "rewards/format_reward_func": 1.0, "step": 554 }, { "completion_length": 58.27864742279053, "epoch": 0.2965333333333333, "grad_norm": 0.03516598691840604, "kl": 0.5328369140625, "learning_rate": 1.6831030209084839e-06, "loss": 0.0005, "reward": 1.190104179084301, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.19010416977107525, "rewards/format_reward_func": 1.0, "step": 556 }, { "completion_length": 54.632813692092896, "epoch": 0.2976, "grad_norm": 0.07353107157383423, "kl": 1.0828857421875, "learning_rate": 1.6661189208729492e-06, "loss": 0.0011, "reward": 1.3463542014360428, "reward_std": 0.04138500662520528, "rewards/equation_reward_func": 0.3463541753590107, "rewards/format_reward_func": 1.0, "step": 558 }, { "completion_length": 55.16666793823242, "epoch": 0.2986666666666667, "grad_norm": 0.08404894932698356, "kl": 0.3812255859375, "learning_rate": 1.6491780158390052e-06, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 1.0, "step": 560 }, { "completion_length": 53.74739742279053, "epoch": 0.29973333333333335, "grad_norm": 0.0936278855990997, "kl": 0.3426513671875, "learning_rate": 1.632281183344756e-06, "loss": 0.0003, "reward": 1.3958333879709244, "reward_std": 0.029283855576068163, "rewards/equation_reward_func": 0.3958333448972553, "rewards/format_reward_func": 1.0, "step": 562 }, { "completion_length": 55.13541793823242, "epoch": 0.3008, "grad_norm": 0.15018320923263204, "kl": 0.3760986328125, "learning_rate": 1.6154292986453485e-06, "loss": 0.0004, "reward": 1.1640625298023224, "reward_std": 0.05935404961928725, "rewards/equation_reward_func": 0.16406250605359674, "rewards/format_reward_func": 1.0, "step": 564 }, { "completion_length": 54.19791769981384, "epoch": 0.30186666666666667, "grad_norm": 0.047320353220118155, "kl": 0.3875732421875, "learning_rate": 1.5986232346676344e-06, "loss": 0.0004, "reward": 1.177083358168602, "reward_std": 0.019287919625639915, "rewards/equation_reward_func": 0.17708333861082792, "rewards/format_reward_func": 1.0, "step": 566 }, { "completion_length": 53.80729293823242, "epoch": 0.30293333333333333, "grad_norm": 0.11117222635741157, "kl": 0.3692626953125, "learning_rate": 1.5818638619649568e-06, "loss": 0.0004, "reward": 1.3906250521540642, "reward_std": 0.08294306369498372, "rewards/equation_reward_func": 0.3906250128056854, "rewards/format_reward_func": 1.0, "step": 568 }, { "completion_length": 53.843751192092896, "epoch": 0.304, "grad_norm": 0.0013187286444649735, "kl": 0.345703125, "learning_rate": 1.5651520486720518e-06, "loss": 0.0003, "reward": 1.2291666939854622, "reward_std": 0.0, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 1.0, "step": 570 }, { "completion_length": 53.55989694595337, "epoch": 0.30506666666666665, "grad_norm": 0.039428428756151296, "kl": 0.3472900390625, "learning_rate": 1.5484886604600795e-06, "loss": 0.0003, "reward": 1.320312537252903, "reward_std": 0.026653615292161703, "rewards/equation_reward_func": 0.32031250931322575, "rewards/format_reward_func": 1.0, "step": 572 }, { "completion_length": 54.549480676651, "epoch": 0.3061333333333333, "grad_norm": 0.032964619421432695, "kl": 0.3670654296875, "learning_rate": 1.5318745604917847e-06, "loss": 0.0004, "reward": 1.2604166939854622, "reward_std": 0.02779192989692092, "rewards/equation_reward_func": 0.26041667303070426, "rewards/format_reward_func": 1.0, "step": 574 }, { "completion_length": 54.45833468437195, "epoch": 0.3072, "grad_norm": 0.04544278903567721, "kl": 0.404052734375, "learning_rate": 1.5153106093767827e-06, "loss": 0.0004, "reward": 1.2213541939854622, "reward_std": 0.03156211972236633, "rewards/equation_reward_func": 0.22135417256504297, "rewards/format_reward_func": 1.0, "step": 576 }, { "completion_length": 54.76302218437195, "epoch": 0.3082666666666667, "grad_norm": 0.0220656665317917, "kl": 0.38427734375, "learning_rate": 1.498797665126979e-06, "loss": 0.0004, "reward": 1.3437500298023224, "reward_std": 0.06067129271104932, "rewards/equation_reward_func": 0.34375000931322575, "rewards/format_reward_func": 1.0, "step": 578 }, { "completion_length": 53.58333420753479, "epoch": 0.30933333333333335, "grad_norm": 0.05914889226186578, "kl": 0.3614501953125, "learning_rate": 1.4823365831121277e-06, "loss": 0.0004, "reward": 1.4114583730697632, "reward_std": 0.04401524690911174, "rewards/equation_reward_func": 0.4114583432674408, "rewards/format_reward_func": 1.0, "step": 580 }, { "completion_length": 53.757814168930054, "epoch": 0.3104, "grad_norm": 0.05572033170657447, "kl": 0.402587890625, "learning_rate": 1.4659282160155222e-06, "loss": 0.0004, "reward": 1.2942708730697632, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.2942708428017795, "rewards/format_reward_func": 1.0, "step": 582 }, { "completion_length": 54.291668176651, "epoch": 0.31146666666666667, "grad_norm": 0.08848201378435357, "kl": 0.358642578125, "learning_rate": 1.4495734137898227e-06, "loss": 0.0004, "reward": 1.3177083730697632, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.31770834093913436, "rewards/format_reward_func": 1.0, "step": 584 }, { "completion_length": 54.15885543823242, "epoch": 0.31253333333333333, "grad_norm": 0.06972992203049268, "kl": 0.4097900390625, "learning_rate": 1.4332730236130337e-06, "loss": 0.0004, "reward": 1.184895858168602, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1848958395421505, "rewards/format_reward_func": 1.0, "step": 586 }, { "completion_length": 53.57291865348816, "epoch": 0.3136, "grad_norm": 0.05135166946654117, "kl": 0.4085693359375, "learning_rate": 1.4170278898446176e-06, "loss": 0.0004, "reward": 1.3932292088866234, "reward_std": 0.040245057083666325, "rewards/equation_reward_func": 0.393229179084301, "rewards/format_reward_func": 1.0, "step": 588 }, { "completion_length": 54.145835161209106, "epoch": 0.31466666666666665, "grad_norm": 0.09141011042180326, "kl": 0.368408203125, "learning_rate": 1.4008388539817574e-06, "loss": 0.0004, "reward": 1.3593750447034836, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.3593750095460564, "rewards/format_reward_func": 1.0, "step": 590 }, { "completion_length": 53.601563453674316, "epoch": 0.3157333333333333, "grad_norm": 0.0036628846156534028, "kl": 0.402587890625, "learning_rate": 1.3847067546157671e-06, "loss": 0.0004, "reward": 1.2109375298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.21093750628642738, "rewards/format_reward_func": 1.0, "step": 592 }, { "completion_length": 54.093751430511475, "epoch": 0.3168, "grad_norm": 0.0010709922443727733, "kl": 0.396484375, "learning_rate": 1.3686324273886531e-06, "loss": 0.0004, "reward": 1.3411458730697632, "reward_std": 0.03515762696042657, "rewards/equation_reward_func": 0.3411458432674408, "rewards/format_reward_func": 1.0, "step": 594 }, { "completion_length": 53.30729293823242, "epoch": 0.3178666666666667, "grad_norm": 0.002584212749515802, "kl": 0.3919677734375, "learning_rate": 1.3526167049498265e-06, "loss": 0.0004, "reward": 1.2786458656191826, "reward_std": 0.044801585376262665, "rewards/equation_reward_func": 0.2786458416376263, "rewards/format_reward_func": 1.0, "step": 596 }, { "completion_length": 54.17708444595337, "epoch": 0.31893333333333335, "grad_norm": 0.02373170046005579, "kl": 0.3848876953125, "learning_rate": 1.3366604169129743e-06, "loss": 0.0004, "reward": 1.2838542014360428, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.2838541753590107, "rewards/format_reward_func": 1.0, "step": 598 }, { "completion_length": 53.385418176651, "epoch": 0.32, "grad_norm": 0.03660001583432564, "kl": 0.3646240234375, "learning_rate": 1.3207643898130854e-06, "loss": 0.0004, "reward": 1.2682291939854622, "reward_std": 0.03629757510498166, "rewards/equation_reward_func": 0.2682291748933494, "rewards/format_reward_func": 1.0, "step": 600 }, { "completion_length": 53.95052242279053, "epoch": 0.32106666666666667, "grad_norm": 0.09856332142644789, "kl": 0.3656005859375, "learning_rate": 1.3049294470636304e-06, "loss": 0.0004, "reward": 1.223958358168602, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.22395834140479565, "rewards/format_reward_func": 1.0, "step": 602 }, { "completion_length": 54.257813453674316, "epoch": 0.3221333333333333, "grad_norm": 0.0008388012696272095, "kl": 0.3529052734375, "learning_rate": 1.289156408913918e-06, "loss": 0.0004, "reward": 1.2265625298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.2265625074505806, "rewards/format_reward_func": 1.0, "step": 604 }, { "completion_length": 54.61979269981384, "epoch": 0.3232, "grad_norm": 0.06753784077533541, "kl": 0.36376953125, "learning_rate": 1.2734460924065992e-06, "loss": 0.0004, "reward": 1.3619792014360428, "reward_std": 0.04287693230435252, "rewards/equation_reward_func": 0.36197917396202683, "rewards/format_reward_func": 1.0, "step": 606 }, { "completion_length": 54.101563692092896, "epoch": 0.32426666666666665, "grad_norm": 0.05953968510321721, "kl": 0.3729248046875, "learning_rate": 1.2577993113353475e-06, "loss": 0.0004, "reward": 1.2421875298023224, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.2421875074505806, "rewards/format_reward_func": 1.0, "step": 608 }, { "completion_length": 54.440104722976685, "epoch": 0.3253333333333333, "grad_norm": 0.10862990113903555, "kl": 0.357177734375, "learning_rate": 1.2422168762027051e-06, "loss": 0.0004, "reward": 1.1640625223517418, "reward_std": 0.03174104681238532, "rewards/equation_reward_func": 0.16406250512227416, "rewards/format_reward_func": 1.0, "step": 610 }, { "completion_length": 52.75520968437195, "epoch": 0.3264, "grad_norm": 0.054423443021296904, "kl": 0.3450927734375, "learning_rate": 1.2266995941780934e-06, "loss": 0.0003, "reward": 1.393229216337204, "reward_std": 0.05163474287837744, "rewards/equation_reward_func": 0.3932291786186397, "rewards/format_reward_func": 1.0, "step": 612 }, { "completion_length": 52.890626192092896, "epoch": 0.3274666666666667, "grad_norm": 0.003421821348396495, "kl": 0.3782958984375, "learning_rate": 1.211248269056009e-06, "loss": 0.0004, "reward": 1.304687537252903, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.3046875111758709, "rewards/format_reward_func": 1.0, "step": 614 }, { "completion_length": 54.359376192092896, "epoch": 0.32853333333333334, "grad_norm": 0.055508596920617985, "kl": 0.3524169921875, "learning_rate": 1.1958637012143849e-06, "loss": 0.0004, "reward": 1.2031250223517418, "reward_std": 0.029283855576068163, "rewards/equation_reward_func": 0.20312500465661287, "rewards/format_reward_func": 1.0, "step": 616 }, { "completion_length": 53.510418176651, "epoch": 0.3296, "grad_norm": 0.002110590086909732, "kl": 0.36083984375, "learning_rate": 1.1805466875731277e-06, "loss": 0.0004, "reward": 1.2578125298023224, "reward_std": 0.03234682325273752, "rewards/equation_reward_func": 0.2578125074505806, "rewards/format_reward_func": 1.0, "step": 618 }, { "completion_length": 53.31770944595337, "epoch": 0.33066666666666666, "grad_norm": 0.0041158154213266854, "kl": 0.36474609375, "learning_rate": 1.1652980215528415e-06, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.0, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 1.0, "step": 620 }, { "completion_length": 52.94791769981384, "epoch": 0.3317333333333333, "grad_norm": 0.10272303719354359, "kl": 0.35107421875, "learning_rate": 1.1501184930337236e-06, "loss": 0.0004, "reward": 1.3593750298023224, "reward_std": 0.02779192989692092, "rewards/equation_reward_func": 0.3593750074505806, "rewards/format_reward_func": 1.0, "step": 622 }, { "completion_length": 53.76302194595337, "epoch": 0.3328, "grad_norm": 0.08455154932697702, "kl": 0.8724365234375, "learning_rate": 1.135008888314655e-06, "loss": 0.0009, "reward": 1.2291666939854622, "reward_std": 0.029283855576068163, "rewards/equation_reward_func": 0.22916667256504297, "rewards/format_reward_func": 1.0, "step": 624 }, { "completion_length": 53.23958468437195, "epoch": 0.33386666666666664, "grad_norm": 0.0017269292003182076, "kl": 0.360595703125, "learning_rate": 1.1199699900724658e-06, "loss": 0.0004, "reward": 1.2447916865348816, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.2447916716337204, "rewards/format_reward_func": 1.0, "step": 626 }, { "completion_length": 53.95052218437195, "epoch": 0.33493333333333336, "grad_norm": 0.11118016611442859, "kl": 0.403564453125, "learning_rate": 1.1050025773213943e-06, "loss": 0.0004, "reward": 1.3619792088866234, "reward_std": 0.01850158115848899, "rewards/equation_reward_func": 0.36197917722165585, "rewards/format_reward_func": 1.0, "step": 628 }, { "completion_length": 52.52083420753479, "epoch": 0.336, "grad_norm": 0.04680259166400608, "kl": 0.361572265625, "learning_rate": 1.0901074253727338e-06, "loss": 0.0004, "reward": 1.270833358168602, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.2708333395421505, "rewards/format_reward_func": 1.0, "step": 630 }, { "completion_length": 52.955730676651, "epoch": 0.3370666666666667, "grad_norm": 0.0021280251984700832, "kl": 0.468994140625, "learning_rate": 1.0752853057946711e-06, "loss": 0.0005, "reward": 1.3333333656191826, "reward_std": 0.0, "rewards/equation_reward_func": 0.33333334140479565, "rewards/format_reward_func": 1.0, "step": 632 }, { "completion_length": 52.96614718437195, "epoch": 0.33813333333333334, "grad_norm": 0.05323290232652677, "kl": 0.3575439453125, "learning_rate": 1.060536986372318e-06, "loss": 0.0004, "reward": 1.3125000298023224, "reward_std": 0.05024262657389045, "rewards/equation_reward_func": 0.3151041753590107, "rewards/format_reward_func": 0.9973958358168602, "step": 634 }, { "completion_length": 54.22135543823242, "epoch": 0.3392, "grad_norm": 0.07387085921683315, "kl": 0.4681396484375, "learning_rate": 1.045863231067944e-06, "loss": 0.0005, "reward": 1.2447916939854622, "reward_std": 0.042523321229964495, "rewards/equation_reward_func": 0.24479167349636555, "rewards/format_reward_func": 1.0, "step": 636 }, { "completion_length": 53.648438692092896, "epoch": 0.34026666666666666, "grad_norm": 0.06546663631003981, "kl": 0.3916015625, "learning_rate": 1.0312647999814e-06, "loss": 0.0004, "reward": 1.3046875298023224, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.3046875074505806, "rewards/format_reward_func": 1.0, "step": 638 }, { "completion_length": 56.94270968437195, "epoch": 0.3413333333333333, "grad_norm": 0.05879491178572667, "kl": 0.48828125, "learning_rate": 1.016742449310745e-06, "loss": 0.0005, "reward": 1.200520858168602, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.20312500558793545, "rewards/format_reward_func": 0.9973958358168602, "step": 640 }, { "completion_length": 54.58854269981384, "epoch": 0.3424, "grad_norm": 0.023127125587042094, "kl": 0.4007568359375, "learning_rate": 1.0022969313130773e-06, "loss": 0.0004, "reward": 1.2473958656191826, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.24739584140479565, "rewards/format_reward_func": 1.0, "step": 642 }, { "completion_length": 54.328126668930054, "epoch": 0.34346666666666664, "grad_norm": 0.03269961349644101, "kl": 0.400390625, "learning_rate": 9.879289942655652e-07, "loss": 0.0004, "reward": 1.2916667088866234, "reward_std": 0.04059866815805435, "rewards/equation_reward_func": 0.29166667722165585, "rewards/format_reward_func": 1.0, "step": 644 }, { "completion_length": 54.02604293823242, "epoch": 0.34453333333333336, "grad_norm": 0.0015668610352273652, "kl": 0.3875732421875, "learning_rate": 9.736393824266876e-07, "loss": 0.0004, "reward": 1.2500000298023224, "reward_std": 0.0, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 1.0, "step": 646 }, { "completion_length": 53.908855676651, "epoch": 0.3456, "grad_norm": 0.0014270227328498826, "kl": 0.364501953125, "learning_rate": 9.594288359976817e-07, "loss": 0.0004, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.0, "step": 648 }, { "completion_length": 53.78645944595337, "epoch": 0.3466666666666667, "grad_norm": 0.0019091556203229391, "kl": 0.3760986328125, "learning_rate": 9.452980910841993e-07, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.0, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 1.0, "step": 650 }, { "completion_length": 53.81770920753479, "epoch": 0.34773333333333334, "grad_norm": 0.09627440550422062, "kl": 0.48486328125, "learning_rate": 9.312478796581792e-07, "loss": 0.0005, "reward": 1.3619792088866234, "reward_std": 0.04988901689648628, "rewards/equation_reward_func": 0.3619791786186397, "rewards/format_reward_func": 1.0, "step": 652 }, { "completion_length": 53.82552242279053, "epoch": 0.3488, "grad_norm": 0.0015109282683432436, "kl": 0.35546875, "learning_rate": 9.172789295199256e-07, "loss": 0.0004, "reward": 1.3177083656191826, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.31770834140479565, "rewards/format_reward_func": 1.0, "step": 654 }, { "completion_length": 54.203126192092896, "epoch": 0.34986666666666666, "grad_norm": 0.0012398342616790586, "kl": 0.400146484375, "learning_rate": 9.03391964260415e-07, "loss": 0.0004, "reward": 1.3515625298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.3515625074505806, "rewards/format_reward_func": 1.0, "step": 656 }, { "completion_length": 56.86979341506958, "epoch": 0.3509333333333333, "grad_norm": 0.05713890888124243, "kl": 0.3778076171875, "learning_rate": 8.895877032238096e-07, "loss": 0.0004, "reward": 1.2838541939854622, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.2864583432674408, "rewards/format_reward_func": 0.9973958358168602, "step": 658 }, { "completion_length": 54.039063453674316, "epoch": 0.352, "grad_norm": 0.0009618941388559105, "kl": 0.356201171875, "learning_rate": 8.758668614701973e-07, "loss": 0.0004, "reward": 1.3515625298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.3515625074505806, "rewards/format_reward_func": 1.0, "step": 660 }, { "completion_length": 54.13802194595337, "epoch": 0.35306666666666664, "grad_norm": 0.028511363400704693, "kl": 0.41943359375, "learning_rate": 8.622301497385508e-07, "loss": 0.0004, "reward": 1.2734375298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.2734375062864274, "rewards/format_reward_func": 1.0, "step": 662 }, { "completion_length": 53.84895944595337, "epoch": 0.35413333333333336, "grad_norm": 0.006808197054012968, "kl": 0.8538818359375, "learning_rate": 8.486782744099117e-07, "loss": 0.0009, "reward": 1.1875000223517418, "reward_std": 0.0, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 1.0, "step": 664 }, { "completion_length": 53.471355676651, "epoch": 0.3552, "grad_norm": 0.06380295899414516, "kl": 0.3599853515625, "learning_rate": 8.352119374707979e-07, "loss": 0.0004, "reward": 1.3385417088866234, "reward_std": 0.025867276825010777, "rewards/equation_reward_func": 0.33854167722165585, "rewards/format_reward_func": 1.0, "step": 666 }, { "completion_length": 54.43229269981384, "epoch": 0.3562666666666667, "grad_norm": 0.000957074945124631, "kl": 0.3529052734375, "learning_rate": 8.218318364768451e-07, "loss": 0.0004, "reward": 1.2291666939854622, "reward_std": 0.0, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 1.0, "step": 668 }, { "completion_length": 53.80989694595337, "epoch": 0.35733333333333334, "grad_norm": 0.0493844055858425, "kl": 0.4134521484375, "learning_rate": 8.085386645166698e-07, "loss": 0.0004, "reward": 1.2447917014360428, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.2447916753590107, "rewards/format_reward_func": 1.0, "step": 670 }, { "completion_length": 53.73177218437195, "epoch": 0.3584, "grad_norm": 0.02489738449659351, "kl": 0.36865234375, "learning_rate": 7.953331101759706e-07, "loss": 0.0004, "reward": 1.1875000298023224, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.18750000628642738, "rewards/format_reward_func": 1.0, "step": 672 }, { "completion_length": 54.30989718437195, "epoch": 0.35946666666666666, "grad_norm": 0.05510373618888561, "kl": 0.5057373046875, "learning_rate": 7.822158575018535e-07, "loss": 0.0005, "reward": 1.3229167014360428, "reward_std": 0.03287936141714454, "rewards/equation_reward_func": 0.3229166741948575, "rewards/format_reward_func": 1.0, "step": 674 }, { "completion_length": 54.64583468437195, "epoch": 0.3605333333333333, "grad_norm": 0.005731883171549106, "kl": 0.3985595703125, "learning_rate": 7.691875859674053e-07, "loss": 0.0004, "reward": 1.2604167014360428, "reward_std": 0.011135885491967201, "rewards/equation_reward_func": 0.2604166744276881, "rewards/format_reward_func": 1.0, "step": 676 }, { "completion_length": 54.359376430511475, "epoch": 0.3616, "grad_norm": 0.034996714446269166, "kl": 0.361328125, "learning_rate": 7.56248970436493e-07, "loss": 0.0004, "reward": 1.200520858168602, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.2005208395421505, "rewards/format_reward_func": 1.0, "step": 678 }, { "completion_length": 54.190104961395264, "epoch": 0.3626666666666667, "grad_norm": 0.001366911134600043, "kl": 0.3656005859375, "learning_rate": 7.434006811288069e-07, "loss": 0.0004, "reward": 1.3723958730697632, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.3723958432674408, "rewards/format_reward_func": 1.0, "step": 680 }, { "completion_length": 54.27864718437195, "epoch": 0.36373333333333335, "grad_norm": 0.075504024487262, "kl": 1.032958984375, "learning_rate": 7.306433835851423e-07, "loss": 0.001, "reward": 1.242187537252903, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.24218750814907253, "rewards/format_reward_func": 1.0, "step": 682 }, { "completion_length": 53.61458444595337, "epoch": 0.3648, "grad_norm": 0.014655869705823969, "kl": 0.5537109375, "learning_rate": 7.179777386329276e-07, "loss": 0.0006, "reward": 1.239583358168602, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.23958334140479565, "rewards/format_reward_func": 1.0, "step": 684 }, { "completion_length": 53.66145968437195, "epoch": 0.3658666666666667, "grad_norm": 0.08462688279948909, "kl": 0.3863525390625, "learning_rate": 7.054044023519871e-07, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.27083334024064243, "rewards/format_reward_func": 1.0, "step": 686 }, { "completion_length": 55.16927194595337, "epoch": 0.36693333333333333, "grad_norm": 0.058925241887043914, "kl": 0.3699951171875, "learning_rate": 6.929240260405634e-07, "loss": 0.0004, "reward": 1.2369792014360428, "reward_std": 0.033232972491532564, "rewards/equation_reward_func": 0.23958334047347307, "rewards/format_reward_func": 0.9973958358168602, "step": 688 }, { "completion_length": 53.18489718437195, "epoch": 0.368, "grad_norm": 0.0025747448808322694, "kl": 0.41064453125, "learning_rate": 6.805372561815768e-07, "loss": 0.0004, "reward": 1.289062537252903, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.28906250931322575, "rewards/format_reward_func": 1.0, "step": 690 }, { "completion_length": 54.59114718437195, "epoch": 0.36906666666666665, "grad_norm": 0.038409956280049735, "kl": 0.417724609375, "learning_rate": 6.682447344091364e-07, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.27083334140479565, "rewards/format_reward_func": 1.0, "step": 692 }, { "completion_length": 54.52864742279053, "epoch": 0.3701333333333333, "grad_norm": 0.0008978397061271276, "kl": 0.37353515625, "learning_rate": 6.560470974753052e-07, "loss": 0.0004, "reward": 1.2057291865348816, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.2057291716337204, "rewards/format_reward_func": 1.0, "step": 694 }, { "completion_length": 54.39062571525574, "epoch": 0.3712, "grad_norm": 0.03559191125731647, "kl": 0.443359375, "learning_rate": 6.439449772171163e-07, "loss": 0.0004, "reward": 1.2109375298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.21093750628642738, "rewards/format_reward_func": 1.0, "step": 696 }, { "completion_length": 54.625001192092896, "epoch": 0.3722666666666667, "grad_norm": 0.015158725996873044, "kl": 0.4281005859375, "learning_rate": 6.319390005238432e-07, "loss": 0.0004, "reward": 1.2890625223517418, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.28906250558793545, "rewards/format_reward_func": 1.0, "step": 698 }, { "completion_length": 54.64062571525574, "epoch": 0.37333333333333335, "grad_norm": 0.05458037929690762, "kl": 0.4036865234375, "learning_rate": 6.20029789304527e-07, "loss": 0.0004, "reward": 1.2838542088866234, "reward_std": 0.028145540971308947, "rewards/equation_reward_func": 0.28385417629033327, "rewards/format_reward_func": 1.0, "step": 700 }, { "completion_length": 54.705730676651, "epoch": 0.3744, "grad_norm": 0.0016946645479741956, "kl": 0.380859375, "learning_rate": 6.082179604557617e-07, "loss": 0.0004, "reward": 1.289062537252903, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.28906250931322575, "rewards/format_reward_func": 1.0, "step": 702 }, { "completion_length": 54.55989742279053, "epoch": 0.37546666666666667, "grad_norm": 0.035514587659000764, "kl": 0.3909912109375, "learning_rate": 5.965041258297397e-07, "loss": 0.0004, "reward": 1.1901041939854622, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.19010417233221233, "rewards/format_reward_func": 1.0, "step": 704 }, { "completion_length": 53.578126430511475, "epoch": 0.37653333333333333, "grad_norm": 0.002222090665027611, "kl": 0.459716796875, "learning_rate": 5.848888922025553e-07, "loss": 0.0005, "reward": 1.2447916939854622, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.2447916753590107, "rewards/format_reward_func": 1.0, "step": 706 }, { "completion_length": 54.369793176651, "epoch": 0.3776, "grad_norm": 0.0432686884123198, "kl": 0.3992919921875, "learning_rate": 5.733728612427772e-07, "loss": 0.0004, "reward": 1.2031250223517418, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.20312500558793545, "rewards/format_reward_func": 1.0, "step": 708 }, { "completion_length": 53.625001430511475, "epoch": 0.37866666666666665, "grad_norm": 0.059698551905734296, "kl": 0.378173828125, "learning_rate": 5.619566294802803e-07, "loss": 0.0004, "reward": 1.3828125298023224, "reward_std": 0.026653615292161703, "rewards/equation_reward_func": 0.3828125111758709, "rewards/format_reward_func": 1.0, "step": 710 }, { "completion_length": 54.968751192092896, "epoch": 0.3797333333333333, "grad_norm": 0.033445444276081406, "kl": 0.5015869140625, "learning_rate": 5.506407882753456e-07, "loss": 0.0005, "reward": 1.1666666939854622, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.16927083837799728, "rewards/format_reward_func": 0.9973958358168602, "step": 712 }, { "completion_length": 54.132813692092896, "epoch": 0.3808, "grad_norm": 0.061151246596668575, "kl": 0.691650390625, "learning_rate": 5.394259237880272e-07, "loss": 0.0007, "reward": 1.2291666939854622, "reward_std": 0.0, "rewards/equation_reward_func": 0.22916667349636555, "rewards/format_reward_func": 1.0, "step": 714 }, { "completion_length": 54.578126192092896, "epoch": 0.3818666666666667, "grad_norm": 0.0021255933334136824, "kl": 0.3978271484375, "learning_rate": 5.283126169477914e-07, "loss": 0.0004, "reward": 1.1875000223517418, "reward_std": 0.0, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 1.0, "step": 716 }, { "completion_length": 53.06510543823242, "epoch": 0.38293333333333335, "grad_norm": 0.0021466206010982676, "kl": 0.3798828125, "learning_rate": 5.173014434234208e-07, "loss": 0.0004, "reward": 1.289062537252903, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.28906250931322575, "rewards/format_reward_func": 1.0, "step": 718 }, { "completion_length": 54.57291793823242, "epoch": 0.384, "grad_norm": 0.030046046975439514, "kl": 0.4244384765625, "learning_rate": 5.063929735931985e-07, "loss": 0.0004, "reward": 1.2213541865348816, "reward_std": 0.028930244501680136, "rewards/equation_reward_func": 0.22135417349636555, "rewards/format_reward_func": 1.0, "step": 720 }, { "completion_length": 53.31510519981384, "epoch": 0.38506666666666667, "grad_norm": 0.07080806705799082, "kl": 0.373046875, "learning_rate": 4.955877725153604e-07, "loss": 0.0004, "reward": 1.1666666939854622, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.16666667233221233, "rewards/format_reward_func": 1.0, "step": 722 }, { "completion_length": 53.549480676651, "epoch": 0.38613333333333333, "grad_norm": 0.05090828466741818, "kl": 0.4261474609375, "learning_rate": 4.84886399898826e-07, "loss": 0.0004, "reward": 1.3125000223517418, "reward_std": 0.03287936141714454, "rewards/equation_reward_func": 0.31250000558793545, "rewards/format_reward_func": 1.0, "step": 724 }, { "completion_length": 53.16666793823242, "epoch": 0.3872, "grad_norm": 0.0014271472903984441, "kl": 0.360595703125, "learning_rate": 4.7428941007420625e-07, "loss": 0.0004, "reward": 1.3072917014360428, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.3072916753590107, "rewards/format_reward_func": 1.0, "step": 726 }, { "completion_length": 53.87760591506958, "epoch": 0.38826666666666665, "grad_norm": 0.0793209829040656, "kl": 0.3935546875, "learning_rate": 4.63797351965086e-07, "loss": 0.0004, "reward": 1.3411458656191826, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.34114584093913436, "rewards/format_reward_func": 1.0, "step": 728 }, { "completion_length": 53.73177218437195, "epoch": 0.3893333333333333, "grad_norm": 0.33169984024282617, "kl": 0.41162109375, "learning_rate": 4.5341076905959376e-07, "loss": 0.0004, "reward": 1.2890625298023224, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.28906250814907253, "rewards/format_reward_func": 1.0, "step": 730 }, { "completion_length": 52.63802170753479, "epoch": 0.3904, "grad_norm": 0.03947437936930629, "kl": 0.67529296875, "learning_rate": 4.431301993822471e-07, "loss": 0.0007, "reward": 1.3072916939854622, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.30729167349636555, "rewards/format_reward_func": 1.0, "step": 732 }, { "completion_length": 54.08073043823242, "epoch": 0.3914666666666667, "grad_norm": 0.00459983316214303, "kl": 0.394775390625, "learning_rate": 4.329561754660827e-07, "loss": 0.0004, "reward": 1.2500000298023224, "reward_std": 0.0, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 1.0, "step": 734 }, { "completion_length": 54.65364694595337, "epoch": 0.39253333333333335, "grad_norm": 0.013590863885382408, "kl": 0.4984130859375, "learning_rate": 4.228892243250726e-07, "loss": 0.0005, "reward": 1.1015625149011612, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.1015625037252903, "rewards/format_reward_func": 1.0, "step": 736 }, { "completion_length": 55.213543176651, "epoch": 0.3936, "grad_norm": 0.0014429639739969799, "kl": 0.400390625, "learning_rate": 4.129298674268226e-07, "loss": 0.0004, "reward": 1.2812500298023224, "reward_std": 0.024375352542847395, "rewards/equation_reward_func": 0.2812500074505806, "rewards/format_reward_func": 1.0, "step": 738 }, { "completion_length": 53.64583444595337, "epoch": 0.39466666666666667, "grad_norm": 0.02656565564960493, "kl": 0.4044189453125, "learning_rate": 4.0307862066556265e-07, "loss": 0.0004, "reward": 1.2786458507180214, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.27864583767950535, "rewards/format_reward_func": 1.0, "step": 740 }, { "completion_length": 53.22656440734863, "epoch": 0.3957333333333333, "grad_norm": 0.0014083987051696572, "kl": 0.3685302734375, "learning_rate": 3.9333599433542285e-07, "loss": 0.0004, "reward": 1.479166716337204, "reward_std": 0.0, "rewards/equation_reward_func": 0.479166679084301, "rewards/format_reward_func": 1.0, "step": 742 }, { "completion_length": 55.065105676651, "epoch": 0.3968, "grad_norm": 0.057470153285508, "kl": 0.377685546875, "learning_rate": 3.837024931039995e-07, "loss": 0.0004, "reward": 1.1822916939854622, "reward_std": 0.03892781538888812, "rewards/equation_reward_func": 0.18229167140088975, "rewards/format_reward_func": 1.0, "step": 744 }, { "completion_length": 54.55989742279053, "epoch": 0.39786666666666665, "grad_norm": 0.07756679919223744, "kl": 0.4306640625, "learning_rate": 3.7417861598621347e-07, "loss": 0.0004, "reward": 1.273437537252903, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.27343750931322575, "rewards/format_reward_func": 1.0, "step": 746 }, { "completion_length": 53.484376192092896, "epoch": 0.3989333333333333, "grad_norm": 0.03902544772435559, "kl": 0.453857421875, "learning_rate": 3.6476485631846303e-07, "loss": 0.0005, "reward": 1.3307292014360428, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.33072917722165585, "rewards/format_reward_func": 1.0, "step": 748 }, { "completion_length": 53.31770944595337, "epoch": 0.4, "grad_norm": 0.0561616291149979, "kl": 0.44140625, "learning_rate": 3.554617017330644e-07, "loss": 0.0004, "reward": 1.2656250298023224, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.2656250074505806, "rewards/format_reward_func": 1.0, "step": 750 }, { "completion_length": 54.94270944595337, "epoch": 0.4010666666666667, "grad_norm": 0.08562704937951225, "kl": 0.4373779296875, "learning_rate": 3.462696341329996e-07, "loss": 0.0004, "reward": 1.2526042014360428, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.2526041741948575, "rewards/format_reward_func": 1.0, "step": 752 }, { "completion_length": 55.20052218437195, "epoch": 0.40213333333333334, "grad_norm": 0.03308118020943631, "kl": 0.484130859375, "learning_rate": 3.371891296669474e-07, "loss": 0.0005, "reward": 1.2265625223517418, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.22656250558793545, "rewards/format_reward_func": 1.0, "step": 754 }, { "completion_length": 54.54166769981384, "epoch": 0.4032, "grad_norm": 0.07362735068786681, "kl": 0.4862060546875, "learning_rate": 3.2822065870462216e-07, "loss": 0.0005, "reward": 1.2526041939854622, "reward_std": 0.028145540971308947, "rewards/equation_reward_func": 0.25260417349636555, "rewards/format_reward_func": 1.0, "step": 756 }, { "completion_length": 54.16666793823242, "epoch": 0.40426666666666666, "grad_norm": 0.0032306170595747725, "kl": 0.3824462890625, "learning_rate": 3.1936468581240806e-07, "loss": 0.0004, "reward": 1.289062537252903, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.28906250931322575, "rewards/format_reward_func": 1.0, "step": 758 }, { "completion_length": 55.338542461395264, "epoch": 0.4053333333333333, "grad_norm": 0.04533318347337018, "kl": 0.441650390625, "learning_rate": 3.1062166972929323e-07, "loss": 0.0004, "reward": 1.2526042014360428, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.2526041753590107, "rewards/format_reward_func": 1.0, "step": 760 }, { "completion_length": 53.39323043823242, "epoch": 0.4064, "grad_norm": 0.0025661528016194778, "kl": 0.42578125, "learning_rate": 3.019920633431095e-07, "loss": 0.0004, "reward": 1.2760417014360428, "reward_std": 0.024375351145863533, "rewards/equation_reward_func": 0.2760416737291962, "rewards/format_reward_func": 1.0, "step": 762 }, { "completion_length": 53.99479269981384, "epoch": 0.40746666666666664, "grad_norm": 0.030729658887712456, "kl": 0.4056396484375, "learning_rate": 2.9347631366707124e-07, "loss": 0.0004, "reward": 1.3333333656191826, "reward_std": 0.019287919625639915, "rewards/equation_reward_func": 0.33333334140479565, "rewards/format_reward_func": 1.0, "step": 764 }, { "completion_length": 54.143230676651, "epoch": 0.40853333333333336, "grad_norm": 0.0023596558635757014, "kl": 0.378662109375, "learning_rate": 2.8507486181662076e-07, "loss": 0.0004, "reward": 1.2942708656191826, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.29427084024064243, "rewards/format_reward_func": 1.0, "step": 766 }, { "completion_length": 54.56770968437195, "epoch": 0.4096, "grad_norm": 0.0034876726855577684, "kl": 0.408935546875, "learning_rate": 2.7678814298657735e-07, "loss": 0.0004, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.0, "step": 768 }, { "completion_length": 54.51302218437195, "epoch": 0.4106666666666667, "grad_norm": 0.11490092569552311, "kl": 0.4324951171875, "learning_rate": 2.6861658642859696e-07, "loss": 0.0004, "reward": 1.2760416939854622, "reward_std": 0.04059866815805435, "rewards/equation_reward_func": 0.276041672565043, "rewards/format_reward_func": 1.0, "step": 770 }, { "completion_length": 54.20833468437195, "epoch": 0.41173333333333334, "grad_norm": 0.04343077161191203, "kl": 0.459228515625, "learning_rate": 2.6056061542893225e-07, "loss": 0.0005, "reward": 1.3255208656191826, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.3255208432674408, "rewards/format_reward_func": 1.0, "step": 772 }, { "completion_length": 54.17708468437195, "epoch": 0.4128, "grad_norm": 0.0840423546029653, "kl": 0.3916015625, "learning_rate": 2.52620647286512e-07, "loss": 0.0004, "reward": 1.2838542014360428, "reward_std": 0.03234682325273752, "rewards/equation_reward_func": 0.2838541753590107, "rewards/format_reward_func": 1.0, "step": 774 }, { "completion_length": 55.15364742279053, "epoch": 0.41386666666666666, "grad_norm": 0.21282953599684604, "kl": 0.4306640625, "learning_rate": 2.4479709329132074e-07, "loss": 0.0004, "reward": 1.3489583805203438, "reward_std": 0.02077984530478716, "rewards/equation_reward_func": 0.34895834419876337, "rewards/format_reward_func": 1.0, "step": 776 }, { "completion_length": 54.54427194595337, "epoch": 0.4149333333333333, "grad_norm": 0.040299199892509435, "kl": 0.4066162109375, "learning_rate": 2.370903587030965e-07, "loss": 0.0004, "reward": 1.3463542088866234, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.34635417722165585, "rewards/format_reward_func": 1.0, "step": 778 }, { "completion_length": 53.593750953674316, "epoch": 0.416, "grad_norm": 0.0160160866319604, "kl": 0.4176025390625, "learning_rate": 2.2950084273033634e-07, "loss": 0.0004, "reward": 1.2526042014360428, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.25520834140479565, "rewards/format_reward_func": 0.9973958358168602, "step": 780 }, { "completion_length": 55.21614718437195, "epoch": 0.41706666666666664, "grad_norm": 0.005180214768365815, "kl": 0.4544677734375, "learning_rate": 2.2202893850961943e-07, "loss": 0.0005, "reward": 1.1666666865348816, "reward_std": 0.0, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.0, "step": 782 }, { "completion_length": 54.75260543823242, "epoch": 0.41813333333333336, "grad_norm": 0.060106878690107966, "kl": 0.404296875, "learning_rate": 2.1467503308524097e-07, "loss": 0.0004, "reward": 1.3463541939854622, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.34635417722165585, "rewards/format_reward_func": 1.0, "step": 784 }, { "completion_length": 54.578126430511475, "epoch": 0.4192, "grad_norm": 0.06789689486915826, "kl": 0.4388427734375, "learning_rate": 2.074395073891644e-07, "loss": 0.0004, "reward": 1.2734375223517418, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.27343750558793545, "rewards/format_reward_func": 1.0, "step": 786 }, { "completion_length": 54.33073043823242, "epoch": 0.4202666666666667, "grad_norm": 0.037115328774145785, "kl": 0.3868408203125, "learning_rate": 2.0032273622128784e-07, "loss": 0.0004, "reward": 1.4114583805203438, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.4114583469927311, "rewards/format_reward_func": 1.0, "step": 788 }, { "completion_length": 53.94791841506958, "epoch": 0.42133333333333334, "grad_norm": 0.0543586190371099, "kl": 0.405517578125, "learning_rate": 1.9332508823003193e-07, "loss": 0.0004, "reward": 1.304687537252903, "reward_std": 0.022097086999565363, "rewards/equation_reward_func": 0.30468750931322575, "rewards/format_reward_func": 1.0, "step": 790 }, { "completion_length": 55.18489718437195, "epoch": 0.4224, "grad_norm": 0.07317121726857033, "kl": 0.3917236328125, "learning_rate": 1.864469258932397e-07, "loss": 0.0004, "reward": 1.1979166865348816, "reward_std": 0.011135885491967201, "rewards/equation_reward_func": 0.1979166716337204, "rewards/format_reward_func": 1.0, "step": 792 }, { "completion_length": 53.94010519981384, "epoch": 0.42346666666666666, "grad_norm": 0.04547407129462169, "kl": 0.4234619140625, "learning_rate": 1.7968860549940513e-07, "loss": 0.0004, "reward": 1.296875037252903, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.29687500814907253, "rewards/format_reward_func": 1.0, "step": 794 }, { "completion_length": 55.281251192092896, "epoch": 0.4245333333333333, "grad_norm": 0.0012507847797499818, "kl": 0.3968505859375, "learning_rate": 1.730504771292138e-07, "loss": 0.0004, "reward": 1.3098958656191826, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.30989584140479565, "rewards/format_reward_func": 1.0, "step": 796 }, { "completion_length": 56.119793176651, "epoch": 0.4256, "grad_norm": 0.04071217976203216, "kl": 0.5706787109375, "learning_rate": 1.6653288463741064e-07, "loss": 0.0006, "reward": 1.2708333656191826, "reward_std": 0.02779192989692092, "rewards/equation_reward_func": 0.27083334140479565, "rewards/format_reward_func": 1.0, "step": 798 }, { "completion_length": 53.859376430511475, "epoch": 0.4266666666666667, "grad_norm": 0.06763443269753186, "kl": 0.391357421875, "learning_rate": 1.6013616563498703e-07, "loss": 0.0004, "reward": 1.2526041939854622, "reward_std": 0.033232972491532564, "rewards/equation_reward_func": 0.252604172565043, "rewards/format_reward_func": 1.0, "step": 800 }, { "completion_length": 54.54427146911621, "epoch": 0.42773333333333335, "grad_norm": 0.0017954572841376304, "kl": 0.390380859375, "learning_rate": 1.5386065147169394e-07, "loss": 0.0004, "reward": 1.1875000223517418, "reward_std": 0.0, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 1.0, "step": 802 }, { "completion_length": 54.35677242279053, "epoch": 0.4288, "grad_norm": 0.0011393545202177705, "kl": 0.3603515625, "learning_rate": 1.4770666721887622e-07, "loss": 0.0004, "reward": 1.263020858168602, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.2630208395421505, "rewards/format_reward_func": 1.0, "step": 804 }, { "completion_length": 54.557293176651, "epoch": 0.4298666666666667, "grad_norm": 0.003814435991371691, "kl": 0.413818359375, "learning_rate": 1.4167453165263494e-07, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.0, "rewards/equation_reward_func": 0.27083334140479565, "rewards/format_reward_func": 1.0, "step": 806 }, { "completion_length": 54.85677194595337, "epoch": 0.43093333333333333, "grad_norm": 0.04307861790650641, "kl": 0.3704833984375, "learning_rate": 1.3576455723731646e-07, "loss": 0.0004, "reward": 1.1953125223517418, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.19531250512227416, "rewards/format_reward_func": 1.0, "step": 808 }, { "completion_length": 54.367188692092896, "epoch": 0.432, "grad_norm": 0.0030659931806077878, "kl": 0.4273681640625, "learning_rate": 1.2997705010932394e-07, "loss": 0.0004, "reward": 1.351562537252903, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.3541666753590107, "rewards/format_reward_func": 0.9973958358168602, "step": 810 }, { "completion_length": 54.74479293823242, "epoch": 0.43306666666666666, "grad_norm": 0.008077285315027324, "kl": 0.4517822265625, "learning_rate": 1.2431231006126004e-07, "loss": 0.0005, "reward": 1.208333358168602, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.20833334024064243, "rewards/format_reward_func": 1.0, "step": 812 }, { "completion_length": 55.04948091506958, "epoch": 0.4341333333333333, "grad_norm": 0.006939965710335588, "kl": 0.41943359375, "learning_rate": 1.1877063052639914e-07, "loss": 0.0004, "reward": 1.333333358168602, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.3333333395421505, "rewards/format_reward_func": 1.0, "step": 814 }, { "completion_length": 54.250001192092896, "epoch": 0.4352, "grad_norm": 0.0408897494432558, "kl": 0.4183349609375, "learning_rate": 1.133522985634869e-07, "loss": 0.0004, "reward": 1.1718750223517418, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.17187500558793545, "rewards/format_reward_func": 1.0, "step": 816 }, { "completion_length": 55.30208444595337, "epoch": 0.4362666666666667, "grad_norm": 0.04841400759680108, "kl": 0.426513671875, "learning_rate": 1.0805759484186995e-07, "loss": 0.0004, "reward": 1.2968750223517418, "reward_std": 0.03287936141714454, "rewards/equation_reward_func": 0.29687500558793545, "rewards/format_reward_func": 1.0, "step": 818 }, { "completion_length": 53.28385519981384, "epoch": 0.43733333333333335, "grad_norm": 0.0036641619469083408, "kl": 0.5653076171875, "learning_rate": 1.0288679362695786e-07, "loss": 0.0006, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.0, "step": 820 }, { "completion_length": 53.54427194595337, "epoch": 0.4384, "grad_norm": 0.0031091171525429277, "kl": 0.4061279296875, "learning_rate": 9.78401627660161e-08, "loss": 0.0004, "reward": 1.390625037252903, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.39062500931322575, "rewards/format_reward_func": 1.0, "step": 822 }, { "completion_length": 54.44791865348816, "epoch": 0.43946666666666667, "grad_norm": 0.05627304257780898, "kl": 0.38427734375, "learning_rate": 9.291796367429107e-08, "loss": 0.0004, "reward": 1.3229167014360428, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.3229166753590107, "rewards/format_reward_func": 1.0, "step": 824 }, { "completion_length": 56.171875953674316, "epoch": 0.44053333333333333, "grad_norm": 0.00314713591457346, "kl": 0.4090576171875, "learning_rate": 8.812045132147007e-08, "loss": 0.0004, "reward": 1.3125000298023224, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.3151041753590107, "rewards/format_reward_func": 0.9973958358168602, "step": 826 }, { "completion_length": 54.82291769981384, "epoch": 0.4416, "grad_norm": 0.044880125038787855, "kl": 0.403564453125, "learning_rate": 8.344787421847216e-08, "loss": 0.0004, "reward": 1.3593750298023224, "reward_std": 0.025867276825010777, "rewards/equation_reward_func": 0.359375006519258, "rewards/format_reward_func": 1.0, "step": 828 }, { "completion_length": 53.61458420753479, "epoch": 0.44266666666666665, "grad_norm": 0.06716771072814452, "kl": 0.385009765625, "learning_rate": 7.890047440457683e-08, "loss": 0.0004, "reward": 1.328125037252903, "reward_std": 0.03401931095868349, "rewards/equation_reward_func": 0.3281250111758709, "rewards/format_reward_func": 1.0, "step": 830 }, { "completion_length": 53.98177218437195, "epoch": 0.4437333333333333, "grad_norm": 0.004116626418940948, "kl": 0.38818359375, "learning_rate": 7.447848743488555e-08, "loss": 0.0004, "reward": 1.3958333805203438, "reward_std": 0.0, "rewards/equation_reward_func": 0.39583334513008595, "rewards/format_reward_func": 1.0, "step": 832 }, { "completion_length": 53.76302242279053, "epoch": 0.4448, "grad_norm": 0.10874889880406396, "kl": 0.402587890625, "learning_rate": 7.01821423681201e-08, "loss": 0.0004, "reward": 1.2604166939854622, "reward_std": 0.024375352542847395, "rewards/equation_reward_func": 0.26041667349636555, "rewards/format_reward_func": 1.0, "step": 834 }, { "completion_length": 53.890626430511475, "epoch": 0.4458666666666667, "grad_norm": 0.0017043652569705776, "kl": 0.3685302734375, "learning_rate": 6.601166175475793e-08, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.0, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 1.0, "step": 836 }, { "completion_length": 54.171876192092896, "epoch": 0.44693333333333335, "grad_norm": 0.08661319478540903, "kl": 0.405029296875, "learning_rate": 6.196726162550292e-08, "loss": 0.0004, "reward": 1.3203125223517418, "reward_std": 0.026653615292161703, "rewards/equation_reward_func": 0.32031250558793545, "rewards/format_reward_func": 1.0, "step": 838 }, { "completion_length": 55.768230676651, "epoch": 0.448, "grad_norm": 0.06635253420171656, "kl": 0.375244140625, "learning_rate": 5.804915148009571e-08, "loss": 0.0004, "reward": 1.2578125298023224, "reward_std": 0.018501579761505127, "rewards/equation_reward_func": 0.2578125074505806, "rewards/format_reward_func": 1.0, "step": 840 }, { "completion_length": 55.406251192092896, "epoch": 0.44906666666666667, "grad_norm": 0.04859645028098945, "kl": 0.4815673828125, "learning_rate": 5.425753427646257e-08, "loss": 0.0005, "reward": 1.3281250298023224, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.3307291753590107, "rewards/format_reward_func": 0.9973958358168602, "step": 842 }, { "completion_length": 54.93489694595337, "epoch": 0.45013333333333333, "grad_norm": 0.039381161804116646, "kl": 0.3951416015625, "learning_rate": 5.059260642020003e-08, "loss": 0.0004, "reward": 1.200520858168602, "reward_std": 0.010782274417579174, "rewards/equation_reward_func": 0.2005208395421505, "rewards/format_reward_func": 1.0, "step": 844 }, { "completion_length": 53.166668176651, "epoch": 0.4512, "grad_norm": 0.0025859849261834966, "kl": 0.4044189453125, "learning_rate": 4.705455775440237e-08, "loss": 0.0004, "reward": 1.312500037252903, "reward_std": 0.0, "rewards/equation_reward_func": 0.31250000931322575, "rewards/format_reward_func": 1.0, "step": 846 }, { "completion_length": 55.10677170753479, "epoch": 0.45226666666666665, "grad_norm": 0.07462454881512422, "kl": 0.416748046875, "learning_rate": 4.364357154982846e-08, "loss": 0.0004, "reward": 1.3072917014360428, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.30729167722165585, "rewards/format_reward_func": 1.0, "step": 848 }, { "completion_length": 54.445313692092896, "epoch": 0.4533333333333333, "grad_norm": 0.0038679759871174425, "kl": 0.3973388671875, "learning_rate": 4.035982449540676e-08, "loss": 0.0004, "reward": 1.2343750298023224, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.2343750069849193, "rewards/format_reward_func": 1.0, "step": 850 }, { "completion_length": 54.367188453674316, "epoch": 0.4544, "grad_norm": 0.050692638092982675, "kl": 0.378173828125, "learning_rate": 3.7203486689083857e-08, "loss": 0.0004, "reward": 1.3177083656191826, "reward_std": 0.009643959812819958, "rewards/equation_reward_func": 0.31770834093913436, "rewards/format_reward_func": 1.0, "step": 852 }, { "completion_length": 54.268229961395264, "epoch": 0.4554666666666667, "grad_norm": 0.05211757526577857, "kl": 0.4293212890625, "learning_rate": 3.4174721629013364e-08, "loss": 0.0004, "reward": 1.3151042014360428, "reward_std": 0.020426234230399132, "rewards/equation_reward_func": 0.3151041753590107, "rewards/format_reward_func": 1.0, "step": 854 }, { "completion_length": 52.71354341506958, "epoch": 0.45653333333333335, "grad_norm": 0.04287081204880476, "kl": 0.40771484375, "learning_rate": 3.1273686205086084e-08, "loss": 0.0004, "reward": 1.453125037252903, "reward_std": 0.014731391333043575, "rewards/equation_reward_func": 0.4531250111758709, "rewards/format_reward_func": 1.0, "step": 856 }, { "completion_length": 53.80208468437195, "epoch": 0.4576, "grad_norm": 0.0019349493196587919, "kl": 0.412353515625, "learning_rate": 2.850053069080344e-08, "loss": 0.0004, "reward": 1.3541667088866234, "reward_std": 0.0, "rewards/equation_reward_func": 0.35416667722165585, "rewards/format_reward_func": 1.0, "step": 858 }, { "completion_length": 55.55989718437195, "epoch": 0.45866666666666667, "grad_norm": 0.04114322595002585, "kl": 0.3990478515625, "learning_rate": 2.5855398735493697e-08, "loss": 0.0004, "reward": 1.169270858168602, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.16927083837799728, "rewards/format_reward_func": 1.0, "step": 860 }, { "completion_length": 54.25000190734863, "epoch": 0.4597333333333333, "grad_norm": 0.0043589387503332605, "kl": 0.3883056640625, "learning_rate": 2.3338427356870972e-08, "loss": 0.0004, "reward": 1.2916667014360428, "reward_std": 0.0, "rewards/equation_reward_func": 0.2916666753590107, "rewards/format_reward_func": 1.0, "step": 862 }, { "completion_length": 54.588543176651, "epoch": 0.4608, "grad_norm": 0.015302715345719392, "kl": 0.4150390625, "learning_rate": 2.094974693393731e-08, "loss": 0.0004, "reward": 1.3828125223517418, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.38281250558793545, "rewards/format_reward_func": 1.0, "step": 864 }, { "completion_length": 54.203126430511475, "epoch": 0.46186666666666665, "grad_norm": 0.04209690961750703, "kl": 0.394775390625, "learning_rate": 1.8689481200228064e-08, "loss": 0.0004, "reward": 1.3932292014360428, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.39322917722165585, "rewards/format_reward_func": 1.0, "step": 866 }, { "completion_length": 54.729167461395264, "epoch": 0.4629333333333333, "grad_norm": 0.04001888502432887, "kl": 0.446044921875, "learning_rate": 1.6557747237405108e-08, "loss": 0.0004, "reward": 1.3255208656191826, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.32552084140479565, "rewards/format_reward_func": 1.0, "step": 868 }, { "completion_length": 54.47135543823242, "epoch": 0.464, "grad_norm": 0.03102184080615482, "kl": 0.3768310546875, "learning_rate": 1.4554655469189438e-08, "loss": 0.0004, "reward": 1.2994792014360428, "reward_std": 0.020426234230399132, "rewards/equation_reward_func": 0.29947917722165585, "rewards/format_reward_func": 1.0, "step": 870 }, { "completion_length": 53.976563930511475, "epoch": 0.4650666666666667, "grad_norm": 0.04436906557747848, "kl": 0.4132080078125, "learning_rate": 1.2680309655642431e-08, "loss": 0.0004, "reward": 1.4062500298023224, "reward_std": 0.02946278266608715, "rewards/equation_reward_func": 0.40625000931322575, "rewards/format_reward_func": 1.0, "step": 872 }, { "completion_length": 54.822918176651, "epoch": 0.46613333333333334, "grad_norm": 0.10872944527393583, "kl": 0.821533203125, "learning_rate": 1.0934806887791805e-08, "loss": 0.0008, "reward": 1.2994791939854622, "reward_std": 0.056116399355232716, "rewards/equation_reward_func": 0.29947917349636555, "rewards/format_reward_func": 1.0, "step": 874 }, { "completion_length": 54.351563692092896, "epoch": 0.4672, "grad_norm": 0.09510209295062413, "kl": 0.4007568359375, "learning_rate": 9.318237582600088e-09, "loss": 0.0004, "reward": 1.2734375298023224, "reward_std": 0.04287693230435252, "rewards/equation_reward_func": 0.27343750838190317, "rewards/format_reward_func": 1.0, "step": 876 }, { "completion_length": 53.77083492279053, "epoch": 0.46826666666666666, "grad_norm": 0.029324852760745777, "kl": 0.441650390625, "learning_rate": 7.830685478283362e-09, "loss": 0.0004, "reward": 1.4140625298023224, "reward_std": 0.007365695666521788, "rewards/equation_reward_func": 0.4140625074505806, "rewards/format_reward_func": 1.0, "step": 878 }, { "completion_length": 55.05729293823242, "epoch": 0.4693333333333333, "grad_norm": 0.0017378658055886994, "kl": 0.40625, "learning_rate": 6.472227629972238e-09, "loss": 0.0004, "reward": 1.3151042014360428, "reward_std": 0.020426234230399132, "rewards/equation_reward_func": 0.3151041753590107, "rewards/format_reward_func": 1.0, "step": 880 }, { "completion_length": 55.25000071525574, "epoch": 0.4704, "grad_norm": 0.06502816647873647, "kl": 0.3956298828125, "learning_rate": 5.242934405720879e-09, "loss": 0.0004, "reward": 1.281250037252903, "reward_std": 0.01814797008410096, "rewards/equation_reward_func": 0.28125000931322575, "rewards/format_reward_func": 1.0, "step": 882 }, { "completion_length": 54.898438453674316, "epoch": 0.47146666666666665, "grad_norm": 0.0035887430503341697, "kl": 0.39990234375, "learning_rate": 4.142869482861578e-09, "loss": 0.0004, "reward": 1.208333358168602, "reward_std": 0.0, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 1.0, "step": 884 }, { "completion_length": 55.06770920753479, "epoch": 0.47253333333333336, "grad_norm": 0.0058684324019355055, "kl": 0.4083251953125, "learning_rate": 3.1720898447074043e-09, "loss": 0.0004, "reward": 1.1875000223517418, "reward_std": 0.0, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 1.0, "step": 886 }, { "completion_length": 56.09895920753479, "epoch": 0.4736, "grad_norm": 0.002873055504777843, "kl": 0.460693359375, "learning_rate": 2.330645777598173e-09, "loss": 0.0005, "reward": 1.1875000223517418, "reward_std": 0.0, "rewards/equation_reward_func": 0.18750000558793545, "rewards/format_reward_func": 1.0, "step": 888 }, { "completion_length": 55.10416769981384, "epoch": 0.4746666666666667, "grad_norm": 0.0024766498773939718, "kl": 0.3994140625, "learning_rate": 1.6185808682986358e-09, "loss": 0.0004, "reward": 1.2265625298023224, "reward_std": 0.017009655479341745, "rewards/equation_reward_func": 0.22656250628642738, "rewards/format_reward_func": 1.0, "step": 890 }, { "completion_length": 54.24479269981384, "epoch": 0.47573333333333334, "grad_norm": 0.006620981245308639, "kl": 0.4544677734375, "learning_rate": 1.035932001737794e-09, "loss": 0.0005, "reward": 1.3750000447034836, "reward_std": 0.0, "rewards/equation_reward_func": 0.3750000111758709, "rewards/format_reward_func": 1.0, "step": 892 }, { "completion_length": 55.549480676651, "epoch": 0.4768, "grad_norm": 0.004275301036612241, "kl": 0.423828125, "learning_rate": 5.827293591006978e-10, "loss": 0.0004, "reward": 1.2578125298023224, "reward_std": 0.026653615292161703, "rewards/equation_reward_func": 0.2604166753590107, "rewards/format_reward_func": 0.9973958358168602, "step": 894 }, { "completion_length": 54.68229293823242, "epoch": 0.47786666666666666, "grad_norm": 0.001788071618968572, "kl": 0.419677734375, "learning_rate": 2.58996416263313e-10, "loss": 0.0004, "reward": 1.2708333656191826, "reward_std": 0.0, "rewards/equation_reward_func": 0.27083334140479565, "rewards/format_reward_func": 1.0, "step": 896 }, { "completion_length": 53.992188692092896, "epoch": 0.4789333333333333, "grad_norm": 0.05393770700308795, "kl": 0.3878173828125, "learning_rate": 6.474994257682499e-11, "loss": 0.0004, "reward": 1.2526042014360428, "reward_std": 0.02551366575062275, "rewards/equation_reward_func": 0.2526041753590107, "rewards/format_reward_func": 1.0, "step": 898 }, { "completion_length": 55.02864718437195, "epoch": 0.48, "grad_norm": 0.0021273783690601037, "kl": 0.4002685546875, "learning_rate": 0.0, "loss": 0.0004, "reward": 1.208333358168602, "reward_std": 0.0, "rewards/equation_reward_func": 0.2083333395421505, "rewards/format_reward_func": 1.0, "step": 900 }, { "epoch": 0.48, "step": 900, "total_flos": 0.0, "train_loss": 0.08691239206201022, "train_runtime": 9457.957, "train_samples_per_second": 2.284, "train_steps_per_second": 0.095 } ], "logging_steps": 2, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }