File size: 21,978 Bytes
2e3a075
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1568627450980392,
  "eval_steps": 500,
  "global_step": 30,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "accuracy_delta": -0.03125,
      "baseline_accuracy": 0.5625,
      "completion_length": 1660.1953125,
      "degradation_rate": 0.1875,
      "epoch": 0.00522875816993464,
      "grad_norm": 1.8285036167620026,
      "improvement_rate": 0.15625,
      "kl": 0.0,
      "learning_rate": 2e-06,
      "loss": 0.0,
      "reward": 0.24004681408405304,
      "reward_std": 0.25635848194360733,
      "rewards/AdaptiveTeachingReward": 0.24004681408405304,
      "step": 1,
      "student_accuracy": 0.53125,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1349.5,
      "teaching_length_std": 1533.2601122433693,
      "token_efficiency": 0.01778511088825826
    },
    {
      "accuracy_delta": -0.21875,
      "baseline_accuracy": 1.0,
      "completion_length": 2174.171875,
      "degradation_rate": 0.21875,
      "epoch": 0.01045751633986928,
      "grad_norm": 1.2225533588422857,
      "improvement_rate": 0.0,
      "kl": 0.002572178840637207,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2656950503587723,
      "reward_std": 0.17467603832483292,
      "rewards/AdaptiveTeachingReward": 0.2656950503587723,
      "step": 2,
      "student_accuracy": 0.78125,
      "student_approach_length": 489.875,
      "teaching_length_mean": 1645.90625,
      "teaching_length_std": 1777.2254866782437,
      "token_efficiency": 0.016061369340544678
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.0625,
      "completion_length": 2889.65625,
      "degradation_rate": 0.0625,
      "epoch": 0.01568627450980392,
      "grad_norm": 0.4912196165940296,
      "improvement_rate": 0.0,
      "kl": 0.0021309852600097656,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.1566198617219925,
      "reward_std": 0.17452973127365112,
      "rewards/AdaptiveTeachingReward": 0.1566198617219925,
      "step": 3,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3435.71875,
      "teaching_length_std": 874.6175996443687,
      "token_efficiency": 0.005029539554335019
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 1765.4765625,
      "degradation_rate": 0.0,
      "epoch": 0.02091503267973856,
      "grad_norm": 0.014919439029800962,
      "improvement_rate": 0.0,
      "kl": 0.0024797916412353516,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/AdaptiveTeachingReward": 0.0,
      "step": 4,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1644.53125,
      "teaching_length_std": 1544.9233390329066,
      "token_efficiency": 0.0
    },
    {
      "accuracy_delta": 0.1875,
      "baseline_accuracy": 0.375,
      "completion_length": 2330.296875,
      "degradation_rate": 0.09375,
      "epoch": 0.026143790849673203,
      "grad_norm": 1.6480657403271404,
      "improvement_rate": 0.28125,
      "kl": 0.002542257308959961,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.293629452586174,
      "reward_std": 0.31043318659067154,
      "rewards/AdaptiveTeachingReward": 0.293629452586174,
      "step": 5,
      "student_accuracy": 0.5625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2543.6875,
      "teaching_length_std": 1157.76463240177,
      "token_efficiency": 0.015775285003662067
    },
    {
      "accuracy_delta": -0.125,
      "baseline_accuracy": 0.375,
      "completion_length": 2799.2265625,
      "degradation_rate": 0.21875,
      "epoch": 0.03137254901960784,
      "grad_norm": 1.3626772466174568,
      "improvement_rate": 0.09375,
      "kl": 0.0024237632751464844,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.297846183180809,
      "reward_std": 0.1792445182800293,
      "rewards/AdaptiveTeachingReward": 0.297846183180809,
      "step": 6,
      "student_accuracy": 0.25,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3300.46875,
      "teaching_length_std": 1322.463354762313,
      "token_efficiency": 0.008984860526379333
    },
    {
      "accuracy_delta": 0.15625,
      "baseline_accuracy": 0.09375,
      "completion_length": 2839.59375,
      "degradation_rate": 0.0,
      "epoch": 0.036601307189542485,
      "grad_norm": 1.0148015671601693,
      "improvement_rate": 0.15625,
      "kl": 0.0022563934326171875,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.22244123369455338,
      "reward_std": 0.32708095014095306,
      "rewards/AdaptiveTeachingReward": 0.22244123369455338,
      "step": 7,
      "student_accuracy": 0.25,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2663.28125,
      "teaching_length_std": 1263.471554191286,
      "token_efficiency": 0.009322577760442718
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.34375,
      "completion_length": 2997.0234375,
      "degradation_rate": 0.0625,
      "epoch": 0.04183006535947712,
      "grad_norm": 1.0933058629769363,
      "improvement_rate": 0.09375,
      "kl": 0.00222015380859375,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.4145798534154892,
      "reward_std": 0.348370686173439,
      "rewards/AdaptiveTeachingReward": 0.4145798534154892,
      "step": 8,
      "student_accuracy": 0.375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2859.9375,
      "teaching_length_std": 1548.3925018635746,
      "token_efficiency": 0.014895284304358753
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2069.3515625,
      "degradation_rate": 0.0,
      "epoch": 0.047058823529411764,
      "grad_norm": 0.6907373861546955,
      "improvement_rate": 0.0,
      "kl": 0.0022356510162353516,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.13787749409675598,
      "reward_std": 0.08163860440254211,
      "rewards/AdaptiveTeachingReward": 0.13787749409675598,
      "step": 9,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1586.78125,
      "teaching_length_std": 1393.5538468081056,
      "token_efficiency": 0.009196431155361413
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2533.3203125,
      "degradation_rate": 0.0,
      "epoch": 0.05228758169934641,
      "grad_norm": 0.9569337200656036,
      "improvement_rate": 0.0,
      "kl": 0.002455472946166992,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2650887817144394,
      "reward_std": 0.1834174394607544,
      "rewards/AdaptiveTeachingReward": 0.2650887817144394,
      "step": 10,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2718.53125,
      "teaching_length_std": 1545.6357044636015,
      "token_efficiency": 0.00983657689669804
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2366.4609375,
      "degradation_rate": 0.0,
      "epoch": 0.05751633986928104,
      "grad_norm": 0.1992844149700769,
      "improvement_rate": 0.0,
      "kl": 0.0023772716522216797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.00799931213259697,
      "reward_std": 0.045250944793224335,
      "rewards/AdaptiveTeachingReward": 0.00799931213259697,
      "step": 11,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2560.28125,
      "teaching_length_std": 1264.0513369895032,
      "token_efficiency": 0.000244351732796639
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2429.25,
      "degradation_rate": 0.0,
      "epoch": 0.06274509803921569,
      "grad_norm": 0.02438642631727396,
      "improvement_rate": 0.0,
      "kl": 0.0028073787689208984,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/AdaptiveTeachingReward": 0.0,
      "step": 12,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2684.0,
      "teaching_length_std": 1537.9383137400418,
      "token_efficiency": 0.0
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.03125,
      "completion_length": 2985.0234375,
      "degradation_rate": 0.03125,
      "epoch": 0.06797385620915032,
      "grad_norm": 1.9377263264415499,
      "improvement_rate": 0.0625,
      "kl": 0.002295255661010742,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3229658156633377,
      "reward_std": 0.2836003005504608,
      "rewards/AdaptiveTeachingReward": 0.3229658156633377,
      "step": 13,
      "student_accuracy": 0.0625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2764.78125,
      "teaching_length_std": 1359.6938149421062,
      "token_efficiency": 0.011864912871618307
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.40625,
      "completion_length": 2860.7890625,
      "degradation_rate": 0.03125,
      "epoch": 0.07320261437908497,
      "grad_norm": 0.7397348398480618,
      "improvement_rate": 0.03125,
      "kl": 0.0022339820861816406,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3183840811252594,
      "reward_std": 0.2730633243918419,
      "rewards/AdaptiveTeachingReward": 0.3183840811252594,
      "step": 14,
      "student_accuracy": 0.40625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3300.75,
      "teaching_length_std": 1258.5522551939885,
      "token_efficiency": 0.009677591351820222
    },
    {
      "accuracy_delta": -0.25,
      "baseline_accuracy": 0.59375,
      "completion_length": 2667.375,
      "degradation_rate": 0.375,
      "epoch": 0.0784313725490196,
      "grad_norm": 0.591091766464445,
      "improvement_rate": 0.125,
      "kl": 0.0020873546600341797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.14508739858865738,
      "reward_std": 0.1915995106101036,
      "rewards/AdaptiveTeachingReward": 0.14508739858865738,
      "step": 15,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2580.1875,
      "teaching_length_std": 1411.9177183341415,
      "token_efficiency": 0.006345218750950054
    },
    {
      "accuracy_delta": -0.125,
      "baseline_accuracy": 0.78125,
      "completion_length": 2397.8046875,
      "degradation_rate": 0.28125,
      "epoch": 0.08366013071895424,
      "grad_norm": 1.0467841728187868,
      "improvement_rate": 0.15625,
      "kl": 0.0020101070404052734,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.29722827672958374,
      "reward_std": 0.29581306129693985,
      "rewards/AdaptiveTeachingReward": 0.29722827672958374,
      "step": 16,
      "student_accuracy": 0.65625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1867.65625,
      "teaching_length_std": 1645.2406135166107,
      "token_efficiency": 0.016315762169946447
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.4375,
      "completion_length": 2681.4296875,
      "degradation_rate": 0.0625,
      "epoch": 0.08888888888888889,
      "grad_norm": 1.0178368263353954,
      "improvement_rate": 0.0625,
      "kl": 0.0022170543670654297,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.27442795038223267,
      "reward_std": 0.24267160892486572,
      "rewards/AdaptiveTeachingReward": 0.27442795038223267,
      "step": 17,
      "student_accuracy": 0.4375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1946.4375,
      "teaching_length_std": 1595.4173133278073,
      "token_efficiency": 0.01725690617086827
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2260.1484375,
      "degradation_rate": 0.0,
      "epoch": 0.09411764705882353,
      "grad_norm": 0.39094525161531923,
      "improvement_rate": 0.0,
      "kl": 0.0026335716247558594,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.15987491607666016,
      "reward_std": 0.12795361876487732,
      "rewards/AdaptiveTeachingReward": 0.15987491607666016,
      "step": 18,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2452.8125,
      "teaching_length_std": 1687.6677585883976,
      "token_efficiency": 0.00733118954839666
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.34375,
      "completion_length": 2509.359375,
      "degradation_rate": 0.0,
      "epoch": 0.09934640522875816,
      "grad_norm": 1.112027848483532,
      "improvement_rate": 0.09375,
      "kl": 0.0021845102310180664,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.32069824635982513,
      "reward_std": 0.1779022440314293,
      "rewards/AdaptiveTeachingReward": 0.32069824635982513,
      "step": 19,
      "student_accuracy": 0.4375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1688.25,
      "teaching_length_std": 1591.1127810270257,
      "token_efficiency": 0.02000900419695598
    },
    {
      "accuracy_delta": 0.15625,
      "baseline_accuracy": 0.25,
      "completion_length": 2566.109375,
      "degradation_rate": 0.125,
      "epoch": 0.10457516339869281,
      "grad_norm": 0.635668741879516,
      "improvement_rate": 0.28125,
      "kl": 0.0020592212677001953,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3237999305129051,
      "reward_std": 0.29250405728816986,
      "rewards/AdaptiveTeachingReward": 0.3237999305129051,
      "step": 20,
      "student_accuracy": 0.40625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3000.8125,
      "teaching_length_std": 1146.9664495361749,
      "token_efficiency": 0.009248973352977438
    },
    {
      "accuracy_delta": -0.15625,
      "baseline_accuracy": 0.5,
      "completion_length": 2821.34375,
      "degradation_rate": 0.15625,
      "epoch": 0.10980392156862745,
      "grad_norm": 1.038632983891241,
      "improvement_rate": 0.0,
      "kl": 0.0022677183151245117,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.26167380064725876,
      "reward_std": 0.20054005086421967,
      "rewards/AdaptiveTeachingReward": 0.26167380064725876,
      "step": 21,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2324.625,
      "teaching_length_std": 1783.1465396131703,
      "token_efficiency": 0.011391752427342916
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.6875,
      "completion_length": 1981.4453125,
      "degradation_rate": 0.21875,
      "epoch": 0.11503267973856209,
      "grad_norm": 1.0591629807323937,
      "improvement_rate": 0.15625,
      "kl": 0.0025501251220703125,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3476671576499939,
      "reward_std": 0.32001765072345734,
      "rewards/AdaptiveTeachingReward": 0.3476671576499939,
      "step": 22,
      "student_accuracy": 0.625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 1747.6875,
      "teaching_length_std": 1544.1982674617702,
      "token_efficiency": 0.019930595836054183
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.6875,
      "completion_length": 2491.2109375,
      "degradation_rate": 0.15625,
      "epoch": 0.12026143790849673,
      "grad_norm": 0.8736384977021919,
      "improvement_rate": 0.15625,
      "kl": 0.002077817916870117,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.3966591954231262,
      "reward_std": 0.35394637286663055,
      "rewards/AdaptiveTeachingReward": 0.3966591954231262,
      "step": 23,
      "student_accuracy": 0.6875,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2244.40625,
      "teaching_length_std": 1464.1512714006012,
      "token_efficiency": 0.020170202633726
    },
    {
      "accuracy_delta": -0.0625,
      "baseline_accuracy": 0.40625,
      "completion_length": 3183.6796875,
      "degradation_rate": 0.09375,
      "epoch": 0.12549019607843137,
      "grad_norm": 0.731937384244321,
      "improvement_rate": 0.03125,
      "kl": 0.002083301544189453,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.24638524651527405,
      "reward_std": 0.25337880849838257,
      "rewards/AdaptiveTeachingReward": 0.24638524651527405,
      "step": 24,
      "student_accuracy": 0.34375,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2833.75,
      "teaching_length_std": 883.8529811162587,
      "token_efficiency": 0.0154971457828618
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.09375,
      "completion_length": 2996.8203125,
      "degradation_rate": 0.0625,
      "epoch": 0.13071895424836602,
      "grad_norm": 1.0366727211421645,
      "improvement_rate": 0.15625,
      "kl": 0.002071857452392578,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.10002126544713974,
      "reward_std": 0.12362907081842422,
      "rewards/AdaptiveTeachingReward": 0.10002126544713974,
      "step": 25,
      "student_accuracy": 0.1875,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2241.40625,
      "teaching_length_std": 1680.2583046873353,
      "token_efficiency": 0.005314979233325259
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.125,
      "completion_length": 2699.5078125,
      "degradation_rate": 0.125,
      "epoch": 0.13594771241830064,
      "grad_norm": 0.8849958564728577,
      "improvement_rate": 0.15625,
      "kl": 0.00244140625,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.21003766357898712,
      "reward_std": 0.27024491131305695,
      "rewards/AdaptiveTeachingReward": 0.21003766357898712,
      "step": 26,
      "student_accuracy": 0.15625,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2865.75,
      "teaching_length_std": 1575.6950824020187,
      "token_efficiency": 0.00747702067329278
    },
    {
      "accuracy_delta": 0.09375,
      "baseline_accuracy": 0.0,
      "completion_length": 2216.78125,
      "degradation_rate": 0.0,
      "epoch": 0.1411764705882353,
      "grad_norm": 0.6451950468247998,
      "improvement_rate": 0.09375,
      "kl": 0.0021767616271972656,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.2089657336473465,
      "reward_std": 0.21514078974723816,
      "rewards/AdaptiveTeachingReward": 0.2089657336473465,
      "step": 27,
      "student_accuracy": 0.09375,
      "student_approach_length": 499.96875,
      "teaching_length_mean": 2619.40625,
      "teaching_length_std": 1265.4369118665845,
      "token_efficiency": 0.0060904088353781515
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2668.4140625,
      "degradation_rate": 0.0,
      "epoch": 0.14640522875816994,
      "grad_norm": 0.7896767066978999,
      "improvement_rate": 0.0,
      "kl": 0.002083301544189453,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.14528799057006836,
      "reward_std": 0.09566954523324966,
      "rewards/AdaptiveTeachingReward": 0.14528799057006836,
      "step": 28,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 3113.90625,
      "teaching_length_std": 884.9974614855895,
      "token_efficiency": 0.0035470700822770596
    },
    {
      "accuracy_delta": 0.03125,
      "baseline_accuracy": 0.0,
      "completion_length": 2467.4140625,
      "degradation_rate": 0.0,
      "epoch": 0.15163398692810456,
      "grad_norm": 0.22872066233966626,
      "improvement_rate": 0.03125,
      "kl": 0.0025146007537841797,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.007998689077794552,
      "reward_std": 0.04524742066860199,
      "rewards/AdaptiveTeachingReward": 0.007998689077794552,
      "step": 29,
      "student_accuracy": 0.03125,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2764.9375,
      "teaching_length_std": 1560.6896673437209,
      "token_efficiency": 0.0002591402932910396
    },
    {
      "accuracy_delta": 0.0,
      "baseline_accuracy": 0.0,
      "completion_length": 2854.59375,
      "degradation_rate": 0.0,
      "epoch": 0.1568627450980392,
      "grad_norm": 0.5744778046050318,
      "improvement_rate": 0.0,
      "kl": 0.0022513866424560547,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "reward": 0.16005077958106995,
      "reward_std": 0.10021104663610458,
      "rewards/AdaptiveTeachingReward": 0.16005077958106995,
      "step": 30,
      "student_accuracy": 0.0,
      "student_approach_length": 500.0,
      "teaching_length_mean": 2139.71875,
      "teaching_length_std": 1711.9645016733543,
      "token_efficiency": 0.007949871083127776
    }
  ],
  "logging_steps": 1,
  "max_steps": 250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}