nt-van-khanh commited on
Commit
ec41953
·
verified ·
1 Parent(s): 783a096

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "k_proj",
34
  "v_proj",
 
35
  "down_proj",
36
- "up_proj",
37
- "o_proj",
38
- "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "up_proj",
33
+ "q_proj",
34
  "v_proj",
35
+ "gate_proj",
36
  "down_proj",
37
+ "k_proj",
38
+ "o_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f135b1aa2e390d1c988122717a5ac9ab4020af2cd577111a234cd318d9c653e
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3db570fd27876879cc33103ca8933604745e61f4019c1d836c15c7fe2de9457
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:811119aceb53e5716fa4f3a64d3d5aa323a9f0ec2a7c50f98d41f16f80104bba
3
  size 85728342
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beeda5e00e3f2a251a26931469526d6fefa8f6cc35fc3926826851fecb38c416
3
  size 85728342
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e653f327340c2a2bf9d85813888ec80ce279c9079550355bbe8116334e542c0f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa347da6099cd574f9473f1c0ead501ac849153b19f5aa7b33de856b2d1f19dc
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63ce17dd2c32e1042039dfe648c482c9ff0032ac68df46007019bf1f153ddc3e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bea4c55977be70b1134031e6a8b57e36f8f593b2249c6d9d6b94a16db34cae2
3
  size 1064
trainer_state.json CHANGED
@@ -4,1154 +4,1154 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 115,
7
- "global_step": 572,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.3924612522125244,
14
- "epoch": 0.008741258741258742,
15
- "grad_norm": 0.9623214602470398,
16
  "learning_rate": 8e-05,
17
- "loss": 1.3775,
18
- "mean_token_accuracy": 0.6655579686164856,
19
- "num_tokens": 4268.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 1.3508465528488158,
24
- "epoch": 0.017482517482517484,
25
- "grad_norm": 0.9386249780654907,
26
  "learning_rate": 0.00018,
27
- "loss": 1.1938,
28
- "mean_token_accuracy": 0.7008997738361359,
29
- "num_tokens": 9206.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.4163436055183412,
34
- "epoch": 0.026223776223776224,
35
- "grad_norm": 1.047428011894226,
36
- "learning_rate": 0.00019857651245551604,
37
- "loss": 1.2855,
38
- "mean_token_accuracy": 0.6798348546028137,
39
- "num_tokens": 13655.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.3434713006019592,
44
- "epoch": 0.03496503496503497,
45
- "grad_norm": 0.7252029180526733,
46
- "learning_rate": 0.00019679715302491104,
47
- "loss": 1.2563,
48
- "mean_token_accuracy": 0.6956509709358215,
49
- "num_tokens": 19321.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.1038120150566102,
54
- "epoch": 0.043706293706293704,
55
- "grad_norm": 1.1018619537353516,
56
- "learning_rate": 0.00019501779359430604,
57
- "loss": 1.0152,
58
- "mean_token_accuracy": 0.7334934413433075,
59
- "num_tokens": 24098.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.1559201896190643,
64
- "epoch": 0.05244755244755245,
65
- "grad_norm": 0.7375137209892273,
66
- "learning_rate": 0.0001932384341637011,
67
- "loss": 1.0187,
68
- "mean_token_accuracy": 0.736066859960556,
69
- "num_tokens": 28311.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.251962125301361,
74
- "epoch": 0.06118881118881119,
75
- "grad_norm": 0.9549528956413269,
76
- "learning_rate": 0.0001914590747330961,
77
- "loss": 1.1877,
78
- "mean_token_accuracy": 0.7039336919784546,
79
- "num_tokens": 33549.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.2189816296100617,
84
- "epoch": 0.06993006993006994,
85
- "grad_norm": 0.7660185694694519,
86
- "learning_rate": 0.00018967971530249112,
87
- "loss": 1.1063,
88
- "mean_token_accuracy": 0.714855033159256,
89
- "num_tokens": 38668.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.2883310735225677,
94
- "epoch": 0.07867132867132867,
95
- "grad_norm": 0.9150952696800232,
96
- "learning_rate": 0.00018790035587188612,
97
- "loss": 1.2873,
98
- "mean_token_accuracy": 0.6970557630062103,
99
- "num_tokens": 44197.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.1909499704837798,
104
- "epoch": 0.08741258741258741,
105
- "grad_norm": 0.6662327647209167,
106
- "learning_rate": 0.00018612099644128114,
107
- "loss": 1.1576,
108
- "mean_token_accuracy": 0.7179031908512116,
109
- "num_tokens": 49908.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.3348850965499879,
114
- "epoch": 0.09615384615384616,
115
- "grad_norm": 0.8849563002586365,
116
- "learning_rate": 0.00018434163701067617,
117
- "loss": 1.1603,
118
- "mean_token_accuracy": 0.7022291779518127,
119
- "num_tokens": 54191.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.2066528439521789,
124
- "epoch": 0.1048951048951049,
125
- "grad_norm": 0.8784617781639099,
126
- "learning_rate": 0.0001825622775800712,
127
- "loss": 1.1539,
128
- "mean_token_accuracy": 0.7269383847713471,
129
- "num_tokens": 59562.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.129963719844818,
134
- "epoch": 0.11363636363636363,
135
- "grad_norm": 0.774686336517334,
136
- "learning_rate": 0.0001807829181494662,
137
- "loss": 1.0373,
138
- "mean_token_accuracy": 0.7341378688812256,
139
- "num_tokens": 65018.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.276548171043396,
144
- "epoch": 0.12237762237762238,
145
- "grad_norm": 0.7157500386238098,
146
- "learning_rate": 0.0001790035587188612,
147
- "loss": 1.1742,
148
- "mean_token_accuracy": 0.7058017492294312,
149
- "num_tokens": 70812.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.0836508870124817,
154
- "epoch": 0.13111888111888112,
155
- "grad_norm": 0.7531887292861938,
156
- "learning_rate": 0.00017722419928825625,
157
- "loss": 0.9649,
158
- "mean_token_accuracy": 0.748905599117279,
159
- "num_tokens": 76106.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.0430119216442109,
164
- "epoch": 0.13986013986013987,
165
- "grad_norm": 0.6007382869720459,
166
- "learning_rate": 0.00017544483985765125,
167
- "loss": 0.9445,
168
- "mean_token_accuracy": 0.7515589416027069,
169
- "num_tokens": 81557.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.1489889979362489,
174
- "epoch": 0.1486013986013986,
175
- "grad_norm": 0.7831665277481079,
176
- "learning_rate": 0.00017366548042704627,
177
- "loss": 1.0943,
178
- "mean_token_accuracy": 0.7280911147594452,
179
- "num_tokens": 86442.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.0606273770332337,
184
- "epoch": 0.15734265734265734,
185
- "grad_norm": 0.7512551546096802,
186
- "learning_rate": 0.00017188612099644127,
187
- "loss": 0.9434,
188
- "mean_token_accuracy": 0.7480818212032319,
189
- "num_tokens": 90404.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.1492775142192841,
194
- "epoch": 0.1660839160839161,
195
- "grad_norm": 0.5179319381713867,
196
- "learning_rate": 0.00017010676156583633,
197
- "loss": 1.0696,
198
- "mean_token_accuracy": 0.7424242496490479,
199
- "num_tokens": 97002.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.194596391916275,
204
- "epoch": 0.17482517482517482,
205
- "grad_norm": 0.7215603590011597,
206
- "learning_rate": 0.00016832740213523133,
207
- "loss": 1.1231,
208
- "mean_token_accuracy": 0.7197276711463928,
209
- "num_tokens": 101935.0,
210
  "step": 100
211
  },
212
  {
213
- "entropy": 1.048801952600479,
214
- "epoch": 0.18356643356643357,
215
- "grad_norm": 0.9170930981636047,
216
- "learning_rate": 0.00016654804270462633,
217
- "loss": 0.9718,
218
- "mean_token_accuracy": 0.7438604295253753,
219
- "num_tokens": 107692.0,
220
  "step": 105
221
  },
222
  {
223
- "entropy": 1.2630416095256805,
224
- "epoch": 0.19230769230769232,
225
- "grad_norm": 0.6975880861282349,
226
- "learning_rate": 0.00016476868327402135,
227
- "loss": 1.1672,
228
- "mean_token_accuracy": 0.7042996108531951,
229
- "num_tokens": 113041.0,
230
  "step": 110
231
  },
232
  {
233
- "entropy": 1.170883482694626,
234
- "epoch": 0.20104895104895104,
235
- "grad_norm": 1.2549158334732056,
236
- "learning_rate": 0.00016298932384341638,
237
- "loss": 1.1129,
238
- "mean_token_accuracy": 0.7294944524765015,
239
- "num_tokens": 118065.0,
240
  "step": 115
241
  },
242
  {
243
- "entropy": 1.2757395565509797,
244
- "epoch": 0.2097902097902098,
245
- "grad_norm": 0.7007513046264648,
246
- "learning_rate": 0.0001612099644128114,
247
- "loss": 1.2333,
248
- "mean_token_accuracy": 0.7045675635337829,
249
- "num_tokens": 123502.0,
250
  "step": 120
251
  },
252
  {
253
- "entropy": 1.102766215801239,
254
- "epoch": 0.21853146853146854,
255
- "grad_norm": 0.6966100931167603,
256
- "learning_rate": 0.0001594306049822064,
257
- "loss": 0.9903,
258
- "mean_token_accuracy": 0.7430883646011353,
259
- "num_tokens": 128417.0,
260
  "step": 125
261
  },
262
  {
263
- "entropy": 1.0878133654594422,
264
- "epoch": 0.22727272727272727,
265
- "grad_norm": 0.5765619277954102,
266
- "learning_rate": 0.00015765124555160143,
267
- "loss": 1.0547,
268
- "mean_token_accuracy": 0.7235966801643372,
269
- "num_tokens": 134955.0,
270
  "step": 130
271
  },
272
  {
273
- "entropy": 1.059841650724411,
274
- "epoch": 0.23601398601398602,
275
- "grad_norm": 0.6278873085975647,
276
- "learning_rate": 0.00015587188612099646,
277
- "loss": 1.0116,
278
- "mean_token_accuracy": 0.7435504794120789,
279
- "num_tokens": 140785.0,
280
  "step": 135
281
  },
282
  {
283
- "entropy": 1.0601991772651673,
284
- "epoch": 0.24475524475524477,
285
- "grad_norm": 0.7335526943206787,
286
- "learning_rate": 0.00015409252669039148,
287
- "loss": 0.9323,
288
- "mean_token_accuracy": 0.7478764116764068,
289
- "num_tokens": 145727.0,
290
  "step": 140
291
  },
292
  {
293
- "entropy": 1.1874103128910065,
294
- "epoch": 0.2534965034965035,
295
- "grad_norm": 0.5723336338996887,
296
- "learning_rate": 0.00015231316725978648,
297
- "loss": 1.0757,
298
- "mean_token_accuracy": 0.7186325311660766,
299
- "num_tokens": 151867.0,
300
  "step": 145
301
  },
302
  {
303
- "entropy": 0.9424997448921204,
304
- "epoch": 0.26223776223776224,
305
- "grad_norm": 0.8389629125595093,
306
- "learning_rate": 0.00015053380782918148,
307
- "loss": 0.8844,
308
- "mean_token_accuracy": 0.7715938806533813,
309
- "num_tokens": 155296.0,
310
  "step": 150
311
  },
312
  {
313
- "entropy": 1.0623292565345763,
314
- "epoch": 0.270979020979021,
315
- "grad_norm": 0.7301695942878723,
316
- "learning_rate": 0.00014875444839857654,
317
- "loss": 0.9899,
318
- "mean_token_accuracy": 0.7323084354400635,
319
- "num_tokens": 160654.0,
320
  "step": 155
321
  },
322
  {
323
- "entropy": 1.065491944551468,
324
- "epoch": 0.27972027972027974,
325
- "grad_norm": 0.7877907156944275,
326
- "learning_rate": 0.00014697508896797153,
327
- "loss": 0.9466,
328
- "mean_token_accuracy": 0.7487669110298156,
329
- "num_tokens": 165603.0,
330
  "step": 160
331
  },
332
  {
333
- "entropy": 1.1675564229488373,
334
- "epoch": 0.28846153846153844,
335
- "grad_norm": 0.8203403949737549,
336
- "learning_rate": 0.00014519572953736656,
337
- "loss": 1.0758,
338
- "mean_token_accuracy": 0.7346278429031372,
339
- "num_tokens": 171119.0,
340
  "step": 165
341
  },
342
  {
343
- "entropy": 1.0618612051010132,
344
- "epoch": 0.2972027972027972,
345
- "grad_norm": 0.8218940496444702,
346
- "learning_rate": 0.00014341637010676156,
347
- "loss": 1.0296,
348
- "mean_token_accuracy": 0.7328690826892853,
349
- "num_tokens": 176227.0,
350
  "step": 170
351
  },
352
  {
353
- "entropy": 1.193172001838684,
354
- "epoch": 0.30594405594405594,
355
- "grad_norm": 0.9550092816352844,
356
- "learning_rate": 0.0001416370106761566,
357
- "loss": 1.1211,
358
- "mean_token_accuracy": 0.7121898174285889,
359
- "num_tokens": 181621.0,
360
  "step": 175
361
  },
362
  {
363
- "entropy": 1.136334627866745,
364
- "epoch": 0.3146853146853147,
365
- "grad_norm": 0.6639471650123596,
366
- "learning_rate": 0.0001398576512455516,
367
- "loss": 1.0134,
368
- "mean_token_accuracy": 0.7396367609500885,
369
- "num_tokens": 188262.0,
370
  "step": 180
371
  },
372
  {
373
- "entropy": 1.1639393329620362,
374
- "epoch": 0.32342657342657344,
375
- "grad_norm": 0.6816486120223999,
376
- "learning_rate": 0.0001380782918149466,
377
- "loss": 1.0736,
378
- "mean_token_accuracy": 0.7224510788917542,
379
- "num_tokens": 192956.0,
380
  "step": 185
381
  },
382
  {
383
- "entropy": 1.146146583557129,
384
- "epoch": 0.3321678321678322,
385
- "grad_norm": 0.786189079284668,
386
- "learning_rate": 0.00013629893238434164,
387
- "loss": 1.0364,
388
- "mean_token_accuracy": 0.7313450872898102,
389
- "num_tokens": 197213.0,
390
  "step": 190
391
  },
392
  {
393
- "entropy": 1.10458744764328,
394
- "epoch": 0.3409090909090909,
395
- "grad_norm": 1.0277358293533325,
396
- "learning_rate": 0.00013451957295373666,
397
- "loss": 1.0431,
398
- "mean_token_accuracy": 0.7269207119941712,
399
- "num_tokens": 201735.0,
400
  "step": 195
401
  },
402
  {
403
- "entropy": 1.0827986776828766,
404
- "epoch": 0.34965034965034963,
405
- "grad_norm": 0.7654422521591187,
406
- "learning_rate": 0.0001327402135231317,
407
- "loss": 1.0355,
408
- "mean_token_accuracy": 0.738901925086975,
409
- "num_tokens": 206600.0,
410
  "step": 200
411
  },
412
  {
413
- "entropy": 1.0413719892501831,
414
- "epoch": 0.3583916083916084,
415
- "grad_norm": 0.8267967700958252,
416
- "learning_rate": 0.0001309608540925267,
417
- "loss": 0.9717,
418
- "mean_token_accuracy": 0.7459078669548035,
419
- "num_tokens": 211319.0,
420
  "step": 205
421
  },
422
  {
423
- "entropy": 0.9956618547439575,
424
- "epoch": 0.36713286713286714,
425
- "grad_norm": 0.7114885449409485,
426
- "learning_rate": 0.00012918149466192172,
427
- "loss": 0.8975,
428
- "mean_token_accuracy": 0.7587344646453857,
429
- "num_tokens": 216407.0,
430
  "step": 210
431
  },
432
  {
433
- "entropy": 1.201312917470932,
434
- "epoch": 0.3758741258741259,
435
- "grad_norm": 0.5830783843994141,
436
- "learning_rate": 0.00012740213523131672,
437
- "loss": 1.1477,
438
- "mean_token_accuracy": 0.7168383121490478,
439
- "num_tokens": 222016.0,
440
  "step": 215
441
  },
442
  {
443
- "entropy": 1.1125480353832244,
444
- "epoch": 0.38461538461538464,
445
- "grad_norm": 0.6842811107635498,
446
- "learning_rate": 0.00012562277580071177,
447
- "loss": 0.982,
448
- "mean_token_accuracy": 0.7435801923274994,
449
- "num_tokens": 226748.0,
450
  "step": 220
451
  },
452
  {
453
- "entropy": 1.1250860214233398,
454
- "epoch": 0.39335664335664333,
455
- "grad_norm": 1.392675757408142,
456
- "learning_rate": 0.00012384341637010677,
457
- "loss": 1.0523,
458
- "mean_token_accuracy": 0.7364842057228088,
459
- "num_tokens": 231513.0,
460
  "step": 225
461
  },
462
  {
463
- "entropy": 0.9735329568386077,
464
- "epoch": 0.4020979020979021,
465
- "grad_norm": 0.8255024552345276,
466
- "learning_rate": 0.00012206405693950178,
467
- "loss": 0.9083,
468
- "mean_token_accuracy": 0.7571049571037293,
469
- "num_tokens": 235433.0,
470
  "step": 230
471
  },
472
  {
473
- "entropy": 1.0409073889255525,
474
- "epoch": 0.41083916083916083,
475
- "grad_norm": 0.6322015523910522,
476
- "learning_rate": 0.0001202846975088968,
477
- "loss": 0.9712,
478
- "mean_token_accuracy": 0.7544535756111145,
479
- "num_tokens": 240991.0,
480
  "step": 235
481
  },
482
  {
483
- "entropy": 0.9808995604515076,
484
- "epoch": 0.4195804195804196,
485
- "grad_norm": 0.693168044090271,
486
- "learning_rate": 0.00011850533807829183,
487
- "loss": 0.9637,
488
- "mean_token_accuracy": 0.7572705090045929,
489
- "num_tokens": 245361.0,
490
  "step": 240
491
  },
492
  {
493
- "entropy": 1.1916967630386353,
494
- "epoch": 0.42832167832167833,
495
- "grad_norm": 0.7691939473152161,
496
- "learning_rate": 0.00011672597864768685,
497
- "loss": 1.1378,
498
- "mean_token_accuracy": 0.7057560324668884,
499
- "num_tokens": 249896.0,
500
  "step": 245
501
  },
502
  {
503
- "entropy": 0.9713864088058471,
504
- "epoch": 0.4370629370629371,
505
- "grad_norm": 0.6049178838729858,
506
- "learning_rate": 0.00011494661921708185,
507
- "loss": 0.9592,
508
- "mean_token_accuracy": 0.7560720384120941,
509
- "num_tokens": 255682.0,
510
  "step": 250
511
  },
512
  {
513
- "entropy": 1.260662978887558,
514
- "epoch": 0.4458041958041958,
515
- "grad_norm": 0.7776870131492615,
516
- "learning_rate": 0.00011316725978647686,
517
- "loss": 1.1992,
518
- "mean_token_accuracy": 0.6990963518619537,
519
- "num_tokens": 261698.0,
520
  "step": 255
521
  },
522
  {
523
- "entropy": 1.0263409852981566,
524
- "epoch": 0.45454545454545453,
525
- "grad_norm": 0.5895385146141052,
526
- "learning_rate": 0.0001113879003558719,
527
- "loss": 1.0182,
528
- "mean_token_accuracy": 0.7378697097301483,
529
- "num_tokens": 266624.0,
530
  "step": 260
531
  },
532
  {
533
- "entropy": 1.0448009312152862,
534
- "epoch": 0.4632867132867133,
535
- "grad_norm": 0.7714991569519043,
536
- "learning_rate": 0.00010960854092526691,
537
- "loss": 0.9675,
538
- "mean_token_accuracy": 0.7545935451984406,
539
- "num_tokens": 272155.0,
540
  "step": 265
541
  },
542
  {
543
- "entropy": 1.009095060825348,
544
- "epoch": 0.47202797202797203,
545
- "grad_norm": 0.7107412219047546,
546
- "learning_rate": 0.00010782918149466192,
547
- "loss": 0.9022,
548
- "mean_token_accuracy": 0.7640557646751404,
549
- "num_tokens": 277590.0,
550
  "step": 270
551
  },
552
  {
553
- "entropy": 1.085400366783142,
554
- "epoch": 0.4807692307692308,
555
- "grad_norm": 0.6840293407440186,
556
- "learning_rate": 0.00010604982206405694,
557
- "loss": 1.101,
558
- "mean_token_accuracy": 0.7363012135028839,
559
- "num_tokens": 282989.0,
560
  "step": 275
561
  },
562
  {
563
- "entropy": 1.209915179014206,
564
- "epoch": 0.48951048951048953,
565
- "grad_norm": 0.7322263121604919,
566
- "learning_rate": 0.00010427046263345198,
567
- "loss": 1.0632,
568
- "mean_token_accuracy": 0.7248473286628723,
569
- "num_tokens": 288148.0,
570
  "step": 280
571
  },
572
  {
573
- "entropy": 1.1313316702842713,
574
- "epoch": 0.4982517482517482,
575
- "grad_norm": 0.8790935277938843,
576
- "learning_rate": 0.00010249110320284699,
577
- "loss": 1.0362,
578
- "mean_token_accuracy": 0.7234691977500916,
579
- "num_tokens": 293421.0,
580
  "step": 285
581
  },
582
  {
583
- "entropy": 1.0769161105155944,
584
- "epoch": 0.506993006993007,
585
- "grad_norm": 0.742671012878418,
586
- "learning_rate": 0.00010071174377224199,
587
- "loss": 1.0596,
588
- "mean_token_accuracy": 0.7369856536388397,
589
- "num_tokens": 299197.0,
590
  "step": 290
591
  },
592
  {
593
- "entropy": 1.1410824477672576,
594
- "epoch": 0.5157342657342657,
595
- "grad_norm": 0.6181492209434509,
596
- "learning_rate": 9.893238434163702e-05,
597
- "loss": 1.165,
598
- "mean_token_accuracy": 0.7148903965950012,
599
- "num_tokens": 305681.0,
600
  "step": 295
601
  },
602
  {
603
- "entropy": 1.1295619785785675,
604
- "epoch": 0.5244755244755245,
605
- "grad_norm": 0.6285997033119202,
606
- "learning_rate": 9.715302491103203e-05,
607
- "loss": 1.0482,
608
- "mean_token_accuracy": 0.723493081331253,
609
- "num_tokens": 312074.0,
610
  "step": 300
611
  },
612
  {
613
- "entropy": 1.0108375370502471,
614
- "epoch": 0.5332167832167832,
615
- "grad_norm": 0.9831832647323608,
616
- "learning_rate": 9.537366548042705e-05,
617
- "loss": 0.8795,
618
- "mean_token_accuracy": 0.7591509163379669,
619
- "num_tokens": 316386.0,
620
  "step": 305
621
  },
622
  {
623
- "entropy": 1.0078293979167938,
624
- "epoch": 0.541958041958042,
625
- "grad_norm": 0.7532368302345276,
626
- "learning_rate": 9.359430604982207e-05,
627
- "loss": 0.9584,
628
- "mean_token_accuracy": 0.7491445183753968,
629
- "num_tokens": 322246.0,
630
  "step": 310
631
  },
632
  {
633
- "entropy": 0.940712821483612,
634
- "epoch": 0.5506993006993007,
635
- "grad_norm": 0.8640061020851135,
636
- "learning_rate": 9.18149466192171e-05,
637
- "loss": 0.9253,
638
- "mean_token_accuracy": 0.7581913948059082,
639
- "num_tokens": 328041.0,
640
  "step": 315
641
  },
642
  {
643
- "entropy": 0.9539014101028442,
644
- "epoch": 0.5594405594405595,
645
- "grad_norm": 0.5698885321617126,
646
- "learning_rate": 9.00355871886121e-05,
647
- "loss": 0.8867,
648
- "mean_token_accuracy": 0.7597615242004394,
649
- "num_tokens": 332751.0,
650
  "step": 320
651
  },
652
  {
653
- "entropy": 1.08140572309494,
654
- "epoch": 0.5681818181818182,
655
- "grad_norm": 0.5825881361961365,
656
- "learning_rate": 8.825622775800713e-05,
657
- "loss": 1.0597,
658
- "mean_token_accuracy": 0.7322126507759095,
659
- "num_tokens": 338448.0,
660
  "step": 325
661
  },
662
  {
663
- "entropy": 1.0642346262931823,
664
- "epoch": 0.5769230769230769,
665
- "grad_norm": 0.8457391858100891,
666
- "learning_rate": 8.647686832740213e-05,
667
- "loss": 1.0298,
668
- "mean_token_accuracy": 0.7364085793495179,
669
- "num_tokens": 343508.0,
670
  "step": 330
671
  },
672
  {
673
- "entropy": 1.0377025127410888,
674
- "epoch": 0.5856643356643356,
675
- "grad_norm": 0.7959486842155457,
676
- "learning_rate": 8.469750889679716e-05,
677
- "loss": 0.9248,
678
- "mean_token_accuracy": 0.757226413488388,
679
- "num_tokens": 347840.0,
680
  "step": 335
681
  },
682
  {
683
- "entropy": 1.0676892161369325,
684
- "epoch": 0.5944055944055944,
685
- "grad_norm": 0.9492782950401306,
686
- "learning_rate": 8.291814946619217e-05,
687
- "loss": 0.9644,
688
- "mean_token_accuracy": 0.7350347638130188,
689
- "num_tokens": 353004.0,
690
  "step": 340
691
  },
692
  {
693
- "entropy": 1.2051751494407654,
694
- "epoch": 0.6031468531468531,
695
- "grad_norm": 0.6062285304069519,
696
- "learning_rate": 8.11387900355872e-05,
697
- "loss": 1.1306,
698
- "mean_token_accuracy": 0.71878741979599,
699
- "num_tokens": 358355.0,
700
  "step": 345
701
  },
702
  {
703
- "entropy": 0.9939802944660187,
704
- "epoch": 0.6118881118881119,
705
- "grad_norm": 0.6014482378959656,
706
- "learning_rate": 7.935943060498221e-05,
707
- "loss": 0.9206,
708
- "mean_token_accuracy": 0.7534485578536987,
709
- "num_tokens": 363815.0,
710
  "step": 350
711
  },
712
  {
713
- "entropy": 0.9838183641433715,
714
- "epoch": 0.6206293706293706,
715
- "grad_norm": 0.6233981251716614,
716
- "learning_rate": 7.758007117437722e-05,
717
- "loss": 0.9557,
718
- "mean_token_accuracy": 0.7579984903335572,
719
- "num_tokens": 370209.0,
720
  "step": 355
721
  },
722
  {
723
- "entropy": 1.1523795008659363,
724
- "epoch": 0.6293706293706294,
725
- "grad_norm": 0.9388852119445801,
726
- "learning_rate": 7.580071174377225e-05,
727
- "loss": 1.1244,
728
- "mean_token_accuracy": 0.7127670645713806,
729
- "num_tokens": 375178.0,
730
  "step": 360
731
  },
732
  {
733
- "entropy": 1.1256710410118103,
734
- "epoch": 0.6381118881118881,
735
- "grad_norm": 0.7773574590682983,
736
- "learning_rate": 7.402135231316726e-05,
737
- "loss": 1.199,
738
- "mean_token_accuracy": 0.7347433745861054,
739
- "num_tokens": 380359.0,
740
  "step": 365
741
  },
742
  {
743
- "entropy": 1.0246877193450927,
744
- "epoch": 0.6468531468531469,
745
- "grad_norm": 0.7057833671569824,
746
- "learning_rate": 7.224199288256229e-05,
747
- "loss": 0.9349,
748
- "mean_token_accuracy": 0.7434077799320221,
749
- "num_tokens": 386251.0,
750
  "step": 370
751
  },
752
  {
753
- "entropy": 0.9082993268966675,
754
- "epoch": 0.6555944055944056,
755
- "grad_norm": 0.7693665027618408,
756
- "learning_rate": 7.046263345195729e-05,
757
- "loss": 0.8317,
758
- "mean_token_accuracy": 0.7674221277236939,
759
- "num_tokens": 391273.0,
760
  "step": 375
761
  },
762
  {
763
- "entropy": 1.0551639199256897,
764
- "epoch": 0.6643356643356644,
765
- "grad_norm": 0.6118054986000061,
766
- "learning_rate": 6.868327402135231e-05,
767
- "loss": 0.9564,
768
- "mean_token_accuracy": 0.7505346298217773,
769
- "num_tokens": 396405.0,
770
  "step": 380
771
  },
772
  {
773
- "entropy": 0.856031060218811,
774
- "epoch": 0.6730769230769231,
775
- "grad_norm": 0.7436105608940125,
776
- "learning_rate": 6.690391459074733e-05,
777
- "loss": 0.7753,
778
- "mean_token_accuracy": 0.7836384952068329,
779
- "num_tokens": 401417.0,
780
  "step": 385
781
  },
782
  {
783
- "entropy": 1.1769568383693696,
784
- "epoch": 0.6818181818181818,
785
- "grad_norm": 0.5364604592323303,
786
- "learning_rate": 6.512455516014235e-05,
787
- "loss": 1.1369,
788
- "mean_token_accuracy": 0.7138187170028687,
789
- "num_tokens": 408045.0,
790
  "step": 390
791
  },
792
  {
793
- "entropy": 0.9055932879447937,
794
- "epoch": 0.6905594405594405,
795
- "grad_norm": 0.7993744015693665,
796
- "learning_rate": 6.334519572953737e-05,
797
- "loss": 0.8238,
798
- "mean_token_accuracy": 0.7695916533470154,
799
- "num_tokens": 412408.0,
800
  "step": 395
801
  },
802
  {
803
- "entropy": 1.067290061712265,
804
- "epoch": 0.6993006993006993,
805
- "grad_norm": 0.5611645579338074,
806
- "learning_rate": 6.156583629893239e-05,
807
- "loss": 1.0754,
808
- "mean_token_accuracy": 0.7374713003635407,
809
- "num_tokens": 417539.0,
810
  "step": 400
811
  },
812
  {
813
- "entropy": 0.9325143158435821,
814
- "epoch": 0.708041958041958,
815
- "grad_norm": 0.8282243609428406,
816
- "learning_rate": 5.97864768683274e-05,
817
- "loss": 0.8287,
818
- "mean_token_accuracy": 0.7693089723587037,
819
- "num_tokens": 421587.0,
820
  "step": 405
821
  },
822
  {
823
- "entropy": 0.9437564730644226,
824
- "epoch": 0.7167832167832168,
825
- "grad_norm": 0.8528610467910767,
826
- "learning_rate": 5.8007117437722425e-05,
827
- "loss": 0.8851,
828
- "mean_token_accuracy": 0.7588753461837768,
829
- "num_tokens": 425118.0,
830
  "step": 410
831
  },
832
  {
833
- "entropy": 0.9383285760879516,
834
- "epoch": 0.7255244755244755,
835
- "grad_norm": 0.9912576079368591,
836
- "learning_rate": 5.622775800711744e-05,
837
- "loss": 0.8777,
838
- "mean_token_accuracy": 0.7649032652378083,
839
- "num_tokens": 429766.0,
840
  "step": 415
841
  },
842
  {
843
- "entropy": 0.9844208836555481,
844
- "epoch": 0.7342657342657343,
845
- "grad_norm": 0.8838147521018982,
846
- "learning_rate": 5.4448398576512464e-05,
847
- "loss": 0.9286,
848
- "mean_token_accuracy": 0.7666606605052948,
849
- "num_tokens": 434826.0,
850
  "step": 420
851
  },
852
  {
853
- "entropy": 1.0472073316574098,
854
- "epoch": 0.743006993006993,
855
- "grad_norm": 0.9893532991409302,
856
- "learning_rate": 5.266903914590747e-05,
857
- "loss": 0.9453,
858
- "mean_token_accuracy": 0.7458884060382843,
859
- "num_tokens": 439219.0,
860
  "step": 425
861
  },
862
  {
863
- "entropy": 1.059507966041565,
864
- "epoch": 0.7517482517482518,
865
- "grad_norm": 0.7243296504020691,
866
- "learning_rate": 5.0889679715302496e-05,
867
- "loss": 0.9485,
868
- "mean_token_accuracy": 0.7473999261856079,
869
- "num_tokens": 444496.0,
870
  "step": 430
871
  },
872
  {
873
- "entropy": 0.96737100481987,
874
- "epoch": 0.7604895104895105,
875
- "grad_norm": 0.7511352300643921,
876
- "learning_rate": 4.911032028469751e-05,
877
- "loss": 0.9112,
878
- "mean_token_accuracy": 0.7562202334403991,
879
- "num_tokens": 449115.0,
880
  "step": 435
881
  },
882
  {
883
- "entropy": 1.0681302666664123,
884
- "epoch": 0.7692307692307693,
885
- "grad_norm": 0.6476220488548279,
886
- "learning_rate": 4.733096085409253e-05,
887
- "loss": 1.1169,
888
- "mean_token_accuracy": 0.7343231618404389,
889
- "num_tokens": 454151.0,
890
  "step": 440
891
  },
892
  {
893
- "entropy": 0.9483801007270813,
894
- "epoch": 0.777972027972028,
895
- "grad_norm": 0.7808278799057007,
896
- "learning_rate": 4.555160142348754e-05,
897
- "loss": 0.9041,
898
- "mean_token_accuracy": 0.7763189613819123,
899
- "num_tokens": 458892.0,
900
  "step": 445
901
  },
902
  {
903
- "entropy": 0.9629013359546661,
904
- "epoch": 0.7867132867132867,
905
- "grad_norm": 0.7341641187667847,
906
- "learning_rate": 4.377224199288256e-05,
907
- "loss": 0.8238,
908
- "mean_token_accuracy": 0.765246057510376,
909
- "num_tokens": 463856.0,
910
  "step": 450
911
  },
912
  {
913
- "entropy": 1.180522269010544,
914
- "epoch": 0.7954545454545454,
915
- "grad_norm": 0.8312517404556274,
916
- "learning_rate": 4.199288256227758e-05,
917
- "loss": 1.1042,
918
- "mean_token_accuracy": 0.7128246188163757,
919
- "num_tokens": 470112.0,
920
  "step": 455
921
  },
922
  {
923
- "entropy": 1.004443597793579,
924
- "epoch": 0.8041958041958042,
925
- "grad_norm": 0.9074130654335022,
926
- "learning_rate": 4.02135231316726e-05,
927
- "loss": 0.9222,
928
- "mean_token_accuracy": 0.7539559602737427,
929
- "num_tokens": 475012.0,
930
  "step": 460
931
  },
932
  {
933
- "entropy": 1.0228057682514191,
934
- "epoch": 0.8129370629370629,
935
- "grad_norm": 0.920925498008728,
936
- "learning_rate": 3.843416370106761e-05,
937
- "loss": 0.9035,
938
- "mean_token_accuracy": 0.7569567143917084,
939
- "num_tokens": 480558.0,
940
  "step": 465
941
  },
942
  {
943
- "entropy": 0.949072140455246,
944
- "epoch": 0.8216783216783217,
945
- "grad_norm": 0.6804259419441223,
946
- "learning_rate": 3.665480427046263e-05,
947
- "loss": 0.8606,
948
- "mean_token_accuracy": 0.7625180125236511,
949
- "num_tokens": 486294.0,
950
  "step": 470
951
  },
952
  {
953
- "entropy": 1.0250387787818909,
954
- "epoch": 0.8304195804195804,
955
- "grad_norm": 0.6318123936653137,
956
- "learning_rate": 3.487544483985765e-05,
957
- "loss": 0.9913,
958
- "mean_token_accuracy": 0.7425659537315369,
959
- "num_tokens": 492617.0,
960
  "step": 475
961
  },
962
  {
963
- "entropy": 0.8904710471630096,
964
- "epoch": 0.8391608391608392,
965
- "grad_norm": 0.6852394342422485,
966
- "learning_rate": 3.309608540925267e-05,
967
- "loss": 0.8392,
968
- "mean_token_accuracy": 0.7645678043365478,
969
- "num_tokens": 497070.0,
970
  "step": 480
971
  },
972
  {
973
- "entropy": 0.9813799023628235,
974
- "epoch": 0.8479020979020979,
975
- "grad_norm": 0.6071293950080872,
976
- "learning_rate": 3.1316725978647684e-05,
977
- "loss": 0.8984,
978
- "mean_token_accuracy": 0.7646778285503387,
979
- "num_tokens": 502298.0,
980
  "step": 485
981
  },
982
  {
983
- "entropy": 1.0262552201747894,
984
- "epoch": 0.8566433566433567,
985
- "grad_norm": 0.8407160043716431,
986
- "learning_rate": 2.9537366548042704e-05,
987
- "loss": 0.9343,
988
- "mean_token_accuracy": 0.7484920144081115,
989
- "num_tokens": 507261.0,
990
  "step": 490
991
  },
992
  {
993
- "entropy": 0.9773908019065857,
994
- "epoch": 0.8653846153846154,
995
- "grad_norm": 0.6108224987983704,
996
- "learning_rate": 2.7758007117437723e-05,
997
- "loss": 0.8876,
998
- "mean_token_accuracy": 0.7593122482299804,
999
- "num_tokens": 512933.0,
1000
  "step": 495
1001
  },
1002
  {
1003
- "entropy": 1.143789404630661,
1004
- "epoch": 0.8741258741258742,
1005
- "grad_norm": 0.6079063415527344,
1006
- "learning_rate": 2.597864768683274e-05,
1007
- "loss": 1.0861,
1008
- "mean_token_accuracy": 0.7239168882369995,
1009
- "num_tokens": 518867.0,
1010
  "step": 500
1011
  },
1012
  {
1013
- "entropy": 0.9865677416324615,
1014
- "epoch": 0.8828671328671329,
1015
- "grad_norm": 0.8393223285675049,
1016
- "learning_rate": 2.419928825622776e-05,
1017
- "loss": 0.9208,
1018
- "mean_token_accuracy": 0.7588137328624726,
1019
- "num_tokens": 523197.0,
1020
  "step": 505
1021
  },
1022
  {
1023
- "entropy": 1.0429059386253356,
1024
- "epoch": 0.8916083916083916,
1025
- "grad_norm": 0.7288678288459778,
1026
- "learning_rate": 2.2419928825622775e-05,
1027
- "loss": 1.0118,
1028
- "mean_token_accuracy": 0.7459483563899993,
1029
- "num_tokens": 528553.0,
1030
  "step": 510
1031
  },
1032
  {
1033
- "entropy": 0.936554628610611,
1034
- "epoch": 0.9003496503496503,
1035
- "grad_norm": 1.026867151260376,
1036
- "learning_rate": 2.0640569395017795e-05,
1037
- "loss": 0.8488,
1038
- "mean_token_accuracy": 0.7743270337581635,
1039
- "num_tokens": 533175.0,
1040
  "step": 515
1041
  },
1042
  {
1043
- "entropy": 1.0927321076393128,
1044
- "epoch": 0.9090909090909091,
1045
- "grad_norm": 0.8070006370544434,
1046
- "learning_rate": 1.8861209964412814e-05,
1047
- "loss": 1.0321,
1048
- "mean_token_accuracy": 0.7340759754180908,
1049
- "num_tokens": 537923.0,
1050
  "step": 520
1051
  },
1052
  {
1053
- "entropy": 0.923135507106781,
1054
- "epoch": 0.9178321678321678,
1055
- "grad_norm": 0.7885546684265137,
1056
- "learning_rate": 1.708185053380783e-05,
1057
- "loss": 0.8886,
1058
- "mean_token_accuracy": 0.7669959187507629,
1059
- "num_tokens": 543086.0,
1060
  "step": 525
1061
  },
1062
  {
1063
- "entropy": 0.803551995754242,
1064
- "epoch": 0.9265734265734266,
1065
- "grad_norm": 0.5133217573165894,
1066
- "learning_rate": 1.530249110320285e-05,
1067
- "loss": 0.7201,
1068
- "mean_token_accuracy": 0.7979816317558288,
1069
- "num_tokens": 547920.0,
1070
  "step": 530
1071
  },
1072
  {
1073
- "entropy": 1.0683785855770112,
1074
- "epoch": 0.9353146853146853,
1075
- "grad_norm": 1.0883749723434448,
1076
- "learning_rate": 1.3523131672597866e-05,
1077
- "loss": 0.979,
1078
- "mean_token_accuracy": 0.7476417005062104,
1079
- "num_tokens": 553743.0,
1080
  "step": 535
1081
  },
1082
  {
1083
- "entropy": 1.0017572939395905,
1084
- "epoch": 0.9440559440559441,
1085
- "grad_norm": 0.8225399851799011,
1086
- "learning_rate": 1.1743772241992882e-05,
1087
- "loss": 0.8984,
1088
- "mean_token_accuracy": 0.761442244052887,
1089
- "num_tokens": 558414.0,
1090
  "step": 540
1091
  },
1092
  {
1093
- "entropy": 1.0132792532444,
1094
- "epoch": 0.9527972027972028,
1095
- "grad_norm": 0.9049685001373291,
1096
- "learning_rate": 9.9644128113879e-06,
1097
- "loss": 0.9703,
1098
- "mean_token_accuracy": 0.7527327954769134,
1099
- "num_tokens": 563295.0,
1100
  "step": 545
1101
  },
1102
  {
1103
- "entropy": 0.972287380695343,
1104
- "epoch": 0.9615384615384616,
1105
- "grad_norm": 0.657630980014801,
1106
- "learning_rate": 8.185053380782918e-06,
1107
- "loss": 0.8971,
1108
- "mean_token_accuracy": 0.7535503268241882,
1109
- "num_tokens": 568925.0,
1110
  "step": 550
1111
  },
1112
  {
1113
- "entropy": 1.0074927151203155,
1114
- "epoch": 0.9702797202797203,
1115
- "grad_norm": 0.5989683866500854,
1116
- "learning_rate": 6.405693950177937e-06,
1117
- "loss": 0.9767,
1118
- "mean_token_accuracy": 0.7365618705749511,
1119
- "num_tokens": 574965.0,
1120
  "step": 555
1121
  },
1122
  {
1123
- "entropy": 1.2086752831935883,
1124
- "epoch": 0.9790209790209791,
1125
- "grad_norm": 0.6988089084625244,
1126
- "learning_rate": 4.626334519572954e-06,
1127
- "loss": 1.1787,
1128
- "mean_token_accuracy": 0.7035641133785248,
1129
- "num_tokens": 580877.0,
1130
  "step": 560
1131
  },
1132
  {
1133
- "entropy": 0.9592096865177154,
1134
- "epoch": 0.9877622377622378,
1135
- "grad_norm": 1.0358166694641113,
1136
- "learning_rate": 2.8469750889679713e-06,
1137
- "loss": 0.8782,
1138
- "mean_token_accuracy": 0.76033256649971,
1139
- "num_tokens": 585093.0,
1140
  "step": 565
1141
  },
1142
  {
1143
- "entropy": 0.8759162247180938,
1144
- "epoch": 0.9965034965034965,
1145
- "grad_norm": 0.6450009942054749,
1146
- "learning_rate": 1.0676156583629894e-06,
1147
- "loss": 0.7832,
1148
- "mean_token_accuracy": 0.7865857958793641,
1149
- "num_tokens": 590558.0,
1150
  "step": 570
1151
  }
1152
  ],
1153
  "logging_steps": 5,
1154
- "max_steps": 572,
1155
  "num_input_tokens_seen": 0,
1156
  "num_train_epochs": 1,
1157
  "save_steps": 500,
@@ -1167,7 +1167,7 @@
1167
  "attributes": {}
1168
  }
1169
  },
1170
- "total_flos": 2.684206514115379e+16,
1171
  "train_batch_size": 1,
1172
  "trial_name": null,
1173
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 115,
7
+ "global_step": 573,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.333236712217331,
14
+ "epoch": 0.008726003490401396,
15
+ "grad_norm": 1.05680251121521,
16
  "learning_rate": 8e-05,
17
+ "loss": 1.3485,
18
+ "mean_token_accuracy": 0.6778763651847839,
19
+ "num_tokens": 4689.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.3305357813835144,
24
+ "epoch": 0.017452006980802792,
25
+ "grad_norm": 0.872683584690094,
26
  "learning_rate": 0.00018,
27
+ "loss": 1.2321,
28
+ "mean_token_accuracy": 0.6924330353736877,
29
+ "num_tokens": 9076.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.1033322036266326,
34
+ "epoch": 0.02617801047120419,
35
+ "grad_norm": 0.7987897992134094,
36
+ "learning_rate": 0.0001985790408525755,
37
+ "loss": 1.0149,
38
+ "mean_token_accuracy": 0.7376092612743378,
39
+ "num_tokens": 13468.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.3769255757331849,
44
+ "epoch": 0.034904013961605584,
45
+ "grad_norm": 0.5818138718605042,
46
+ "learning_rate": 0.00019680284191829486,
47
+ "loss": 1.3507,
48
+ "mean_token_accuracy": 0.6773534297943116,
49
+ "num_tokens": 19687.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.3704544663429261,
54
+ "epoch": 0.04363001745200698,
55
+ "grad_norm": 0.7997947335243225,
56
+ "learning_rate": 0.00019502664298401423,
57
+ "loss": 1.2011,
58
+ "mean_token_accuracy": 0.7000263512134552,
59
+ "num_tokens": 25321.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.1999043464660644,
64
+ "epoch": 0.05235602094240838,
65
+ "grad_norm": 0.7836564779281616,
66
+ "learning_rate": 0.00019325044404973357,
67
+ "loss": 1.1702,
68
+ "mean_token_accuracy": 0.7157157480716705,
69
+ "num_tokens": 29969.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.1485023498535156,
74
+ "epoch": 0.06108202443280977,
75
+ "grad_norm": 0.8341999650001526,
76
+ "learning_rate": 0.00019147424511545294,
77
+ "loss": 1.0127,
78
+ "mean_token_accuracy": 0.736984384059906,
79
+ "num_tokens": 34779.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.175552135705948,
84
+ "epoch": 0.06980802792321117,
85
+ "grad_norm": 0.8367146849632263,
86
+ "learning_rate": 0.0001896980461811723,
87
+ "loss": 1.081,
88
+ "mean_token_accuracy": 0.7145774185657501,
89
+ "num_tokens": 40454.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.2580669283866883,
94
+ "epoch": 0.07853403141361257,
95
+ "grad_norm": 0.7274704575538635,
96
+ "learning_rate": 0.00018792184724689167,
97
+ "loss": 1.1622,
98
+ "mean_token_accuracy": 0.7037679016590118,
99
+ "num_tokens": 45435.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.3289404153823852,
104
+ "epoch": 0.08726003490401396,
105
+ "grad_norm": 0.9459134936332703,
106
+ "learning_rate": 0.00018614564831261103,
107
+ "loss": 1.2887,
108
+ "mean_token_accuracy": 0.7032808780670166,
109
+ "num_tokens": 50161.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.206754869222641,
114
+ "epoch": 0.09598603839441536,
115
+ "grad_norm": 0.8248263001441956,
116
+ "learning_rate": 0.00018436944937833037,
117
+ "loss": 1.0185,
118
+ "mean_token_accuracy": 0.7304032206535339,
119
+ "num_tokens": 55082.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.3467580556869507,
124
+ "epoch": 0.10471204188481675,
125
+ "grad_norm": 0.7025023698806763,
126
+ "learning_rate": 0.00018259325044404974,
127
+ "loss": 1.3245,
128
+ "mean_token_accuracy": 0.6774280846118927,
129
+ "num_tokens": 61109.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.1657752752304078,
134
+ "epoch": 0.11343804537521815,
135
+ "grad_norm": 0.7866821885108948,
136
+ "learning_rate": 0.0001808170515097691,
137
+ "loss": 1.0342,
138
+ "mean_token_accuracy": 0.7379155635833741,
139
+ "num_tokens": 65130.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.3768277764320374,
144
+ "epoch": 0.12216404886561955,
145
+ "grad_norm": 0.6452690958976746,
146
+ "learning_rate": 0.00017904085257548847,
147
+ "loss": 1.3499,
148
+ "mean_token_accuracy": 0.6878371357917785,
149
+ "num_tokens": 71720.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.2285258889198303,
154
+ "epoch": 0.13089005235602094,
155
+ "grad_norm": 0.8868134617805481,
156
+ "learning_rate": 0.00017726465364120784,
157
+ "loss": 1.1203,
158
+ "mean_token_accuracy": 0.7103672683238983,
159
+ "num_tokens": 76475.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.142468798160553,
164
+ "epoch": 0.13961605584642234,
165
+ "grad_norm": 0.7537686228752136,
166
+ "learning_rate": 0.00017548845470692718,
167
+ "loss": 1.0207,
168
+ "mean_token_accuracy": 0.7329977452754974,
169
+ "num_tokens": 82239.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.30864217877388,
174
+ "epoch": 0.14834205933682373,
175
+ "grad_norm": 0.9109086394309998,
176
+ "learning_rate": 0.00017371225577264654,
177
+ "loss": 1.2256,
178
+ "mean_token_accuracy": 0.6924388945102692,
179
+ "num_tokens": 86033.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.279932165145874,
184
+ "epoch": 0.15706806282722513,
185
+ "grad_norm": 0.7983659505844116,
186
+ "learning_rate": 0.0001719360568383659,
187
+ "loss": 1.1764,
188
+ "mean_token_accuracy": 0.7101370930671692,
189
+ "num_tokens": 90170.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.1692178070545196,
194
+ "epoch": 0.16579406631762653,
195
+ "grad_norm": 0.8946067690849304,
196
+ "learning_rate": 0.00017015985790408525,
197
+ "loss": 1.0826,
198
+ "mean_token_accuracy": 0.7317939043045044,
199
+ "num_tokens": 95473.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.025848913192749,
204
+ "epoch": 0.17452006980802792,
205
+ "grad_norm": 0.8327645063400269,
206
+ "learning_rate": 0.00016838365896980464,
207
+ "loss": 0.9294,
208
+ "mean_token_accuracy": 0.7514408528804779,
209
+ "num_tokens": 99423.0,
210
  "step": 100
211
  },
212
  {
213
+ "entropy": 1.0799501717090607,
214
+ "epoch": 0.18324607329842932,
215
+ "grad_norm": 0.7194784283638,
216
+ "learning_rate": 0.00016660746003552398,
217
+ "loss": 1.0222,
218
+ "mean_token_accuracy": 0.7337860226631164,
219
+ "num_tokens": 104249.0,
220
  "step": 105
221
  },
222
  {
223
+ "entropy": 1.1033223390579223,
224
+ "epoch": 0.19197207678883071,
225
+ "grad_norm": 0.7712328433990479,
226
+ "learning_rate": 0.00016483126110124335,
227
+ "loss": 0.9856,
228
+ "mean_token_accuracy": 0.7449049592018128,
229
+ "num_tokens": 109205.0,
230
  "step": 110
231
  },
232
  {
233
+ "entropy": 1.1388230919837952,
234
+ "epoch": 0.2006980802792321,
235
+ "grad_norm": 0.6309220194816589,
236
+ "learning_rate": 0.00016305506216696272,
237
+ "loss": 1.1354,
238
+ "mean_token_accuracy": 0.724005150794983,
239
+ "num_tokens": 115207.0,
240
  "step": 115
241
  },
242
  {
243
+ "entropy": 1.0293731987476349,
244
+ "epoch": 0.2094240837696335,
245
+ "grad_norm": 1.0027621984481812,
246
+ "learning_rate": 0.00016127886323268206,
247
+ "loss": 0.9218,
248
+ "mean_token_accuracy": 0.7559137165546417,
249
+ "num_tokens": 120323.0,
250
  "step": 120
251
  },
252
  {
253
+ "entropy": 1.1900119483470917,
254
+ "epoch": 0.2181500872600349,
255
+ "grad_norm": 0.8019612431526184,
256
+ "learning_rate": 0.00015950266429840145,
257
+ "loss": 1.106,
258
+ "mean_token_accuracy": 0.7178053438663483,
259
+ "num_tokens": 125253.0,
260
  "step": 125
261
  },
262
  {
263
+ "entropy": 1.0218496084213258,
264
+ "epoch": 0.2268760907504363,
265
+ "grad_norm": 0.699367105960846,
266
+ "learning_rate": 0.0001577264653641208,
267
+ "loss": 0.931,
268
+ "mean_token_accuracy": 0.7488141357898712,
269
+ "num_tokens": 130360.0,
270
  "step": 130
271
  },
272
  {
273
+ "entropy": 1.1080122888088226,
274
+ "epoch": 0.2356020942408377,
275
+ "grad_norm": 0.7124127745628357,
276
+ "learning_rate": 0.00015595026642984015,
277
+ "loss": 1.0557,
278
+ "mean_token_accuracy": 0.7226514399051667,
279
+ "num_tokens": 135538.0,
280
  "step": 135
281
  },
282
  {
283
+ "entropy": 1.173432421684265,
284
+ "epoch": 0.2443280977312391,
285
+ "grad_norm": 0.794236421585083,
286
+ "learning_rate": 0.00015417406749555952,
287
+ "loss": 1.056,
288
+ "mean_token_accuracy": 0.7334702372550964,
289
+ "num_tokens": 140532.0,
290
  "step": 140
291
  },
292
  {
293
+ "entropy": 1.0574114263057708,
294
+ "epoch": 0.2530541012216405,
295
+ "grad_norm": 0.6696324944496155,
296
+ "learning_rate": 0.00015239786856127886,
297
+ "loss": 0.9361,
298
+ "mean_token_accuracy": 0.7482443630695343,
299
+ "num_tokens": 145908.0,
300
  "step": 145
301
  },
302
  {
303
+ "entropy": 1.086327201128006,
304
+ "epoch": 0.2617801047120419,
305
+ "grad_norm": 0.5255310535430908,
306
+ "learning_rate": 0.00015062166962699825,
307
+ "loss": 1.0768,
308
+ "mean_token_accuracy": 0.7292326390743256,
309
+ "num_tokens": 151148.0,
310
  "step": 150
311
  },
312
  {
313
+ "entropy": 1.092069786787033,
314
+ "epoch": 0.2705061082024433,
315
+ "grad_norm": 0.6275709271430969,
316
+ "learning_rate": 0.0001488454706927176,
317
+ "loss": 1.0778,
318
+ "mean_token_accuracy": 0.7255069613456726,
319
+ "num_tokens": 157506.0,
320
  "step": 155
321
  },
322
  {
323
+ "entropy": 1.1596343219280243,
324
+ "epoch": 0.2792321116928447,
325
+ "grad_norm": 0.9472619295120239,
326
+ "learning_rate": 0.00014706927175843693,
327
+ "loss": 1.1003,
328
+ "mean_token_accuracy": 0.7315803647041321,
329
+ "num_tokens": 162992.0,
330
  "step": 160
331
  },
332
  {
333
+ "entropy": 1.0481273233890533,
334
+ "epoch": 0.2879581151832461,
335
+ "grad_norm": 0.6921494007110596,
336
+ "learning_rate": 0.00014529307282415633,
337
+ "loss": 0.8895,
338
+ "mean_token_accuracy": 0.7529896676540375,
339
+ "num_tokens": 167640.0,
340
  "step": 165
341
  },
342
  {
343
+ "entropy": 1.0518691539764404,
344
+ "epoch": 0.29668411867364747,
345
+ "grad_norm": 0.6654248237609863,
346
+ "learning_rate": 0.00014351687388987566,
347
+ "loss": 1.018,
348
+ "mean_token_accuracy": 0.7503870785236358,
349
+ "num_tokens": 173423.0,
350
  "step": 170
351
  },
352
  {
353
+ "entropy": 1.1176642417907714,
354
+ "epoch": 0.3054101221640489,
355
+ "grad_norm": 0.7743102312088013,
356
+ "learning_rate": 0.00014174067495559503,
357
+ "loss": 1.0807,
358
+ "mean_token_accuracy": 0.7225248873233795,
359
+ "num_tokens": 178986.0,
360
  "step": 175
361
  },
362
  {
363
+ "entropy": 0.9516431629657746,
364
+ "epoch": 0.31413612565445026,
365
+ "grad_norm": 1.0389933586120605,
366
+ "learning_rate": 0.0001399644760213144,
367
+ "loss": 0.8189,
368
+ "mean_token_accuracy": 0.7752299129962921,
369
+ "num_tokens": 183459.0,
370
  "step": 180
371
  },
372
  {
373
+ "entropy": 1.1684755861759186,
374
+ "epoch": 0.3228621291448517,
375
+ "grad_norm": 1.4807476997375488,
376
+ "learning_rate": 0.00013818827708703374,
377
+ "loss": 1.1822,
378
+ "mean_token_accuracy": 0.7197710394859314,
379
+ "num_tokens": 187614.0,
380
  "step": 185
381
  },
382
  {
383
+ "entropy": 1.099220609664917,
384
+ "epoch": 0.33158813263525305,
385
+ "grad_norm": 0.7266477346420288,
386
+ "learning_rate": 0.00013641207815275313,
387
+ "loss": 1.0095,
388
+ "mean_token_accuracy": 0.7297711133956909,
389
+ "num_tokens": 192316.0,
390
  "step": 190
391
  },
392
  {
393
+ "entropy": 1.0837588012218475,
394
+ "epoch": 0.3403141361256545,
395
+ "grad_norm": 0.696660041809082,
396
+ "learning_rate": 0.00013463587921847247,
397
+ "loss": 0.9739,
398
+ "mean_token_accuracy": 0.7354932248592376,
399
+ "num_tokens": 197728.0,
400
  "step": 195
401
  },
402
  {
403
+ "entropy": 1.1696858763694764,
404
+ "epoch": 0.34904013961605584,
405
+ "grad_norm": 0.5466914772987366,
406
+ "learning_rate": 0.00013285968028419184,
407
+ "loss": 1.1444,
408
+ "mean_token_accuracy": 0.7138558447360992,
409
+ "num_tokens": 204502.0,
410
  "step": 200
411
  },
412
  {
413
+ "entropy": 1.147382140159607,
414
+ "epoch": 0.35776614310645727,
415
+ "grad_norm": 0.8311446905136108,
416
+ "learning_rate": 0.0001310834813499112,
417
+ "loss": 1.1093,
418
+ "mean_token_accuracy": 0.7309025764465332,
419
+ "num_tokens": 209069.0,
420
  "step": 205
421
  },
422
  {
423
+ "entropy": 1.2201330184936523,
424
+ "epoch": 0.36649214659685864,
425
+ "grad_norm": 0.6816751956939697,
426
+ "learning_rate": 0.00012930728241563054,
427
+ "loss": 1.2094,
428
+ "mean_token_accuracy": 0.7130683898925781,
429
+ "num_tokens": 214185.0,
430
  "step": 210
431
  },
432
  {
433
+ "entropy": 1.152731454372406,
434
+ "epoch": 0.37521815008726006,
435
+ "grad_norm": 0.6387792825698853,
436
+ "learning_rate": 0.00012753108348134993,
437
+ "loss": 1.0565,
438
+ "mean_token_accuracy": 0.7268509924411773,
439
+ "num_tokens": 219312.0,
440
  "step": 215
441
  },
442
  {
443
+ "entropy": 1.1504864931106566,
444
+ "epoch": 0.38394415357766143,
445
+ "grad_norm": 0.7773131728172302,
446
+ "learning_rate": 0.00012575488454706927,
447
+ "loss": 1.0913,
448
+ "mean_token_accuracy": 0.7241075754165649,
449
+ "num_tokens": 225616.0,
450
  "step": 220
451
  },
452
  {
453
+ "entropy": 1.0282553434371948,
454
+ "epoch": 0.39267015706806285,
455
+ "grad_norm": 0.8763700723648071,
456
+ "learning_rate": 0.00012397868561278864,
457
+ "loss": 0.9342,
458
+ "mean_token_accuracy": 0.7502905786037445,
459
+ "num_tokens": 230696.0,
460
  "step": 225
461
  },
462
  {
463
+ "entropy": 1.0895283699035645,
464
+ "epoch": 0.4013961605584642,
465
+ "grad_norm": 0.8293470740318298,
466
+ "learning_rate": 0.000122202486678508,
467
+ "loss": 1.067,
468
+ "mean_token_accuracy": 0.7364717125892639,
469
+ "num_tokens": 236685.0,
470
  "step": 230
471
  },
472
  {
473
+ "entropy": 1.172694307565689,
474
+ "epoch": 0.41012216404886565,
475
+ "grad_norm": 0.8818181753158569,
476
+ "learning_rate": 0.00012042628774422735,
477
+ "loss": 1.0149,
478
+ "mean_token_accuracy": 0.7262615323066711,
479
+ "num_tokens": 241211.0,
480
  "step": 235
481
  },
482
  {
483
+ "entropy": 1.2173514723777772,
484
+ "epoch": 0.418848167539267,
485
+ "grad_norm": 0.5635867714881897,
486
+ "learning_rate": 0.00011865008880994673,
487
+ "loss": 1.1783,
488
+ "mean_token_accuracy": 0.7147055625915527,
489
+ "num_tokens": 246360.0,
490
  "step": 240
491
  },
492
  {
493
+ "entropy": 1.188833224773407,
494
+ "epoch": 0.42757417102966844,
495
+ "grad_norm": 0.6060160398483276,
496
+ "learning_rate": 0.00011687388987566608,
497
+ "loss": 1.1545,
498
+ "mean_token_accuracy": 0.717083477973938,
499
+ "num_tokens": 252717.0,
500
  "step": 245
501
  },
502
  {
503
+ "entropy": 1.0905582129955291,
504
+ "epoch": 0.4363001745200698,
505
+ "grad_norm": 0.6812947988510132,
506
+ "learning_rate": 0.00011509769094138544,
507
+ "loss": 0.9922,
508
+ "mean_token_accuracy": 0.7299255549907684,
509
+ "num_tokens": 257249.0,
510
  "step": 250
511
  },
512
  {
513
+ "entropy": 0.8695837318897247,
514
+ "epoch": 0.44502617801047123,
515
+ "grad_norm": 0.8577454090118408,
516
+ "learning_rate": 0.0001133214920071048,
517
+ "loss": 0.8209,
518
+ "mean_token_accuracy": 0.7762204229831695,
519
+ "num_tokens": 262381.0,
520
  "step": 255
521
  },
522
  {
523
+ "entropy": 0.9932888269424438,
524
+ "epoch": 0.4537521815008726,
525
+ "grad_norm": 0.697665810585022,
526
+ "learning_rate": 0.00011154529307282415,
527
+ "loss": 1.0232,
528
+ "mean_token_accuracy": 0.7427519500255585,
529
+ "num_tokens": 267410.0,
530
  "step": 260
531
  },
532
  {
533
+ "entropy": 0.8414939880371094,
534
+ "epoch": 0.462478184991274,
535
+ "grad_norm": 0.789999783039093,
536
+ "learning_rate": 0.00010976909413854353,
537
+ "loss": 0.7225,
538
+ "mean_token_accuracy": 0.7937956035137177,
539
+ "num_tokens": 272109.0,
540
  "step": 265
541
  },
542
  {
543
+ "entropy": 1.0776531934738158,
544
+ "epoch": 0.4712041884816754,
545
+ "grad_norm": 0.6461851000785828,
546
+ "learning_rate": 0.00010799289520426288,
547
+ "loss": 1.0389,
548
+ "mean_token_accuracy": 0.7343196094036102,
549
+ "num_tokens": 276623.0,
550
  "step": 270
551
  },
552
  {
553
+ "entropy": 1.1227709293365478,
554
+ "epoch": 0.4799301919720768,
555
+ "grad_norm": 0.6017542481422424,
556
+ "learning_rate": 0.00010621669626998225,
557
+ "loss": 1.0346,
558
+ "mean_token_accuracy": 0.7320161819458008,
559
+ "num_tokens": 283256.0,
560
  "step": 275
561
  },
562
  {
563
+ "entropy": 0.9767000675201416,
564
+ "epoch": 0.4886561954624782,
565
+ "grad_norm": 0.7064502835273743,
566
+ "learning_rate": 0.0001044404973357016,
567
+ "loss": 0.9051,
568
+ "mean_token_accuracy": 0.7693962216377258,
569
+ "num_tokens": 288780.0,
570
  "step": 280
571
  },
572
  {
573
+ "entropy": 0.9595549941062927,
574
+ "epoch": 0.4973821989528796,
575
+ "grad_norm": 0.7622601985931396,
576
+ "learning_rate": 0.00010266429840142096,
577
+ "loss": 0.8922,
578
+ "mean_token_accuracy": 0.767174756526947,
579
+ "num_tokens": 293775.0,
580
  "step": 285
581
  },
582
  {
583
+ "entropy": 0.9456490218639374,
584
+ "epoch": 0.506108202443281,
585
+ "grad_norm": 0.7910531163215637,
586
+ "learning_rate": 0.00010088809946714034,
587
+ "loss": 0.8845,
588
+ "mean_token_accuracy": 0.7625713229179383,
589
+ "num_tokens": 299667.0,
590
  "step": 290
591
  },
592
  {
593
+ "entropy": 0.9972454011440277,
594
+ "epoch": 0.5148342059336823,
595
+ "grad_norm": 0.8077422976493835,
596
+ "learning_rate": 9.911190053285967e-05,
597
+ "loss": 0.9629,
598
+ "mean_token_accuracy": 0.7550196409225464,
599
+ "num_tokens": 304401.0,
600
  "step": 295
601
  },
602
  {
603
+ "entropy": 1.0132270872592926,
604
+ "epoch": 0.5235602094240838,
605
+ "grad_norm": 0.5776278972625732,
606
+ "learning_rate": 9.733570159857904e-05,
607
+ "loss": 0.9083,
608
+ "mean_token_accuracy": 0.7645319044589997,
609
+ "num_tokens": 310983.0,
610
  "step": 300
611
  },
612
  {
613
+ "entropy": 1.1321196973323822,
614
+ "epoch": 0.5322862129144852,
615
+ "grad_norm": 0.765808641910553,
616
+ "learning_rate": 9.555950266429841e-05,
617
+ "loss": 1.0364,
618
+ "mean_token_accuracy": 0.7226320803165436,
619
+ "num_tokens": 315721.0,
620
  "step": 305
621
  },
622
  {
623
+ "entropy": 1.0132107377052306,
624
+ "epoch": 0.5410122164048866,
625
+ "grad_norm": 0.5765398144721985,
626
+ "learning_rate": 9.378330373001777e-05,
627
+ "loss": 0.9858,
628
+ "mean_token_accuracy": 0.7562039911746978,
629
+ "num_tokens": 321834.0,
630
  "step": 310
631
  },
632
  {
633
+ "entropy": 1.097977089881897,
634
+ "epoch": 0.5497382198952879,
635
+ "grad_norm": 0.7264753580093384,
636
+ "learning_rate": 9.200710479573713e-05,
637
+ "loss": 1.0686,
638
+ "mean_token_accuracy": 0.7291842579841614,
639
+ "num_tokens": 327063.0,
640
  "step": 315
641
  },
642
  {
643
+ "entropy": 1.2174109816551208,
644
+ "epoch": 0.5584642233856894,
645
+ "grad_norm": 0.7541456818580627,
646
+ "learning_rate": 9.023090586145648e-05,
647
+ "loss": 1.1817,
648
+ "mean_token_accuracy": 0.7097965478897095,
649
+ "num_tokens": 332900.0,
650
  "step": 320
651
  },
652
  {
653
+ "entropy": 1.0044541895389556,
654
+ "epoch": 0.5671902268760908,
655
+ "grad_norm": 0.5834890604019165,
656
+ "learning_rate": 8.845470692717585e-05,
657
+ "loss": 0.9467,
658
+ "mean_token_accuracy": 0.7500465452671051,
659
+ "num_tokens": 337508.0,
660
  "step": 325
661
  },
662
  {
663
+ "entropy": 1.0295350253582,
664
+ "epoch": 0.5759162303664922,
665
+ "grad_norm": 0.8909983038902283,
666
+ "learning_rate": 8.667850799289521e-05,
667
+ "loss": 0.9113,
668
+ "mean_token_accuracy": 0.7476867496967315,
669
+ "num_tokens": 342644.0,
670
  "step": 330
671
  },
672
  {
673
+ "entropy": 1.0791299104690553,
674
+ "epoch": 0.5846422338568935,
675
+ "grad_norm": 1.0385737419128418,
676
+ "learning_rate": 8.490230905861456e-05,
677
+ "loss": 1.1175,
678
+ "mean_token_accuracy": 0.7305109918117523,
679
+ "num_tokens": 347547.0,
680
  "step": 335
681
  },
682
  {
683
+ "entropy": 1.0213176369667054,
684
+ "epoch": 0.5933682373472949,
685
+ "grad_norm": 0.943204402923584,
686
+ "learning_rate": 8.312611012433393e-05,
687
+ "loss": 0.9055,
688
+ "mean_token_accuracy": 0.7596513092517853,
689
+ "num_tokens": 351932.0,
690
  "step": 340
691
  },
692
  {
693
+ "entropy": 1.0257258594036103,
694
+ "epoch": 0.6020942408376964,
695
+ "grad_norm": 0.7949322462081909,
696
+ "learning_rate": 8.134991119005328e-05,
697
+ "loss": 0.9098,
698
+ "mean_token_accuracy": 0.7553630173206329,
699
+ "num_tokens": 357045.0,
700
  "step": 345
701
  },
702
  {
703
+ "entropy": 1.0372248589992523,
704
+ "epoch": 0.6108202443280978,
705
+ "grad_norm": 0.8405324220657349,
706
+ "learning_rate": 7.957371225577265e-05,
707
+ "loss": 0.9929,
708
+ "mean_token_accuracy": 0.7452831089496612,
709
+ "num_tokens": 362284.0,
710
  "step": 350
711
  },
712
  {
713
+ "entropy": 0.9565088748931885,
714
+ "epoch": 0.6195462478184991,
715
+ "grad_norm": 0.6379778981208801,
716
+ "learning_rate": 7.779751332149202e-05,
717
+ "loss": 0.9219,
718
+ "mean_token_accuracy": 0.7565369844436646,
719
+ "num_tokens": 367217.0,
720
  "step": 355
721
  },
722
  {
723
+ "entropy": 1.0628814578056336,
724
+ "epoch": 0.6282722513089005,
725
+ "grad_norm": 0.6335421204566956,
726
+ "learning_rate": 7.602131438721137e-05,
727
+ "loss": 1.0041,
728
+ "mean_token_accuracy": 0.7395376443862915,
729
+ "num_tokens": 372678.0,
730
  "step": 360
731
  },
732
  {
733
+ "entropy": 0.9448712587356567,
734
+ "epoch": 0.6369982547993019,
735
+ "grad_norm": 0.737162172794342,
736
+ "learning_rate": 7.424511545293074e-05,
737
+ "loss": 0.8211,
738
+ "mean_token_accuracy": 0.771143788099289,
739
+ "num_tokens": 377750.0,
740
  "step": 365
741
  },
742
  {
743
+ "entropy": 0.9797238111495972,
744
+ "epoch": 0.6457242582897034,
745
+ "grad_norm": 0.5577957034111023,
746
+ "learning_rate": 7.246891651865009e-05,
747
+ "loss": 0.9415,
748
+ "mean_token_accuracy": 0.7499814212322236,
749
+ "num_tokens": 383406.0,
750
  "step": 370
751
  },
752
  {
753
+ "entropy": 1.1891680419445039,
754
+ "epoch": 0.6544502617801047,
755
+ "grad_norm": 0.48097750544548035,
756
+ "learning_rate": 7.069271758436945e-05,
757
+ "loss": 1.1327,
758
+ "mean_token_accuracy": 0.7193056166172027,
759
+ "num_tokens": 389696.0,
760
  "step": 375
761
  },
762
  {
763
+ "entropy": 1.0238433182239532,
764
+ "epoch": 0.6631762652705061,
765
+ "grad_norm": 0.5823986530303955,
766
+ "learning_rate": 6.891651865008881e-05,
767
+ "loss": 0.9708,
768
+ "mean_token_accuracy": 0.7535522282123566,
769
+ "num_tokens": 394688.0,
770
  "step": 380
771
  },
772
  {
773
+ "entropy": 1.162860244512558,
774
+ "epoch": 0.6719022687609075,
775
+ "grad_norm": 0.6299170255661011,
776
+ "learning_rate": 6.714031971580817e-05,
777
+ "loss": 1.1866,
778
+ "mean_token_accuracy": 0.710206264257431,
779
+ "num_tokens": 400319.0,
780
  "step": 385
781
  },
782
  {
783
+ "entropy": 1.0206872344017028,
784
+ "epoch": 0.680628272251309,
785
+ "grad_norm": 0.7722362875938416,
786
+ "learning_rate": 6.536412078152754e-05,
787
+ "loss": 0.9289,
788
+ "mean_token_accuracy": 0.7554251432418824,
789
+ "num_tokens": 404918.0,
790
  "step": 390
791
  },
792
  {
793
+ "entropy": 1.0980794131755829,
794
+ "epoch": 0.6893542757417103,
795
+ "grad_norm": 0.9234552979469299,
796
+ "learning_rate": 6.358792184724689e-05,
797
+ "loss": 0.9551,
798
+ "mean_token_accuracy": 0.7426558673381806,
799
+ "num_tokens": 410635.0,
800
  "step": 395
801
  },
802
  {
803
+ "entropy": 1.0166767477989196,
804
+ "epoch": 0.6980802792321117,
805
+ "grad_norm": 0.9343558549880981,
806
+ "learning_rate": 6.181172291296625e-05,
807
+ "loss": 0.9624,
808
+ "mean_token_accuracy": 0.7539155185222626,
809
+ "num_tokens": 415005.0,
810
  "step": 400
811
  },
812
  {
813
+ "entropy": 1.0832793176174165,
814
+ "epoch": 0.7068062827225131,
815
+ "grad_norm": 0.7815644145011902,
816
+ "learning_rate": 6.003552397868561e-05,
817
+ "loss": 1.0316,
818
+ "mean_token_accuracy": 0.7289174854755401,
819
+ "num_tokens": 419347.0,
820
  "step": 405
821
  },
822
  {
823
+ "entropy": 1.0699054658412934,
824
+ "epoch": 0.7155322862129145,
825
+ "grad_norm": 0.7760159373283386,
826
+ "learning_rate": 5.825932504440498e-05,
827
+ "loss": 1.0357,
828
+ "mean_token_accuracy": 0.7321902751922608,
829
+ "num_tokens": 424588.0,
830
  "step": 410
831
  },
832
  {
833
+ "entropy": 0.966323298215866,
834
+ "epoch": 0.7242582897033158,
835
+ "grad_norm": 0.805746853351593,
836
+ "learning_rate": 5.648312611012434e-05,
837
+ "loss": 0.9306,
838
+ "mean_token_accuracy": 0.7569182515144348,
839
+ "num_tokens": 428943.0,
840
  "step": 415
841
  },
842
  {
843
+ "entropy": 0.9721911072731018,
844
+ "epoch": 0.7329842931937173,
845
+ "grad_norm": 0.6620533466339111,
846
+ "learning_rate": 5.470692717584369e-05,
847
+ "loss": 0.9465,
848
+ "mean_token_accuracy": 0.7597197592258453,
849
+ "num_tokens": 435326.0,
850
  "step": 420
851
  },
852
  {
853
+ "entropy": 0.9292757451534271,
854
+ "epoch": 0.7417102966841187,
855
+ "grad_norm": 0.7177068591117859,
856
+ "learning_rate": 5.293072824156306e-05,
857
+ "loss": 0.858,
858
+ "mean_token_accuracy": 0.7738080501556397,
859
+ "num_tokens": 441702.0,
860
  "step": 425
861
  },
862
  {
863
+ "entropy": 1.0638712823390961,
864
+ "epoch": 0.7504363001745201,
865
+ "grad_norm": 0.5912255048751831,
866
+ "learning_rate": 5.115452930728242e-05,
867
+ "loss": 1.0654,
868
+ "mean_token_accuracy": 0.747636479139328,
869
+ "num_tokens": 446862.0,
870
  "step": 430
871
  },
872
  {
873
+ "entropy": 0.9203409194946289,
874
+ "epoch": 0.7591623036649214,
875
+ "grad_norm": 0.8877400159835815,
876
+ "learning_rate": 4.9378330373001777e-05,
877
+ "loss": 0.8225,
878
+ "mean_token_accuracy": 0.7788766026496887,
879
+ "num_tokens": 451024.0,
880
  "step": 435
881
  },
882
  {
883
+ "entropy": 1.0310194969177247,
884
+ "epoch": 0.7678883071553229,
885
+ "grad_norm": 0.593137800693512,
886
+ "learning_rate": 4.7602131438721136e-05,
887
+ "loss": 1.0058,
888
+ "mean_token_accuracy": 0.7474644720554352,
889
+ "num_tokens": 457528.0,
890
  "step": 440
891
  },
892
  {
893
+ "entropy": 0.9218507647514343,
894
+ "epoch": 0.7766143106457243,
895
+ "grad_norm": 0.8034109473228455,
896
+ "learning_rate": 4.58259325044405e-05,
897
+ "loss": 0.8161,
898
+ "mean_token_accuracy": 0.773482757806778,
899
+ "num_tokens": 462267.0,
900
  "step": 445
901
  },
902
  {
903
+ "entropy": 1.0368493318557739,
904
+ "epoch": 0.7853403141361257,
905
+ "grad_norm": 0.9129230380058289,
906
+ "learning_rate": 4.404973357015986e-05,
907
+ "loss": 1.0042,
908
+ "mean_token_accuracy": 0.7518712699413299,
909
+ "num_tokens": 467337.0,
910
  "step": 450
911
  },
912
  {
913
+ "entropy": 0.8776600241661072,
914
+ "epoch": 0.794066317626527,
915
+ "grad_norm": 0.5392698645591736,
916
+ "learning_rate": 4.227353463587922e-05,
917
+ "loss": 0.7964,
918
+ "mean_token_accuracy": 0.773613715171814,
919
+ "num_tokens": 472361.0,
920
  "step": 455
921
  },
922
  {
923
+ "entropy": 0.9013674080371856,
924
+ "epoch": 0.8027923211169284,
925
+ "grad_norm": 0.731060266494751,
926
+ "learning_rate": 4.049733570159858e-05,
927
+ "loss": 0.9098,
928
+ "mean_token_accuracy": 0.7663923025131225,
929
+ "num_tokens": 477324.0,
930
  "step": 460
931
  },
932
  {
933
+ "entropy": 1.0141965687274932,
934
+ "epoch": 0.8115183246073299,
935
+ "grad_norm": 0.6941847205162048,
936
+ "learning_rate": 3.872113676731794e-05,
937
+ "loss": 1.0052,
938
+ "mean_token_accuracy": 0.747931432723999,
939
+ "num_tokens": 483192.0,
940
  "step": 465
941
  },
942
  {
943
+ "entropy": 0.9370434999465942,
944
+ "epoch": 0.8202443280977313,
945
+ "grad_norm": 0.7024611830711365,
946
+ "learning_rate": 3.69449378330373e-05,
947
+ "loss": 0.9472,
948
+ "mean_token_accuracy": 0.7648843646049499,
949
+ "num_tokens": 488771.0,
950
  "step": 470
951
  },
952
  {
953
+ "entropy": 1.2138389825820923,
954
+ "epoch": 0.8289703315881326,
955
+ "grad_norm": 0.6181853413581848,
956
+ "learning_rate": 3.516873889875667e-05,
957
+ "loss": 1.1913,
958
+ "mean_token_accuracy": 0.7199933648109436,
959
+ "num_tokens": 495594.0,
960
  "step": 475
961
  },
962
  {
963
+ "entropy": 0.9946802318096161,
964
+ "epoch": 0.837696335078534,
965
+ "grad_norm": 0.8392300009727478,
966
+ "learning_rate": 3.339253996447602e-05,
967
+ "loss": 0.8846,
968
+ "mean_token_accuracy": 0.7601681053638458,
969
+ "num_tokens": 501431.0,
970
  "step": 480
971
  },
972
  {
973
+ "entropy": 1.0851561069488525,
974
+ "epoch": 0.8464223385689355,
975
+ "grad_norm": 0.7538084983825684,
976
+ "learning_rate": 3.1616341030195386e-05,
977
+ "loss": 1.0112,
978
+ "mean_token_accuracy": 0.7339279770851135,
979
+ "num_tokens": 506603.0,
980
  "step": 485
981
  },
982
  {
983
+ "entropy": 0.9791876435279846,
984
+ "epoch": 0.8551483420593369,
985
+ "grad_norm": 0.6512478590011597,
986
+ "learning_rate": 2.9840142095914742e-05,
987
+ "loss": 0.9047,
988
+ "mean_token_accuracy": 0.7637781441211701,
989
+ "num_tokens": 511657.0,
990
  "step": 490
991
  },
992
  {
993
+ "entropy": 0.8807009816169739,
994
+ "epoch": 0.8638743455497382,
995
+ "grad_norm": 1.0381275415420532,
996
+ "learning_rate": 2.8063943161634105e-05,
997
+ "loss": 0.7989,
998
+ "mean_token_accuracy": 0.778300940990448,
999
+ "num_tokens": 516346.0,
1000
  "step": 495
1001
  },
1002
  {
1003
+ "entropy": 0.9706099390983581,
1004
+ "epoch": 0.8726003490401396,
1005
+ "grad_norm": 0.7503977417945862,
1006
+ "learning_rate": 2.6287744227353468e-05,
1007
+ "loss": 0.8633,
1008
+ "mean_token_accuracy": 0.7602749288082122,
1009
+ "num_tokens": 521118.0,
1010
  "step": 500
1011
  },
1012
  {
1013
+ "entropy": 0.9830702662467956,
1014
+ "epoch": 0.881326352530541,
1015
+ "grad_norm": 0.7824010252952576,
1016
+ "learning_rate": 2.4511545293072824e-05,
1017
+ "loss": 0.8701,
1018
+ "mean_token_accuracy": 0.7697367370128632,
1019
+ "num_tokens": 525785.0,
1020
  "step": 505
1021
  },
1022
  {
1023
+ "entropy": 1.0895603597164154,
1024
+ "epoch": 0.8900523560209425,
1025
+ "grad_norm": 0.6201509237289429,
1026
+ "learning_rate": 2.2735346358792187e-05,
1027
+ "loss": 0.999,
1028
+ "mean_token_accuracy": 0.7415844857692718,
1029
+ "num_tokens": 531296.0,
1030
  "step": 510
1031
  },
1032
  {
1033
+ "entropy": 1.0094242215156555,
1034
+ "epoch": 0.8987783595113438,
1035
+ "grad_norm": 0.6755935549736023,
1036
+ "learning_rate": 2.0959147424511547e-05,
1037
+ "loss": 0.9283,
1038
+ "mean_token_accuracy": 0.7551429510116577,
1039
+ "num_tokens": 536703.0,
1040
  "step": 515
1041
  },
1042
  {
1043
+ "entropy": 0.9846092760562897,
1044
+ "epoch": 0.9075043630017452,
1045
+ "grad_norm": 1.0709046125411987,
1046
+ "learning_rate": 1.9182948490230906e-05,
1047
+ "loss": 0.9426,
1048
+ "mean_token_accuracy": 0.7431533575057984,
1049
+ "num_tokens": 541044.0,
1050
  "step": 520
1051
  },
1052
  {
1053
+ "entropy": 0.9527219116687775,
1054
+ "epoch": 0.9162303664921466,
1055
+ "grad_norm": 0.6978484392166138,
1056
+ "learning_rate": 1.7406749555950266e-05,
1057
+ "loss": 0.8911,
1058
+ "mean_token_accuracy": 0.7653971970081329,
1059
+ "num_tokens": 546836.0,
1060
  "step": 525
1061
  },
1062
  {
1063
+ "entropy": 0.8855733275413513,
1064
+ "epoch": 0.924956369982548,
1065
+ "grad_norm": 0.9127820134162903,
1066
+ "learning_rate": 1.563055062166963e-05,
1067
+ "loss": 0.8139,
1068
+ "mean_token_accuracy": 0.7775610208511352,
1069
+ "num_tokens": 551666.0,
1070
  "step": 530
1071
  },
1072
  {
1073
+ "entropy": 0.9590709805488586,
1074
+ "epoch": 0.9336823734729494,
1075
+ "grad_norm": 0.7010323405265808,
1076
+ "learning_rate": 1.3854351687388988e-05,
1077
+ "loss": 0.9334,
1078
+ "mean_token_accuracy": 0.759455144405365,
1079
+ "num_tokens": 556932.0,
1080
  "step": 535
1081
  },
1082
  {
1083
+ "entropy": 0.9646609544754028,
1084
+ "epoch": 0.9424083769633508,
1085
+ "grad_norm": 0.5711817145347595,
1086
+ "learning_rate": 1.2078152753108348e-05,
1087
+ "loss": 0.9678,
1088
+ "mean_token_accuracy": 0.7603480279445648,
1089
+ "num_tokens": 562608.0,
1090
  "step": 540
1091
  },
1092
  {
1093
+ "entropy": 1.0106851994991302,
1094
+ "epoch": 0.9511343804537522,
1095
+ "grad_norm": 0.7159616947174072,
1096
+ "learning_rate": 1.030195381882771e-05,
1097
+ "loss": 0.9285,
1098
+ "mean_token_accuracy": 0.7571455597877502,
1099
+ "num_tokens": 568591.0,
1100
  "step": 545
1101
  },
1102
  {
1103
+ "entropy": 1.0865988105535507,
1104
+ "epoch": 0.9598603839441536,
1105
+ "grad_norm": 0.7819423079490662,
1106
+ "learning_rate": 8.52575488454707e-06,
1107
+ "loss": 1.1628,
1108
+ "mean_token_accuracy": 0.7484253525733948,
1109
+ "num_tokens": 572932.0,
1110
  "step": 550
1111
  },
1112
  {
1113
+ "entropy": 0.8808701932430267,
1114
+ "epoch": 0.9685863874345549,
1115
+ "grad_norm": 0.6782775521278381,
1116
+ "learning_rate": 6.74955595026643e-06,
1117
+ "loss": 0.7661,
1118
+ "mean_token_accuracy": 0.7774575710296631,
1119
+ "num_tokens": 577818.0,
1120
  "step": 555
1121
  },
1122
  {
1123
+ "entropy": 0.9287233471870422,
1124
+ "epoch": 0.9773123909249564,
1125
+ "grad_norm": 0.8206584453582764,
1126
+ "learning_rate": 4.973357015985791e-06,
1127
+ "loss": 0.7572,
1128
+ "mean_token_accuracy": 0.7789205074310303,
1129
+ "num_tokens": 581899.0,
1130
  "step": 560
1131
  },
1132
  {
1133
+ "entropy": 0.8616821765899658,
1134
+ "epoch": 0.9860383944153578,
1135
+ "grad_norm": 0.6403858661651611,
1136
+ "learning_rate": 3.197158081705151e-06,
1137
+ "loss": 0.7855,
1138
+ "mean_token_accuracy": 0.7908896625041961,
1139
+ "num_tokens": 587864.0,
1140
  "step": 565
1141
  },
1142
  {
1143
+ "entropy": 1.0378393054008483,
1144
+ "epoch": 0.9947643979057592,
1145
+ "grad_norm": 0.7347800731658936,
1146
+ "learning_rate": 1.4209591474245117e-06,
1147
+ "loss": 1.0399,
1148
+ "mean_token_accuracy": 0.7526734173297882,
1149
+ "num_tokens": 592990.0,
1150
  "step": 570
1151
  }
1152
  ],
1153
  "logging_steps": 5,
1154
+ "max_steps": 573,
1155
  "num_input_tokens_seen": 0,
1156
  "num_train_epochs": 1,
1157
  "save_steps": 500,
 
1167
  "attributes": {}
1168
  }
1169
  },
1170
+ "total_flos": 2.6952641846870016e+16,
1171
  "train_batch_size": 1,
1172
  "trial_name": null,
1173
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58293d1261da0e67c9bdcabfa9d91110498e1d28ff6f6e0d9d07cd560a155972
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49ed87fd1007ddba65a781a7a824d4db6222aa26b1008b2e988302b8cec8fab
3
  size 5816