robertou2 committed on
Commit
82e06b4
·
verified ·
1 Parent(s): e9ad8de

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,14 +12,14 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 256,
16
  "lora_bias": false,
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 128,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 112,
16
  "lora_bias": false,
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 56,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b98ba6ac806c03c0409f8d783327298917bd9290b863f004e8c9f4949a49cab
3
- size 369134112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1767407cde1f093e295836dca6f0a8fe3280307f21de2f9ac5a87c96ac476e21
3
+ size 161515608
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7750a024ffc36f0b2b3d75b6d23a4abc45828022cd9fc314ed0ca873e7afc478
3
- size 738417355
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b03158d3c32281c45bae11494452aff7910950a34011e853f3d6c1c18d8651b
3
+ size 323181259
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de69a2834426ff9ef8199d077e00892579278af31d8969d77f98235b5cfc010a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2b8b314158649523e5cd4cc114f7b492743419645cb17f66610bf7539ffeb77
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f35223b4162b3f25fe602e5e4c5a2349c08c0134f11cd20a82d190f37cb0842a
3
  size 1465
trainer_state.json CHANGED
@@ -2,666 +2,1016 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9808429118773945,
6
  "eval_steps": 500,
7
- "global_step": 65,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.2323372662067413,
14
- "epoch": 0.03065134099616858,
15
- "grad_norm": 53.25,
16
  "learning_rate": 0.0,
17
- "loss": 2.7706,
18
- "mean_token_accuracy": 0.41634324193000793,
19
- "num_tokens": 1244.0,
20
  "step": 1
21
  },
22
  {
23
- "entropy": 2.174584299325943,
24
- "epoch": 0.06130268199233716,
25
- "grad_norm": 36.0,
26
  "learning_rate": 2e-06,
27
- "loss": 2.4332,
28
- "mean_token_accuracy": 0.41893551871180534,
29
- "num_tokens": 3427.0,
30
  "step": 2
31
  },
32
  {
33
- "entropy": 2.0810845494270325,
34
- "epoch": 0.09195402298850575,
35
- "grad_norm": 27.75,
36
  "learning_rate": 4e-06,
37
- "loss": 2.2604,
38
- "mean_token_accuracy": 0.4491094872355461,
39
- "num_tokens": 5582.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 2.389508530497551,
44
- "epoch": 0.12260536398467432,
45
- "grad_norm": 28.625,
46
  "learning_rate": 6e-06,
47
- "loss": 2.224,
48
- "mean_token_accuracy": 0.47163403779268265,
49
- "num_tokens": 7064.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 2.3899217396974564,
54
- "epoch": 0.1532567049808429,
55
- "grad_norm": 17.0,
56
  "learning_rate": 8e-06,
57
- "loss": 1.9894,
58
- "mean_token_accuracy": 0.4873850643634796,
59
- "num_tokens": 9091.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 2.3988372683525085,
64
- "epoch": 0.1839080459770115,
65
- "grad_norm": 22.375,
66
  "learning_rate": 9.999999999999999e-06,
67
- "loss": 2.0726,
68
- "mean_token_accuracy": 0.5061133019626141,
69
- "num_tokens": 10556.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 2.395625740289688,
74
- "epoch": 0.21455938697318008,
75
- "grad_norm": 16.75,
76
  "learning_rate": 1.2e-05,
77
- "loss": 2.0064,
78
- "mean_token_accuracy": 0.5037284195423126,
79
- "num_tokens": 12215.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 2.2998499274253845,
84
- "epoch": 0.24521072796934865,
85
- "grad_norm": 14.5625,
86
  "learning_rate": 1.4e-05,
87
- "loss": 1.7784,
88
- "mean_token_accuracy": 0.5325785167515278,
89
- "num_tokens": 13939.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 2.233474910259247,
94
- "epoch": 0.27586206896551724,
95
- "grad_norm": 14.6875,
96
  "learning_rate": 1.6e-05,
97
- "loss": 1.7552,
98
- "mean_token_accuracy": 0.5224817767739296,
99
- "num_tokens": 15986.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 2.1560849398374557,
104
- "epoch": 0.3065134099616858,
105
- "grad_norm": 12.125,
106
  "learning_rate": 1.8e-05,
107
- "loss": 1.7487,
108
- "mean_token_accuracy": 0.5436614826321602,
109
- "num_tokens": 18444.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 1.8782547265291214,
114
- "epoch": 0.3371647509578544,
115
- "grad_norm": 11.1875,
116
  "learning_rate": 1.9999999999999998e-05,
117
- "loss": 1.5774,
118
- "mean_token_accuracy": 0.5730905011296272,
119
- "num_tokens": 21127.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 2.0860691219568253,
124
- "epoch": 0.367816091954023,
125
- "grad_norm": 13.125,
126
  "learning_rate": 2.2e-05,
127
- "loss": 1.8279,
128
- "mean_token_accuracy": 0.5077806040644646,
129
- "num_tokens": 23308.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 2.0839987099170685,
134
- "epoch": 0.39846743295019155,
135
- "grad_norm": 13.5,
136
  "learning_rate": 2.4e-05,
137
- "loss": 1.8629,
138
- "mean_token_accuracy": 0.5324465520679951,
139
- "num_tokens": 25072.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 2.211606591939926,
144
- "epoch": 0.42911877394636017,
145
- "grad_norm": 15.3125,
146
  "learning_rate": 2.6000000000000002e-05,
147
- "loss": 1.934,
148
- "mean_token_accuracy": 0.513655960559845,
149
- "num_tokens": 26450.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 2.2505457401275635,
154
- "epoch": 0.45977011494252873,
155
- "grad_norm": 14.8125,
156
  "learning_rate": 2.8e-05,
157
- "loss": 1.7603,
158
- "mean_token_accuracy": 0.5480454824864864,
159
- "num_tokens": 27912.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 2.187108889222145,
164
- "epoch": 0.4904214559386973,
165
- "grad_norm": 13.125,
166
  "learning_rate": 3e-05,
167
- "loss": 1.6138,
168
- "mean_token_accuracy": 0.5843819156289101,
169
- "num_tokens": 29392.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 2.0149056166410446,
174
- "epoch": 0.5210727969348659,
175
- "grad_norm": 9.9375,
176
- "learning_rate": 2.998951057182598e-05,
177
- "loss": 1.4549,
178
- "mean_token_accuracy": 0.597277820110321,
179
- "num_tokens": 31417.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 1.9988498389720917,
184
- "epoch": 0.5517241379310345,
185
- "grad_norm": 11.0,
186
- "learning_rate": 2.99580569577177e-05,
187
- "loss": 1.7097,
188
- "mean_token_accuracy": 0.5442679524421692,
189
- "num_tokens": 33727.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 1.8304037749767303,
194
- "epoch": 0.5823754789272031,
195
- "grad_norm": 10.125,
196
- "learning_rate": 2.9905683148398642e-05,
197
- "loss": 1.5381,
198
- "mean_token_accuracy": 0.5851795524358749,
199
- "num_tokens": 35836.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 1.891087457537651,
204
- "epoch": 0.6130268199233716,
205
- "grad_norm": 12.625,
206
- "learning_rate": 2.9832462393376926e-05,
207
- "loss": 1.6876,
208
- "mean_token_accuracy": 0.5546146482229233,
209
- "num_tokens": 37639.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 1.9664306491613388,
214
- "epoch": 0.6436781609195402,
215
- "grad_norm": 12.125,
216
- "learning_rate": 2.9738497098499325e-05,
217
- "loss": 1.7271,
218
- "mean_token_accuracy": 0.5344564504921436,
219
- "num_tokens": 39351.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 1.7850568294525146,
224
- "epoch": 0.6743295019157088,
225
- "grad_norm": 13.375,
226
- "learning_rate": 2.9623918682727355e-05,
227
- "loss": 1.524,
228
- "mean_token_accuracy": 0.5623632185161114,
229
- "num_tokens": 41024.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 1.898742452263832,
234
- "epoch": 0.7049808429118773,
235
- "grad_norm": 13.0,
236
- "learning_rate": 2.9488887394336025e-05,
237
- "loss": 1.732,
238
- "mean_token_accuracy": 0.5667595192790031,
239
- "num_tokens": 42624.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 2.062256097793579,
244
- "epoch": 0.735632183908046,
245
- "grad_norm": 15.0625,
246
- "learning_rate": 2.9333592086792113e-05,
247
- "loss": 1.8659,
248
- "mean_token_accuracy": 0.5371430143713951,
249
- "num_tokens": 43836.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 1.9839176535606384,
254
- "epoch": 0.7662835249042146,
255
- "grad_norm": 10.4375,
256
- "learning_rate": 2.9158249954625514e-05,
257
- "loss": 1.7355,
258
- "mean_token_accuracy": 0.548308789730072,
259
- "num_tokens": 45870.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 2.005643382668495,
264
- "epoch": 0.7969348659003831,
265
- "grad_norm": 10.6875,
266
- "learning_rate": 2.8963106229663064e-05,
267
- "loss": 1.6277,
268
- "mean_token_accuracy": 0.577509343624115,
269
- "num_tokens": 47664.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 2.015763074159622,
274
- "epoch": 0.8275862068965517,
275
- "grad_norm": 10.875,
276
- "learning_rate": 2.8748433838049642e-05,
277
- "loss": 1.6878,
278
- "mean_token_accuracy": 0.5588897317647934,
279
- "num_tokens": 49646.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 2.0416687428951263,
284
- "epoch": 0.8582375478927203,
285
- "grad_norm": 13.0,
286
- "learning_rate": 2.8514533018536286e-05,
287
- "loss": 1.5327,
288
- "mean_token_accuracy": 0.5883619785308838,
289
- "num_tokens": 51235.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 2.029404863715172,
294
- "epoch": 0.8888888888888888,
295
- "grad_norm": 10.8125,
296
- "learning_rate": 2.8261730902569146e-05,
297
- "loss": 1.6362,
298
- "mean_token_accuracy": 0.5863424465060234,
299
- "num_tokens": 53037.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 2.0645615607500076,
304
- "epoch": 0.9195402298850575,
305
- "grad_norm": 10.0625,
306
- "learning_rate": 2.7990381056766583e-05,
307
- "loss": 1.6623,
308
- "mean_token_accuracy": 0.5610311627388,
309
- "num_tokens": 54826.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 2.090387746691704,
314
- "epoch": 0.9501915708812261,
315
- "grad_norm": 12.0,
316
- "learning_rate": 2.770086298842426e-05,
317
- "loss": 1.6578,
318
- "mean_token_accuracy": 0.5568758621811867,
319
- "num_tokens": 56737.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 2.0354464948177338,
324
- "epoch": 0.9808429118773946,
325
- "grad_norm": 12.5625,
326
- "learning_rate": 2.7393581614739924e-05,
327
- "loss": 1.6745,
328
- "mean_token_accuracy": 0.5604493953287601,
329
- "num_tokens": 58084.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 1.7894673347473145,
334
- "epoch": 1.0,
335
- "grad_norm": 12.4375,
336
- "learning_rate": 2.7068966696500025e-05,
337
- "loss": 1.6188,
338
- "mean_token_accuracy": 0.5824247837066651,
339
- "num_tokens": 59142.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 1.63651242852211,
344
- "epoch": 1.0306513409961686,
345
- "grad_norm": 8.0625,
346
- "learning_rate": 2.672747223702045e-05,
347
- "loss": 0.9761,
348
- "mean_token_accuracy": 0.7217265591025352,
349
- "num_tokens": 60897.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 1.7347675114870071,
354
- "epoch": 1.0613026819923372,
355
- "grad_norm": 9.3125,
356
- "learning_rate": 2.6369575847181795e-05,
357
- "loss": 1.1561,
358
- "mean_token_accuracy": 0.7075180560350418,
359
- "num_tokens": 62325.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 1.5030861496925354,
364
- "epoch": 1.0919540229885056,
365
- "grad_norm": 7.65625,
366
- "learning_rate": 2.5995778077447393e-05,
367
- "loss": 0.8402,
368
- "mean_token_accuracy": 0.7322944924235344,
369
- "num_tokens": 64163.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 1.3862270265817642,
374
- "epoch": 1.1226053639846743,
375
- "grad_norm": 8.5625,
376
- "learning_rate": 2.5606601717798212e-05,
377
- "loss": 0.9429,
378
- "mean_token_accuracy": 0.7389034852385521,
379
- "num_tokens": 66168.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 1.3857311755418777,
384
- "epoch": 1.1532567049808429,
385
- "grad_norm": 7.65625,
386
- "learning_rate": 2.520259106656379e-05,
387
- "loss": 0.8564,
388
- "mean_token_accuracy": 0.7321354225277901,
389
- "num_tokens": 68398.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 1.2590633258223534,
394
- "epoch": 1.1839080459770115,
395
- "grad_norm": 9.75,
396
- "learning_rate": 2.4784311169171818e-05,
397
- "loss": 0.9376,
398
- "mean_token_accuracy": 0.7156714797019958,
399
- "num_tokens": 70548.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 1.2306247800588608,
404
- "epoch": 1.21455938697318,
405
- "grad_norm": 10.9375,
406
- "learning_rate": 2.4352347027881003e-05,
407
- "loss": 0.8899,
408
- "mean_token_accuracy": 0.756280928850174,
409
- "num_tokens": 72463.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 1.110754244029522,
414
- "epoch": 1.2452107279693487,
415
- "grad_norm": 12.125,
416
- "learning_rate": 2.3907302783602522e-05,
417
- "loss": 0.7503,
418
- "mean_token_accuracy": 0.7652318105101585,
419
- "num_tokens": 74061.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 1.1396447345614433,
424
- "epoch": 1.2758620689655173,
425
- "grad_norm": 10.375,
426
- "learning_rate": 2.344980087095433e-05,
427
- "loss": 0.774,
428
- "mean_token_accuracy": 0.7681270688772202,
429
- "num_tokens": 76130.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 1.0957090184092522,
434
- "epoch": 1.3065134099616857,
435
- "grad_norm": 12.4375,
436
- "learning_rate": 2.298048114773005e-05,
437
- "loss": 0.7757,
438
- "mean_token_accuracy": 0.767442375421524,
439
- "num_tokens": 77912.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 1.0323160290718079,
444
- "epoch": 1.3371647509578544,
445
- "grad_norm": 10.625,
446
- "learning_rate": 2.25e-05,
447
- "loss": 0.7192,
448
- "mean_token_accuracy": 0.771703340113163,
449
- "num_tokens": 79873.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 1.1174012199044228,
454
- "epoch": 1.367816091954023,
455
- "grad_norm": 13.1875,
456
- "learning_rate": 2.200902942409593e-05,
457
- "loss": 0.7571,
458
- "mean_token_accuracy": 0.7688822597265244,
459
- "num_tokens": 81708.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 1.133009672164917,
464
- "epoch": 1.3984674329501916,
465
- "grad_norm": 11.4375,
466
- "learning_rate": 2.1508256086763372e-05,
467
- "loss": 0.8328,
468
- "mean_token_accuracy": 0.7457190081477165,
469
- "num_tokens": 83479.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 1.0821977257728577,
474
- "epoch": 1.4291187739463602,
475
- "grad_norm": 12.25,
476
- "learning_rate": 2.0998380364796112e-05,
477
- "loss": 0.8791,
478
- "mean_token_accuracy": 0.7517153918743134,
479
- "num_tokens": 85091.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 1.160033829510212,
484
- "epoch": 1.4597701149425286,
485
- "grad_norm": 10.25,
486
- "learning_rate": 2.0480115365495928e-05,
487
- "loss": 0.7528,
488
- "mean_token_accuracy": 0.7454545870423317,
489
- "num_tokens": 87067.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 1.09547870606184,
494
- "epoch": 1.4904214559386972,
495
- "grad_norm": 8.1875,
496
- "learning_rate": 1.995418592932751e-05,
497
- "loss": 0.6824,
498
- "mean_token_accuracy": 0.8004695847630501,
499
- "num_tokens": 89257.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 1.1644561365246773,
504
- "epoch": 1.5210727969348659,
505
- "grad_norm": 10.125,
506
- "learning_rate": 1.9421327616163564e-05,
507
- "loss": 0.8229,
508
- "mean_token_accuracy": 0.744444377720356,
509
- "num_tokens": 91129.0,
510
  "step": 50
511
  },
512
  {
513
- "entropy": 1.1956558972597122,
514
- "epoch": 1.5517241379310345,
515
- "grad_norm": 9.1875,
516
- "learning_rate": 1.888228567653781e-05,
517
- "loss": 0.807,
518
- "mean_token_accuracy": 0.7377020716667175,
519
- "num_tokens": 93217.0,
520
  "step": 51
521
  },
522
  {
523
- "entropy": 1.2180762365460396,
524
- "epoch": 1.582375478927203,
525
- "grad_norm": 9.125,
526
- "learning_rate": 1.8337814009344716e-05,
527
- "loss": 0.6652,
528
- "mean_token_accuracy": 0.7918966636061668,
529
- "num_tokens": 94882.0,
530
  "step": 52
531
  },
532
  {
533
- "entropy": 1.2762009352445602,
534
- "epoch": 1.6130268199233715,
535
- "grad_norm": 11.625,
536
- "learning_rate": 1.778867410744372e-05,
537
- "loss": 0.8152,
538
- "mean_token_accuracy": 0.7556928023695946,
539
- "num_tokens": 96226.0,
540
  "step": 53
541
  },
542
  {
543
- "entropy": 1.2115763127803802,
544
- "epoch": 1.6436781609195403,
545
- "grad_norm": 10.8125,
546
- "learning_rate": 1.7235633992642615e-05,
547
- "loss": 0.7119,
548
- "mean_token_accuracy": 0.7653274685144424,
549
- "num_tokens": 98064.0,
550
  "step": 54
551
  },
552
  {
553
- "entropy": 1.301737241446972,
554
- "epoch": 1.6743295019157087,
555
- "grad_norm": 8.75,
556
- "learning_rate": 1.667946714154962e-05,
557
- "loss": 0.7362,
558
- "mean_token_accuracy": 0.7743538916110992,
559
- "num_tokens": 99875.0,
560
  "step": 55
561
  },
562
  {
563
- "entropy": 1.1645233482122421,
564
- "epoch": 1.7049808429118773,
565
- "grad_norm": 8.125,
566
- "learning_rate": 1.6120951403796367e-05,
567
- "loss": 0.7929,
568
- "mean_token_accuracy": 0.7437388524413109,
569
- "num_tokens": 102303.0,
570
  "step": 56
571
  },
572
  {
573
- "entropy": 1.2387544885277748,
574
- "epoch": 1.735632183908046,
575
- "grad_norm": 10.125,
576
- "learning_rate": 1.5560867914144887e-05,
577
- "loss": 0.7757,
578
- "mean_token_accuracy": 0.760113924741745,
579
- "num_tokens": 103806.0,
580
  "step": 57
581
  },
582
  {
583
- "entropy": 1.2401599884033203,
584
- "epoch": 1.7662835249042146,
585
- "grad_norm": 12.25,
586
- "learning_rate": 1.5e-05,
587
- "loss": 0.757,
588
- "mean_token_accuracy": 0.7870561257004738,
589
- "num_tokens": 105012.0,
590
  "step": 58
591
  },
592
  {
593
- "entropy": 1.3122059255838394,
594
- "epoch": 1.7969348659003832,
595
- "grad_norm": 11.25,
596
- "learning_rate": 1.4439132085855117e-05,
597
- "loss": 0.8231,
598
- "mean_token_accuracy": 0.7717632800340652,
599
- "num_tokens": 106373.0,
600
  "step": 59
601
  },
602
  {
603
- "entropy": 1.224107950925827,
604
- "epoch": 1.8275862068965516,
605
- "grad_norm": 9.3125,
606
- "learning_rate": 1.3879048596203637e-05,
607
- "loss": 0.6616,
608
- "mean_token_accuracy": 0.8022700250148773,
609
- "num_tokens": 107938.0,
610
  "step": 60
611
  },
612
  {
613
- "entropy": 1.2059504985809326,
614
- "epoch": 1.8582375478927204,
615
- "grad_norm": 9.625,
616
- "learning_rate": 1.3320532858450382e-05,
617
- "loss": 0.7585,
618
- "mean_token_accuracy": 0.7686295211315155,
619
- "num_tokens": 109587.0,
620
  "step": 61
621
  },
622
  {
623
- "entropy": 1.2734860181808472,
624
- "epoch": 1.8888888888888888,
625
- "grad_norm": 12.4375,
626
- "learning_rate": 1.2764366007357382e-05,
627
- "loss": 1.055,
628
- "mean_token_accuracy": 0.707017719745636,
629
- "num_tokens": 111253.0,
630
  "step": 62
631
  },
632
  {
633
- "entropy": 1.1893908977508545,
634
- "epoch": 1.9195402298850575,
635
- "grad_norm": 11.1875,
636
- "learning_rate": 1.2211325892556282e-05,
637
- "loss": 0.7912,
638
- "mean_token_accuracy": 0.7822966873645782,
639
- "num_tokens": 112833.0,
640
  "step": 63
641
  },
642
  {
643
- "entropy": 1.1533539071679115,
644
- "epoch": 1.950191570881226,
645
- "grad_norm": 11.125,
646
- "learning_rate": 1.1662185990655285e-05,
647
- "loss": 0.8553,
648
- "mean_token_accuracy": 0.7498924359679222,
649
- "num_tokens": 114573.0,
650
  "step": 64
651
  },
652
  {
653
- "entropy": 1.1270944774150848,
654
- "epoch": 1.9808429118773945,
655
- "grad_norm": 8.25,
656
- "learning_rate": 1.1117714323462188e-05,
657
- "loss": 0.7116,
658
- "mean_token_accuracy": 0.7686784416437149,
659
- "num_tokens": 116981.0,
660
  "step": 65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  }
662
  ],
663
  "logging_steps": 1,
664
- "max_steps": 99,
665
  "num_input_tokens_seen": 0,
666
  "num_train_epochs": 3,
667
  "save_steps": 5,
@@ -677,7 +1027,7 @@
677
  "attributes": {}
678
  }
679
  },
680
- "total_flos": 3202052021059584.0,
681
  "train_batch_size": 2,
682
  "trial_name": null,
683
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.875912408759124,
6
  "eval_steps": 500,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.134004071354866,
14
+ "epoch": 0.029197080291970802,
15
+ "grad_norm": 19.125,
16
  "learning_rate": 0.0,
17
+ "loss": 2.5766,
18
+ "mean_token_accuracy": 0.42293117567896843,
19
+ "num_tokens": 1699.0,
20
  "step": 1
21
  },
22
  {
23
+ "entropy": 2.1332614570856094,
24
+ "epoch": 0.058394160583941604,
25
+ "grad_norm": 15.375,
26
  "learning_rate": 2e-06,
27
+ "loss": 2.3501,
28
+ "mean_token_accuracy": 0.43237315863370895,
29
+ "num_tokens": 3890.0,
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 2.3441822230815887,
34
+ "epoch": 0.08759124087591241,
35
+ "grad_norm": 22.875,
36
  "learning_rate": 4e-06,
37
+ "loss": 3.134,
38
+ "mean_token_accuracy": 0.3771548382937908,
39
+ "num_tokens": 5114.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 2.169568419456482,
44
+ "epoch": 0.11678832116788321,
45
+ "grad_norm": 15.0625,
46
  "learning_rate": 6e-06,
47
+ "loss": 2.2609,
48
+ "mean_token_accuracy": 0.4582384452223778,
49
+ "num_tokens": 6825.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 2.3993491530418396,
54
+ "epoch": 0.145985401459854,
55
+ "grad_norm": 11.5625,
56
  "learning_rate": 8e-06,
57
+ "loss": 2.2625,
58
+ "mean_token_accuracy": 0.44751258939504623,
59
+ "num_tokens": 8794.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 2.38617005944252,
64
+ "epoch": 0.17518248175182483,
65
+ "grad_norm": 11.0,
66
  "learning_rate": 9.999999999999999e-06,
67
+ "loss": 2.2774,
68
+ "mean_token_accuracy": 0.4528072811663151,
69
+ "num_tokens": 10473.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 2.17643141746521,
74
+ "epoch": 0.20437956204379562,
75
+ "grad_norm": 7.21875,
76
  "learning_rate": 1.2e-05,
77
+ "loss": 1.911,
78
+ "mean_token_accuracy": 0.5113650299608707,
79
+ "num_tokens": 12709.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 2.2615339010953903,
84
+ "epoch": 0.23357664233576642,
85
+ "grad_norm": 5.9375,
86
  "learning_rate": 1.4e-05,
87
+ "loss": 1.8747,
88
+ "mean_token_accuracy": 0.5006480813026428,
89
+ "num_tokens": 15657.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 2.416978284716606,
94
+ "epoch": 0.26277372262773724,
95
+ "grad_norm": 8.4375,
96
  "learning_rate": 1.6e-05,
97
+ "loss": 1.9924,
98
+ "mean_token_accuracy": 0.4901970997452736,
99
+ "num_tokens": 17681.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 2.3273025155067444,
104
+ "epoch": 0.291970802919708,
105
+ "grad_norm": 6.09375,
106
  "learning_rate": 1.8e-05,
107
+ "loss": 1.8238,
108
+ "mean_token_accuracy": 0.4891773872077465,
109
+ "num_tokens": 20159.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 2.295111373066902,
114
+ "epoch": 0.32116788321167883,
115
+ "grad_norm": 5.9375,
116
  "learning_rate": 1.9999999999999998e-05,
117
+ "loss": 1.8838,
118
+ "mean_token_accuracy": 0.528899259865284,
119
+ "num_tokens": 22380.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 2.4463636726140976,
124
+ "epoch": 0.35036496350364965,
125
+ "grad_norm": 7.28125,
126
  "learning_rate": 2.2e-05,
127
+ "loss": 2.0672,
128
+ "mean_token_accuracy": 0.4942050985991955,
129
+ "num_tokens": 23949.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 2.2411956042051315,
134
+ "epoch": 0.3795620437956204,
135
+ "grad_norm": 6.625,
136
  "learning_rate": 2.4e-05,
137
+ "loss": 1.7258,
138
+ "mean_token_accuracy": 0.5641119256615639,
139
+ "num_tokens": 25626.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 2.1571693122386932,
144
+ "epoch": 0.40875912408759124,
145
+ "grad_norm": 6.3125,
146
  "learning_rate": 2.6000000000000002e-05,
147
+ "loss": 1.7421,
148
+ "mean_token_accuracy": 0.5413074977695942,
149
+ "num_tokens": 27703.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 2.0649050027132034,
154
+ "epoch": 0.43795620437956206,
155
+ "grad_norm": 5.65625,
156
  "learning_rate": 2.8e-05,
157
+ "loss": 1.7653,
158
+ "mean_token_accuracy": 0.5364297069609165,
159
+ "num_tokens": 29910.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 2.0259645730257034,
164
+ "epoch": 0.46715328467153283,
165
+ "grad_norm": 5.5,
166
  "learning_rate": 3e-05,
167
+ "loss": 1.586,
168
+ "mean_token_accuracy": 0.5716114267706871,
169
+ "num_tokens": 32243.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 2.2259650826454163,
174
+ "epoch": 0.49635036496350365,
175
+ "grad_norm": 6.46875,
176
+ "learning_rate": 2.9990862405286438e-05,
177
+ "loss": 1.8815,
178
+ "mean_token_accuracy": 0.5339390859007835,
179
+ "num_tokens": 33704.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 2.2045857161283493,
184
+ "epoch": 0.5255474452554745,
185
+ "grad_norm": 7.0,
186
+ "learning_rate": 2.9963460753897364e-05,
187
+ "loss": 1.8033,
188
+ "mean_token_accuracy": 0.5426613725721836,
189
+ "num_tokens": 35222.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 2.0502880662679672,
194
+ "epoch": 0.5547445255474452,
195
+ "grad_norm": 5.625,
196
+ "learning_rate": 2.99178284305241e-05,
197
+ "loss": 1.6822,
198
+ "mean_token_accuracy": 0.5445077642798424,
199
+ "num_tokens": 37281.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 1.9275199472904205,
204
+ "epoch": 0.583941605839416,
205
+ "grad_norm": 5.625,
206
+ "learning_rate": 2.9854021031123555e-05,
207
+ "loss": 1.5569,
208
+ "mean_token_accuracy": 0.5689515694975853,
209
+ "num_tokens": 39208.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 2.1408673971891403,
214
+ "epoch": 0.6131386861313869,
215
+ "grad_norm": 6.5,
216
+ "learning_rate": 2.977211629518312e-05,
217
+ "loss": 1.8479,
218
+ "mean_token_accuracy": 0.5382610447704792,
219
+ "num_tokens": 40754.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 2.138097256422043,
224
+ "epoch": 0.6423357664233577,
225
+ "grad_norm": 6.03125,
226
+ "learning_rate": 2.9672214011007087e-05,
227
+ "loss": 1.7691,
228
+ "mean_token_accuracy": 0.5337589606642723,
229
+ "num_tokens": 42447.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 1.9584687054157257,
234
+ "epoch": 0.6715328467153284,
235
+ "grad_norm": 4.59375,
236
+ "learning_rate": 2.9554435894139945e-05,
237
+ "loss": 1.502,
238
+ "mean_token_accuracy": 0.5679651834070683,
239
+ "num_tokens": 44963.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 2.0382106602191925,
244
+ "epoch": 0.7007299270072993,
245
+ "grad_norm": 5.03125,
246
+ "learning_rate": 2.9418925439074784e-05,
247
+ "loss": 1.6539,
248
+ "mean_token_accuracy": 0.5411265380680561,
249
+ "num_tokens": 47138.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 2.0515516996383667,
254
+ "epoch": 0.7299270072992701,
255
+ "grad_norm": 5.4375,
256
+ "learning_rate": 2.9265847744427305e-05,
257
+ "loss": 1.7007,
258
+ "mean_token_accuracy": 0.5707135051488876,
259
+ "num_tokens": 49154.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 1.96835595369339,
264
+ "epoch": 0.7591240875912408,
265
+ "grad_norm": 4.875,
266
+ "learning_rate": 2.9095389311788626e-05,
267
+ "loss": 1.5182,
268
+ "mean_token_accuracy": 0.5940572991967201,
269
+ "num_tokens": 51009.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 1.9829230606555939,
274
+ "epoch": 0.7883211678832117,
275
+ "grad_norm": 4.78125,
276
+ "learning_rate": 2.890775781850181e-05,
277
+ "loss": 1.5441,
278
+ "mean_token_accuracy": 0.5696061700582504,
279
+ "num_tokens": 52866.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 1.9901328533887863,
284
+ "epoch": 0.8175182481751825,
285
+ "grad_norm": 4.53125,
286
+ "learning_rate": 2.8703181864639013e-05,
287
+ "loss": 1.5227,
288
+ "mean_token_accuracy": 0.5771616920828819,
289
+ "num_tokens": 55235.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 2.176472947001457,
294
+ "epoch": 0.8467153284671532,
295
+ "grad_norm": 7.34375,
296
+ "learning_rate": 2.8481910694487507e-05,
297
+ "loss": 1.784,
298
+ "mean_token_accuracy": 0.5394799076020718,
299
+ "num_tokens": 56468.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 2.0398730635643005,
304
+ "epoch": 0.8759124087591241,
305
+ "grad_norm": 5.5,
306
+ "learning_rate": 2.8244213892883907e-05,
307
+ "loss": 1.584,
308
+ "mean_token_accuracy": 0.564793273806572,
309
+ "num_tokens": 58219.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 1.7868350446224213,
314
+ "epoch": 0.9051094890510949,
315
+ "grad_norm": 3.71875,
316
+ "learning_rate": 2.7990381056766583e-05,
317
+ "loss": 1.4897,
318
+ "mean_token_accuracy": 0.5773478448390961,
319
+ "num_tokens": 61246.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 1.8927763998508453,
324
+ "epoch": 0.9343065693430657,
325
+ "grad_norm": 5.03125,
326
+ "learning_rate": 2.772072144234639e-05,
327
+ "loss": 1.4658,
328
+ "mean_token_accuracy": 0.5965544059872627,
329
+ "num_tokens": 63057.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 1.9243939369916916,
334
+ "epoch": 0.9635036496350365,
335
+ "grad_norm": 4.9375,
336
+ "learning_rate": 2.7435563588325627e-05,
337
+ "loss": 1.5646,
338
+ "mean_token_accuracy": 0.551388930529356,
339
+ "num_tokens": 64856.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 1.945557788014412,
344
+ "epoch": 0.9927007299270073,
345
+ "grad_norm": 5.34375,
346
+ "learning_rate": 2.7135254915624213e-05,
347
+ "loss": 1.6558,
348
+ "mean_token_accuracy": 0.5641069300472736,
349
+ "num_tokens": 66564.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 1.8289813995361328,
354
+ "epoch": 1.0,
355
+ "grad_norm": 12.5,
356
+ "learning_rate": 2.6820161304100828e-05,
357
+ "loss": 1.6743,
358
+ "mean_token_accuracy": 0.5590097606182098,
359
+ "num_tokens": 66897.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 1.8240835815668106,
364
+ "epoch": 1.0291970802919708,
365
+ "grad_norm": 4.0,
366
+ "learning_rate": 2.649066664678467e-05,
367
+ "loss": 1.2519,
368
+ "mean_token_accuracy": 0.6510025560855865,
369
+ "num_tokens": 69125.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 1.7388608753681183,
374
+ "epoch": 1.0583941605839415,
375
+ "grad_norm": 3.671875,
376
+ "learning_rate": 2.6147172382160913e-05,
377
+ "loss": 1.145,
378
+ "mean_token_accuracy": 0.6592915058135986,
379
+ "num_tokens": 71403.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 1.7314125299453735,
384
+ "epoch": 1.0875912408759123,
385
+ "grad_norm": 3.84375,
386
+ "learning_rate": 2.5790097005079766e-05,
387
+ "loss": 1.2177,
388
+ "mean_token_accuracy": 0.6403542906045914,
389
+ "num_tokens": 73853.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 1.9059295356273651,
394
+ "epoch": 1.1167883211678833,
395
+ "grad_norm": 5.09375,
396
+ "learning_rate": 2.541987555688496e-05,
397
+ "loss": 1.3537,
398
+ "mean_token_accuracy": 0.5938370451331139,
399
+ "num_tokens": 75484.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 1.8351815044879913,
404
+ "epoch": 1.145985401459854,
405
+ "grad_norm": 5.03125,
406
+ "learning_rate": 2.5036959095382875e-05,
407
+ "loss": 1.1891,
408
+ "mean_token_accuracy": 0.6363263987004757,
409
+ "num_tokens": 77263.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 1.856779396533966,
414
+ "epoch": 1.1751824817518248,
415
+ "grad_norm": 4.8125,
416
+ "learning_rate": 2.464181414529809e-05,
417
+ "loss": 1.3116,
418
+ "mean_token_accuracy": 0.625493511557579,
419
+ "num_tokens": 79113.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 1.7603202909231186,
424
+ "epoch": 1.2043795620437956,
425
+ "grad_norm": 4.90625,
426
+ "learning_rate": 2.4234922129884873e-05,
427
+ "loss": 1.2056,
428
+ "mean_token_accuracy": 0.6308283284306526,
429
+ "num_tokens": 80962.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 1.6366319358348846,
434
+ "epoch": 1.2335766423357664,
435
+ "grad_norm": 4.6875,
436
+ "learning_rate": 2.3816778784387097e-05,
437
+ "loss": 1.2438,
438
+ "mean_token_accuracy": 0.6533086150884628,
439
+ "num_tokens": 83095.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 1.6320330947637558,
444
+ "epoch": 1.2627737226277373,
445
+ "grad_norm": 4.1875,
446
+ "learning_rate": 2.3387893552061202e-05,
447
+ "loss": 1.1647,
448
+ "mean_token_accuracy": 0.6589736789464951,
449
+ "num_tokens": 85383.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 1.575496032834053,
454
+ "epoch": 1.2919708029197081,
455
+ "grad_norm": 4.65625,
456
+ "learning_rate": 2.2948788963498073e-05,
457
+ "loss": 1.1654,
458
+ "mean_token_accuracy": 0.6555850505828857,
459
+ "num_tokens": 87754.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 1.64286208152771,
464
+ "epoch": 1.3211678832116789,
465
+ "grad_norm": 5.8125,
466
+ "learning_rate": 2.25e-05,
467
+ "loss": 1.3359,
468
+ "mean_token_accuracy": 0.649970181286335,
469
+ "num_tokens": 89289.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 1.457002505660057,
474
+ "epoch": 1.3503649635036497,
475
+ "grad_norm": 4.75,
476
+ "learning_rate": 2.2042073441788363e-05,
477
+ "loss": 1.1513,
478
+ "mean_token_accuracy": 0.6784967109560966,
479
+ "num_tokens": 91666.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 1.567281499505043,
484
+ "epoch": 1.3795620437956204,
485
+ "grad_norm": 6.78125,
486
+ "learning_rate": 2.157556720183616e-05,
487
+ "loss": 1.212,
488
+ "mean_token_accuracy": 0.6601979807019234,
489
+ "num_tokens": 93407.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 1.4496354460716248,
494
+ "epoch": 1.4087591240875912,
495
+ "grad_norm": 4.90625,
496
+ "learning_rate": 2.1101049646137008e-05,
497
+ "loss": 1.074,
498
+ "mean_token_accuracy": 0.6734104976058006,
499
+ "num_tokens": 95819.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 1.5027115792036057,
504
+ "epoch": 1.437956204379562,
505
+ "grad_norm": 4.65625,
506
+ "learning_rate": 2.0619098901238684e-05,
507
+ "loss": 1.1059,
508
+ "mean_token_accuracy": 0.6857927665114403,
509
+ "num_tokens": 98052.0,
510
  "step": 50
511
  },
512
  {
513
+ "entropy": 1.5403490960597992,
514
+ "epoch": 1.4671532846715327,
515
+ "grad_norm": 5.75,
516
+ "learning_rate": 2.0130302149885033e-05,
517
+ "loss": 1.1573,
518
+ "mean_token_accuracy": 0.6808772906661034,
519
+ "num_tokens": 99865.0,
520
  "step": 51
521
  },
522
  {
523
+ "entropy": 1.3851112127304077,
524
+ "epoch": 1.4963503649635037,
525
+ "grad_norm": 4.3125,
526
+ "learning_rate": 1.963525491562421e-05,
527
+ "loss": 1.0986,
528
+ "mean_token_accuracy": 0.669769361615181,
529
+ "num_tokens": 102444.0,
530
  "step": 52
531
  },
532
  {
533
+ "entropy": 1.6086822748184204,
534
+ "epoch": 1.5255474452554745,
535
+ "grad_norm": 5.9375,
536
+ "learning_rate": 1.9134560337254986e-05,
537
+ "loss": 1.2058,
538
+ "mean_token_accuracy": 0.6342265903949738,
539
+ "num_tokens": 104135.0,
540
  "step": 53
541
  },
542
  {
543
+ "entropy": 1.6186174154281616,
544
+ "epoch": 1.5547445255474452,
545
+ "grad_norm": 5.75,
546
+ "learning_rate": 1.8628828433995013e-05,
547
+ "loss": 1.1878,
548
+ "mean_token_accuracy": 0.6471928432583809,
549
+ "num_tokens": 105888.0,
550
  "step": 54
551
  },
552
  {
553
+ "entropy": 1.636601522564888,
554
+ "epoch": 1.583941605839416,
555
+ "grad_norm": 6.40625,
556
+ "learning_rate": 1.8118675362266388e-05,
557
+ "loss": 1.2144,
558
+ "mean_token_accuracy": 0.669179767370224,
559
+ "num_tokens": 107324.0,
560
  "step": 55
561
  },
562
  {
563
+ "entropy": 1.6150267571210861,
564
+ "epoch": 1.613138686131387,
565
+ "grad_norm": 6.21875,
566
+ "learning_rate": 1.760472266500396e-05,
567
+ "loss": 1.2551,
568
+ "mean_token_accuracy": 0.6627604365348816,
569
+ "num_tokens": 108844.0,
570
  "step": 56
571
  },
572
  {
573
+ "entropy": 1.7444928288459778,
574
+ "epoch": 1.6423357664233578,
575
+ "grad_norm": 6.34375,
576
+ "learning_rate": 1.7087596514400982e-05,
577
+ "loss": 1.2656,
578
+ "mean_token_accuracy": 0.6279268711805344,
579
+ "num_tokens": 110263.0,
580
  "step": 57
581
  },
582
  {
583
+ "entropy": 1.5423792004585266,
584
+ "epoch": 1.6715328467153285,
585
+ "grad_norm": 5.53125,
586
+ "learning_rate": 1.6567926949014805e-05,
587
+ "loss": 1.2103,
588
+ "mean_token_accuracy": 0.6224785149097443,
589
+ "num_tokens": 112199.0,
590
  "step": 58
591
  },
592
  {
593
+ "entropy": 1.6031899452209473,
594
+ "epoch": 1.7007299270072993,
595
+ "grad_norm": 6.5,
596
+ "learning_rate": 1.604634710616188e-05,
597
+ "loss": 1.2274,
598
+ "mean_token_accuracy": 0.6428026333451271,
599
+ "num_tokens": 113911.0,
600
  "step": 59
601
  },
602
  {
603
+ "entropy": 1.7055649012327194,
604
+ "epoch": 1.72992700729927,
605
+ "grad_norm": 6.6875,
606
+ "learning_rate": 1.552349245053752e-05,
607
+ "loss": 1.2889,
608
+ "mean_token_accuracy": 0.6419094651937485,
609
+ "num_tokens": 115316.0,
610
  "step": 60
611
  },
612
  {
613
+ "entropy": 1.5212641060352325,
614
+ "epoch": 1.7591240875912408,
615
+ "grad_norm": 4.4375,
616
+ "learning_rate": 1.5e-05,
617
+ "loss": 1.0935,
618
+ "mean_token_accuracy": 0.6695626378059387,
619
+ "num_tokens": 118007.0,
620
  "step": 61
621
  },
622
  {
623
+ "entropy": 1.781775563955307,
624
+ "epoch": 1.7883211678832116,
625
+ "grad_norm": 7.0,
626
+ "learning_rate": 1.447650754946249e-05,
627
+ "loss": 1.2709,
628
+ "mean_token_accuracy": 0.6656767651438713,
629
+ "num_tokens": 119232.0,
630
  "step": 62
631
  },
632
  {
633
+ "entropy": 1.616694524884224,
634
+ "epoch": 1.8175182481751824,
635
+ "grad_norm": 6.3125,
636
+ "learning_rate": 1.3953652893838121e-05,
637
+ "loss": 1.2435,
638
+ "mean_token_accuracy": 0.6494908779859543,
639
+ "num_tokens": 120725.0,
640
  "step": 63
641
  },
642
  {
643
+ "entropy": 1.7247931063175201,
644
+ "epoch": 1.8467153284671531,
645
+ "grad_norm": 7.15625,
646
+ "learning_rate": 1.3432073050985201e-05,
647
+ "loss": 1.3701,
648
+ "mean_token_accuracy": 0.6305030956864357,
649
+ "num_tokens": 122093.0,
650
  "step": 64
651
  },
652
  {
653
+ "entropy": 1.590467780828476,
654
+ "epoch": 1.8759124087591241,
655
+ "grad_norm": 5.0,
656
+ "learning_rate": 1.2912403485599022e-05,
657
+ "loss": 1.263,
658
+ "mean_token_accuracy": 0.6583547666668892,
659
+ "num_tokens": 124333.0,
660
  "step": 65
661
+ },
662
+ {
663
+ "entropy": 1.6301420778036118,
664
+ "epoch": 1.905109489051095,
665
+ "grad_norm": 5.3125,
666
+ "learning_rate": 1.2395277334996045e-05,
667
+ "loss": 1.1125,
668
+ "mean_token_accuracy": 0.650074191391468,
669
+ "num_tokens": 126272.0,
670
+ "step": 66
671
+ },
672
+ {
673
+ "entropy": 1.5050681680440903,
674
+ "epoch": 1.9343065693430657,
675
+ "grad_norm": 4.28125,
676
+ "learning_rate": 1.1881324637733613e-05,
677
+ "loss": 1.037,
678
+ "mean_token_accuracy": 0.6733650118112564,
679
+ "num_tokens": 128615.0,
680
+ "step": 67
681
+ },
682
+ {
683
+ "entropy": 1.5582159608602524,
684
+ "epoch": 1.9635036496350367,
685
+ "grad_norm": 4.34375,
686
+ "learning_rate": 1.1371171566004986e-05,
687
+ "loss": 1.0951,
688
+ "mean_token_accuracy": 0.6506948918104172,
689
+ "num_tokens": 131279.0,
690
+ "step": 68
691
+ },
692
+ {
693
+ "entropy": 1.6561681628227234,
694
+ "epoch": 1.9927007299270074,
695
+ "grad_norm": 5.75,
696
+ "learning_rate": 1.0865439662745013e-05,
697
+ "loss": 1.1486,
698
+ "mean_token_accuracy": 0.6755311414599419,
699
+ "num_tokens": 132847.0,
700
+ "step": 69
701
+ },
702
+ {
703
+ "entropy": 1.4383031129837036,
704
+ "epoch": 2.0,
705
+ "grad_norm": 7.4375,
706
+ "learning_rate": 1.036474508437579e-05,
707
+ "loss": 1.1032,
708
+ "mean_token_accuracy": 0.6792386174201965,
709
+ "num_tokens": 133794.0,
710
+ "step": 70
711
+ },
712
+ {
713
+ "entropy": 1.5033023059368134,
714
+ "epoch": 2.0291970802919708,
715
+ "grad_norm": 4.09375,
716
+ "learning_rate": 9.86969785011497e-06,
717
+ "loss": 0.8414,
718
+ "mean_token_accuracy": 0.7257160544395447,
719
+ "num_tokens": 135994.0,
720
+ "step": 71
721
+ },
722
+ {
723
+ "entropy": 1.588482990860939,
724
+ "epoch": 2.0583941605839415,
725
+ "grad_norm": 5.4375,
726
+ "learning_rate": 9.380901098761319e-06,
727
+ "loss": 0.8667,
728
+ "mean_token_accuracy": 0.7469649091362953,
729
+ "num_tokens": 137554.0,
730
+ "step": 72
731
+ },
732
+ {
733
+ "entropy": 1.539756417274475,
734
+ "epoch": 2.0875912408759123,
735
+ "grad_norm": 4.3125,
736
+ "learning_rate": 8.898950353863e-06,
737
+ "loss": 0.8192,
738
+ "mean_token_accuracy": 0.7514503225684166,
739
+ "num_tokens": 139542.0,
740
+ "step": 73
741
+ },
742
+ {
743
+ "entropy": 1.5114945620298386,
744
+ "epoch": 2.116788321167883,
745
+ "grad_norm": 4.25,
746
+ "learning_rate": 8.424432798163838e-06,
747
+ "loss": 0.9041,
748
+ "mean_token_accuracy": 0.7257768511772156,
749
+ "num_tokens": 141721.0,
750
+ "step": 74
751
+ },
752
+ {
753
+ "entropy": 1.4715029448270798,
754
+ "epoch": 2.145985401459854,
755
+ "grad_norm": 4.375,
756
+ "learning_rate": 7.957926558211643e-06,
757
+ "loss": 0.8884,
758
+ "mean_token_accuracy": 0.7411475032567978,
759
+ "num_tokens": 143837.0,
760
+ "step": 75
761
+ },
762
+ {
763
+ "entropy": 1.375910922884941,
764
+ "epoch": 2.1751824817518246,
765
+ "grad_norm": 4.0625,
766
+ "learning_rate": 7.500000000000004e-06,
767
+ "loss": 0.8403,
768
+ "mean_token_accuracy": 0.7337475717067719,
769
+ "num_tokens": 146069.0,
770
+ "step": 76
771
+ },
772
+ {
773
+ "entropy": 1.530395969748497,
774
+ "epoch": 2.204379562043796,
775
+ "grad_norm": 4.8125,
776
+ "learning_rate": 7.051211036501928e-06,
777
+ "loss": 0.9023,
778
+ "mean_token_accuracy": 0.7458862364292145,
779
+ "num_tokens": 147948.0,
780
+ "step": 77
781
+ },
782
+ {
783
+ "entropy": 1.5619382560253143,
784
+ "epoch": 2.2335766423357666,
785
+ "grad_norm": 5.375,
786
+ "learning_rate": 6.6121064479388e-06,
787
+ "loss": 0.9471,
788
+ "mean_token_accuracy": 0.7247473746538162,
789
+ "num_tokens": 149664.0,
790
+ "step": 78
791
+ },
792
+ {
793
+ "entropy": 1.4002738296985626,
794
+ "epoch": 2.2627737226277373,
795
+ "grad_norm": 4.90625,
796
+ "learning_rate": 6.1832212156129045e-06,
797
+ "loss": 0.8002,
798
+ "mean_token_accuracy": 0.7359691336750984,
799
+ "num_tokens": 151422.0,
800
+ "step": 79
801
+ },
802
+ {
803
+ "entropy": 1.3783821165561676,
804
+ "epoch": 2.291970802919708,
805
+ "grad_norm": 4.875,
806
+ "learning_rate": 5.765077870115126e-06,
807
+ "loss": 0.9352,
808
+ "mean_token_accuracy": 0.7229901030659676,
809
+ "num_tokens": 153330.0,
810
+ "step": 80
811
+ },
812
+ {
813
+ "entropy": 1.3214146196842194,
814
+ "epoch": 2.321167883211679,
815
+ "grad_norm": 4.875,
816
+ "learning_rate": 5.3581858547019095e-06,
817
+ "loss": 0.7626,
818
+ "mean_token_accuracy": 0.7818252220749855,
819
+ "num_tokens": 155088.0,
820
+ "step": 81
821
+ },
822
+ {
823
+ "entropy": 1.2702767699956894,
824
+ "epoch": 2.3503649635036497,
825
+ "grad_norm": 4.375,
826
+ "learning_rate": 4.963040904617131e-06,
827
+ "loss": 0.7893,
828
+ "mean_token_accuracy": 0.7699355036020279,
829
+ "num_tokens": 157396.0,
830
+ "step": 82
831
+ },
832
+ {
833
+ "entropy": 1.397829994559288,
834
+ "epoch": 2.3795620437956204,
835
+ "grad_norm": 5.25,
836
+ "learning_rate": 4.58012444311504e-06,
837
+ "loss": 0.9191,
838
+ "mean_token_accuracy": 0.7331462875008583,
839
+ "num_tokens": 159218.0,
840
+ "step": 83
841
+ },
842
+ {
843
+ "entropy": 1.2017180174589157,
844
+ "epoch": 2.408759124087591,
845
+ "grad_norm": 3.6875,
846
+ "learning_rate": 4.209902994920236e-06,
847
+ "loss": 0.8082,
848
+ "mean_token_accuracy": 0.7587887346744537,
849
+ "num_tokens": 162386.0,
850
+ "step": 84
851
+ },
852
+ {
853
+ "entropy": 1.374891072511673,
854
+ "epoch": 2.437956204379562,
855
+ "grad_norm": 5.09375,
856
+ "learning_rate": 3.852827617839085e-06,
857
+ "loss": 0.8665,
858
+ "mean_token_accuracy": 0.7603413909673691,
859
+ "num_tokens": 164138.0,
860
+ "step": 85
861
+ },
862
+ {
863
+ "entropy": 1.3341291099786758,
864
+ "epoch": 2.4671532846715327,
865
+ "grad_norm": 4.6875,
866
+ "learning_rate": 3.5093333532153316e-06,
867
+ "loss": 0.8604,
868
+ "mean_token_accuracy": 0.7294721901416779,
869
+ "num_tokens": 166308.0,
870
+ "step": 86
871
+ },
872
+ {
873
+ "entropy": 1.3214628398418427,
874
+ "epoch": 2.4963503649635035,
875
+ "grad_norm": 5.4375,
876
+ "learning_rate": 3.1798386958991715e-06,
877
+ "loss": 0.8978,
878
+ "mean_token_accuracy": 0.7371588498353958,
879
+ "num_tokens": 168073.0,
880
+ "step": 87
881
+ },
882
+ {
883
+ "entropy": 1.358703538775444,
884
+ "epoch": 2.5255474452554747,
885
+ "grad_norm": 5.125,
886
+ "learning_rate": 2.86474508437579e-06,
887
+ "loss": 0.859,
888
+ "mean_token_accuracy": 0.7255095988512039,
889
+ "num_tokens": 169979.0,
890
+ "step": 88
891
+ },
892
+ {
893
+ "entropy": 1.258324310183525,
894
+ "epoch": 2.554744525547445,
895
+ "grad_norm": 4.15625,
896
+ "learning_rate": 2.564436411674376e-06,
897
+ "loss": 0.825,
898
+ "mean_token_accuracy": 0.7614458128809929,
899
+ "num_tokens": 172706.0,
900
+ "step": 89
901
+ },
902
+ {
903
+ "entropy": 1.329784169793129,
904
+ "epoch": 2.5839416058394162,
905
+ "grad_norm": 5.40625,
906
+ "learning_rate": 2.279278557653611e-06,
907
+ "loss": 0.8799,
908
+ "mean_token_accuracy": 0.7584780603647232,
909
+ "num_tokens": 174586.0,
910
+ "step": 90
911
+ },
912
+ {
913
+ "entropy": 1.2622641026973724,
914
+ "epoch": 2.613138686131387,
915
+ "grad_norm": 5.125,
916
+ "learning_rate": 2.0096189432334194e-06,
917
+ "loss": 0.8348,
918
+ "mean_token_accuracy": 0.7513260990381241,
919
+ "num_tokens": 176525.0,
920
+ "step": 91
921
+ },
922
+ {
923
+ "entropy": 1.2846813797950745,
924
+ "epoch": 2.6423357664233578,
925
+ "grad_norm": 5.0,
926
+ "learning_rate": 1.7557861071160953e-06,
927
+ "loss": 0.7697,
928
+ "mean_token_accuracy": 0.7566402554512024,
929
+ "num_tokens": 178535.0,
930
+ "step": 92
931
+ },
932
+ {
933
+ "entropy": 1.2429047673940659,
934
+ "epoch": 2.6715328467153285,
935
+ "grad_norm": 4.1875,
936
+ "learning_rate": 1.518089305512498e-06,
937
+ "loss": 0.8523,
938
+ "mean_token_accuracy": 0.7609995678067207,
939
+ "num_tokens": 181688.0,
940
+ "step": 93
941
+ },
942
+ {
943
+ "entropy": 1.2306764125823975,
944
+ "epoch": 2.7007299270072993,
945
+ "grad_norm": 5.6875,
946
+ "learning_rate": 1.2968181353609854e-06,
947
+ "loss": 0.795,
948
+ "mean_token_accuracy": 0.7538608759641647,
949
+ "num_tokens": 183350.0,
950
+ "step": 94
951
+ },
952
+ {
953
+ "entropy": 1.2729838192462921,
954
+ "epoch": 2.72992700729927,
955
+ "grad_norm": 5.25,
956
+ "learning_rate": 1.0922421814981904e-06,
957
+ "loss": 0.8463,
958
+ "mean_token_accuracy": 0.7443541586399078,
959
+ "num_tokens": 185369.0,
960
+ "step": 95
961
+ },
962
+ {
963
+ "entropy": 1.2911252602934837,
964
+ "epoch": 2.759124087591241,
965
+ "grad_norm": 5.125,
966
+ "learning_rate": 9.046106882113753e-07,
967
+ "loss": 0.7471,
968
+ "mean_token_accuracy": 0.752311646938324,
969
+ "num_tokens": 187493.0,
970
+ "step": 96
971
+ },
972
+ {
973
+ "entropy": 1.28748519718647,
974
+ "epoch": 2.7883211678832116,
975
+ "grad_norm": 6.4375,
976
+ "learning_rate": 7.341522555726971e-07,
977
+ "loss": 0.7536,
978
+ "mean_token_accuracy": 0.7757409885525703,
979
+ "num_tokens": 188864.0,
980
+ "step": 97
981
+ },
982
+ {
983
+ "entropy": 1.2816387563943863,
984
+ "epoch": 2.8175182481751824,
985
+ "grad_norm": 5.46875,
986
+ "learning_rate": 5.810745609252166e-07,
987
+ "loss": 0.9127,
988
+ "mean_token_accuracy": 0.7290580719709396,
989
+ "num_tokens": 190843.0,
990
+ "step": 98
991
+ },
992
+ {
993
+ "entropy": 1.4024466425180435,
994
+ "epoch": 2.846715328467153,
995
+ "grad_norm": 6.71875,
996
+ "learning_rate": 4.455641058600529e-07,
997
+ "loss": 0.9032,
998
+ "mean_token_accuracy": 0.7520110681653023,
999
+ "num_tokens": 192230.0,
1000
+ "step": 99
1001
+ },
1002
+ {
1003
+ "entropy": 1.354932889342308,
1004
+ "epoch": 2.875912408759124,
1005
+ "grad_norm": 6.71875,
1006
+ "learning_rate": 3.277859889929147e-07,
1007
+ "loss": 0.7987,
1008
+ "mean_token_accuracy": 0.785490907728672,
1009
+ "num_tokens": 193518.0,
1010
+ "step": 100
1011
  }
1012
  ],
1013
  "logging_steps": 1,
1014
+ "max_steps": 105,
1015
  "num_input_tokens_seen": 0,
1016
  "num_train_epochs": 3,
1017
  "save_steps": 5,
 
1027
  "attributes": {}
1028
  }
1029
  },
1030
+ "total_flos": 5186447183892480.0,
1031
  "train_batch_size": 2,
1032
  "trial_name": null,
1033
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11dc7c8092aa2b8ebf234fc84d3e707b2126e3e231f1ae373dfe72c25a33e317
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89ff4081cf45cebdd9100f3809aacae74d1773ff8c2b672defb6af57e4a514c
3
  size 6353