nt-van-khanh commited on
Commit
783a096
·
verified ·
1 Parent(s): dec7bcb

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
 
 
33
  "down_proj",
34
  "up_proj",
35
- "gate_proj",
36
  "o_proj",
37
- "v_proj",
38
- "k_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "v_proj",
35
  "down_proj",
36
  "up_proj",
 
37
  "o_proj",
38
+ "q_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18f29aa15404e8d7dc6c3da90b8d1195ed1266c8956e4b683ccf3e64bce4a4b0
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f135b1aa2e390d1c988122717a5ac9ab4020af2cd577111a234cd318d9c653e
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a1011ccc002475182e5e55851a1493f2a954bf6e663ed962d064a3f5ba5e05d
3
  size 85728342
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:811119aceb53e5716fa4f3a64d3d5aa323a9f0ec2a7c50f98d41f16f80104bba
3
  size 85728342
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f2f9d606f79169206a3e3db665d80d586c6b619b47340f51b44de64a39425e9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e653f327340c2a2bf9d85813888ec80ce279c9079550355bbe8116334e542c0f
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26d177a95623697e319358a6c4514ba0881b2fc04c7eac2c4bc678c9cfbb518e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63ce17dd2c32e1042039dfe648c482c9ff0032ac68df46007019bf1f153ddc3e
3
  size 1064
trainer_state.json CHANGED
@@ -3,1165 +3,1155 @@
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
- "eval_steps": 116,
7
- "global_step": 576,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.4107011198997497,
14
- "epoch": 0.008688097306689836,
15
- "grad_norm": 1.0824192762374878,
16
  "learning_rate": 8e-05,
17
- "loss": 1.3791,
18
- "mean_token_accuracy": 0.6692322790622711,
19
- "num_tokens": 5934.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 1.3867508530616761,
24
- "epoch": 0.01737619461337967,
25
- "grad_norm": 0.7395071387290955,
26
  "learning_rate": 0.00018,
27
- "loss": 1.2462,
28
- "mean_token_accuracy": 0.6868680059909821,
29
- "num_tokens": 10879.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.4622886419296264,
34
- "epoch": 0.026064291920069503,
35
- "grad_norm": 0.8882044553756714,
36
- "learning_rate": 0.00019858657243816254,
37
- "loss": 1.3713,
38
- "mean_token_accuracy": 0.6863115847110748,
39
- "num_tokens": 16008.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.3233125329017639,
44
- "epoch": 0.03475238922675934,
45
- "grad_norm": 0.8664262294769287,
46
- "learning_rate": 0.00019681978798586573,
47
- "loss": 1.1232,
48
- "mean_token_accuracy": 0.7028895020484924,
49
- "num_tokens": 20909.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.3151636123657227,
54
- "epoch": 0.043440486533449174,
55
- "grad_norm": 0.8843740820884705,
56
- "learning_rate": 0.00019505300353356894,
57
- "loss": 1.3277,
58
- "mean_token_accuracy": 0.6794350504875183,
59
- "num_tokens": 26475.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.4190590620040893,
64
- "epoch": 0.052128583840139006,
65
- "grad_norm": 0.8556863069534302,
66
- "learning_rate": 0.0001932862190812721,
67
- "loss": 1.2498,
68
- "mean_token_accuracy": 0.6925376653671265,
69
- "num_tokens": 31585.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.3756298661231994,
74
- "epoch": 0.060816681146828845,
75
- "grad_norm": 0.7749322056770325,
76
- "learning_rate": 0.00019151943462897527,
77
- "loss": 1.2697,
78
- "mean_token_accuracy": 0.6954838395118713,
79
- "num_tokens": 36597.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.1881313562393188,
84
- "epoch": 0.06950477845351868,
85
- "grad_norm": 0.7338190674781799,
86
- "learning_rate": 0.00018975265017667846,
87
- "loss": 1.1248,
88
- "mean_token_accuracy": 0.7170480847358703,
89
- "num_tokens": 42077.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.1893319606781005,
94
- "epoch": 0.07819287576020852,
95
- "grad_norm": 0.917160153388977,
96
- "learning_rate": 0.00018798586572438164,
97
- "loss": 1.1068,
98
- "mean_token_accuracy": 0.723894190788269,
99
- "num_tokens": 47213.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.3148566722869872,
104
- "epoch": 0.08688097306689835,
105
- "grad_norm": 0.7442152500152588,
106
- "learning_rate": 0.0001862190812720848,
107
- "loss": 1.1797,
108
- "mean_token_accuracy": 0.714643794298172,
109
- "num_tokens": 51843.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.209594178199768,
114
- "epoch": 0.09556907037358818,
115
- "grad_norm": 0.8733506202697754,
116
- "learning_rate": 0.000184452296819788,
117
- "loss": 1.1516,
118
- "mean_token_accuracy": 0.7124356746673584,
119
- "num_tokens": 56129.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.2833115696907043,
124
- "epoch": 0.10425716768027801,
125
- "grad_norm": 0.6196924448013306,
126
- "learning_rate": 0.00018268551236749118,
127
- "loss": 1.1723,
128
- "mean_token_accuracy": 0.716529393196106,
129
- "num_tokens": 60886.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.1704561591148377,
134
- "epoch": 0.11294526498696786,
135
- "grad_norm": 0.7477344274520874,
136
- "learning_rate": 0.00018091872791519434,
137
- "loss": 1.0676,
138
- "mean_token_accuracy": 0.7266331076622009,
139
- "num_tokens": 65493.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.2534839153289794,
144
- "epoch": 0.12163336229365769,
145
- "grad_norm": 0.5629271864891052,
146
- "learning_rate": 0.00017915194346289755,
147
- "loss": 1.1689,
148
- "mean_token_accuracy": 0.6983003616333008,
149
- "num_tokens": 71532.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.2661188066005706,
154
- "epoch": 0.13032145960034752,
155
- "grad_norm": 0.7477047443389893,
156
- "learning_rate": 0.0001773851590106007,
157
- "loss": 1.1527,
158
- "mean_token_accuracy": 0.7111507415771484,
159
- "num_tokens": 76336.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.2280403673648834,
164
- "epoch": 0.13900955690703737,
165
- "grad_norm": 0.5776288509368896,
166
- "learning_rate": 0.0001756183745583039,
167
- "loss": 1.1463,
168
- "mean_token_accuracy": 0.7114346265792847,
169
- "num_tokens": 81831.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.2760204315185546,
174
- "epoch": 0.1476976542137272,
175
- "grad_norm": 0.7465737462043762,
176
- "learning_rate": 0.00017385159010600707,
177
- "loss": 1.1869,
178
- "mean_token_accuracy": 0.7063124418258667,
179
- "num_tokens": 87088.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.1543210983276366,
184
- "epoch": 0.15638575152041703,
185
- "grad_norm": 0.9235097169876099,
186
- "learning_rate": 0.00017208480565371025,
187
- "loss": 1.0239,
188
- "mean_token_accuracy": 0.7342011570930481,
189
- "num_tokens": 91481.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.1509882926940918,
194
- "epoch": 0.16507384882710685,
195
- "grad_norm": 0.8983603119850159,
196
- "learning_rate": 0.00017031802120141343,
197
- "loss": 1.0193,
198
- "mean_token_accuracy": 0.7283918738365174,
199
- "num_tokens": 97082.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.1474610209465026,
204
- "epoch": 0.1737619461337967,
205
- "grad_norm": 0.8950803875923157,
206
- "learning_rate": 0.00016855123674911661,
207
- "loss": 1.024,
208
- "mean_token_accuracy": 0.7186934947967529,
209
- "num_tokens": 102702.0,
210
  "step": 100
211
  },
212
  {
213
- "entropy": 1.2507054924964904,
214
- "epoch": 0.18245004344048654,
215
- "grad_norm": 0.7641472220420837,
216
- "learning_rate": 0.0001667844522968198,
217
- "loss": 1.2473,
218
- "mean_token_accuracy": 0.689992618560791,
219
- "num_tokens": 107809.0,
220
  "step": 105
221
  },
222
  {
223
- "entropy": 1.1421348989009856,
224
- "epoch": 0.19113814074717636,
225
- "grad_norm": 0.7692685723304749,
226
- "learning_rate": 0.00016501766784452298,
227
- "loss": 0.9813,
228
- "mean_token_accuracy": 0.7342736542224884,
229
- "num_tokens": 113867.0,
230
  "step": 110
231
  },
232
  {
233
- "entropy": 1.1946647882461547,
234
- "epoch": 0.1998262380538662,
235
- "grad_norm": 0.6726035475730896,
236
- "learning_rate": 0.00016325088339222616,
237
- "loss": 1.2356,
238
- "mean_token_accuracy": 0.7197514176368713,
239
- "num_tokens": 118971.0,
240
  "step": 115
241
  },
242
  {
243
- "entropy": 1.030277144908905,
244
- "epoch": 0.20851433536055602,
245
- "grad_norm": 0.6318750381469727,
246
- "learning_rate": 0.00016148409893992932,
247
- "loss": 0.9519,
248
- "mean_token_accuracy": 0.7470094501972199,
249
- "num_tokens": 123917.0,
250
  "step": 120
251
  },
252
  {
253
- "entropy": 1.1761239767074585,
254
- "epoch": 0.21720243266724587,
255
- "grad_norm": 0.5411672592163086,
256
- "learning_rate": 0.00015971731448763253,
257
- "loss": 1.1012,
258
- "mean_token_accuracy": 0.7102837443351746,
259
- "num_tokens": 128953.0,
260
  "step": 125
261
  },
262
  {
263
- "entropy": 1.1714531064033509,
264
- "epoch": 0.22589052997393572,
265
- "grad_norm": 0.8429161906242371,
266
- "learning_rate": 0.00015795053003533568,
267
- "loss": 1.0605,
268
- "mean_token_accuracy": 0.7349712550640106,
269
- "num_tokens": 133480.0,
270
  "step": 130
271
  },
272
  {
273
- "entropy": 1.0772636473178863,
274
- "epoch": 0.23457862728062553,
275
- "grad_norm": 0.799789309501648,
276
- "learning_rate": 0.00015618374558303886,
277
- "loss": 0.926,
278
- "mean_token_accuracy": 0.7549088835716248,
279
- "num_tokens": 137896.0,
280
  "step": 135
281
  },
282
  {
283
- "entropy": 1.0330187737941743,
284
- "epoch": 0.24326672458731538,
285
- "grad_norm": 0.7396602630615234,
286
- "learning_rate": 0.00015441696113074207,
287
- "loss": 0.9503,
288
- "mean_token_accuracy": 0.7505731225013733,
289
- "num_tokens": 142331.0,
290
  "step": 140
291
  },
292
  {
293
- "entropy": 1.0420972228050231,
294
- "epoch": 0.2519548218940052,
295
- "grad_norm": 0.8590136170387268,
296
- "learning_rate": 0.00015265017667844523,
297
- "loss": 0.9898,
298
- "mean_token_accuracy": 0.7424791634082795,
299
- "num_tokens": 147214.0,
300
  "step": 145
301
  },
302
  {
303
- "entropy": 1.0915903329849244,
304
- "epoch": 0.26064291920069504,
305
- "grad_norm": 0.716101884841919,
306
- "learning_rate": 0.00015088339222614844,
307
- "loss": 0.9714,
308
- "mean_token_accuracy": 0.7417037010192871,
309
- "num_tokens": 153121.0,
310
  "step": 150
311
  },
312
  {
313
- "entropy": 1.1882542252540589,
314
- "epoch": 0.2693310165073849,
315
- "grad_norm": 0.6657515168190002,
316
- "learning_rate": 0.0001491166077738516,
317
- "loss": 1.1597,
318
- "mean_token_accuracy": 0.7061243832111359,
319
- "num_tokens": 158383.0,
320
  "step": 155
321
  },
322
  {
323
- "entropy": 1.0055213570594788,
324
- "epoch": 0.27801911381407474,
325
- "grad_norm": 0.5483101606369019,
326
- "learning_rate": 0.00014734982332155477,
327
- "loss": 0.8653,
328
- "mean_token_accuracy": 0.7609937191009521,
329
- "num_tokens": 163784.0,
330
  "step": 160
331
  },
332
  {
333
- "entropy": 1.0749119877815247,
334
- "epoch": 0.2867072111207645,
335
- "grad_norm": 0.7525088787078857,
336
- "learning_rate": 0.00014558303886925796,
337
- "loss": 1.0036,
338
- "mean_token_accuracy": 0.7341200232505798,
339
- "num_tokens": 168920.0,
340
  "step": 165
341
  },
342
  {
343
- "entropy": 1.0247761964797975,
344
- "epoch": 0.2953953084274544,
345
- "grad_norm": 0.9151174426078796,
346
- "learning_rate": 0.00014381625441696114,
347
- "loss": 1.0161,
348
- "mean_token_accuracy": 0.7334173619747162,
349
- "num_tokens": 174444.0,
350
  "step": 170
351
  },
352
  {
353
- "entropy": 0.9753389418125152,
354
- "epoch": 0.3040834057341442,
355
- "grad_norm": 0.838029682636261,
356
- "learning_rate": 0.00014204946996466432,
357
- "loss": 0.8768,
358
- "mean_token_accuracy": 0.7599341690540313,
359
- "num_tokens": 179698.0,
360
  "step": 175
361
  },
362
  {
363
- "entropy": 1.125912880897522,
364
- "epoch": 0.31277150304083406,
365
- "grad_norm": 0.7232808470726013,
366
- "learning_rate": 0.0001402826855123675,
367
- "loss": 1.0314,
368
- "mean_token_accuracy": 0.7424649059772491,
369
- "num_tokens": 183863.0,
370
  "step": 180
371
  },
372
  {
373
- "entropy": 1.2293154418468475,
374
- "epoch": 0.3214596003475239,
375
- "grad_norm": 0.7383331060409546,
376
- "learning_rate": 0.00013851590106007068,
377
- "loss": 1.1186,
378
- "mean_token_accuracy": 0.7178550124168396,
379
- "num_tokens": 189359.0,
380
  "step": 185
381
  },
382
  {
383
- "entropy": 1.156474417448044,
384
- "epoch": 0.3301476976542137,
385
- "grad_norm": 0.7917217016220093,
386
- "learning_rate": 0.00013674911660777384,
387
- "loss": 1.1353,
388
- "mean_token_accuracy": 0.733020156621933,
389
- "num_tokens": 194425.0,
390
  "step": 190
391
  },
392
  {
393
- "entropy": 1.0435515761375427,
394
- "epoch": 0.33883579496090355,
395
- "grad_norm": 0.6386027336120605,
396
- "learning_rate": 0.00013498233215547705,
397
- "loss": 1.0725,
398
- "mean_token_accuracy": 0.7349865734577179,
399
- "num_tokens": 200016.0,
400
  "step": 195
401
  },
402
  {
403
- "entropy": 1.0159077584743499,
404
- "epoch": 0.3475238922675934,
405
- "grad_norm": 0.6591583490371704,
406
- "learning_rate": 0.0001332155477031802,
407
- "loss": 0.9348,
408
- "mean_token_accuracy": 0.7529367506504059,
409
- "num_tokens": 205422.0,
410
  "step": 200
411
  },
412
  {
413
- "entropy": 1.2221834301948546,
414
- "epoch": 0.35621198957428324,
415
- "grad_norm": 0.8224623203277588,
416
- "learning_rate": 0.0001314487632508834,
417
- "loss": 1.1114,
418
- "mean_token_accuracy": 0.7135775506496429,
419
- "num_tokens": 211093.0,
420
  "step": 205
421
  },
422
  {
423
- "entropy": 1.1724997997283935,
424
- "epoch": 0.3649000868809731,
425
- "grad_norm": 0.7568403482437134,
426
- "learning_rate": 0.0001296819787985866,
427
- "loss": 1.0791,
428
- "mean_token_accuracy": 0.7197705745697022,
429
- "num_tokens": 216799.0,
430
  "step": 210
431
  },
432
  {
433
- "entropy": 0.9998281240463257,
434
- "epoch": 0.3735881841876629,
435
- "grad_norm": 0.7882909774780273,
436
- "learning_rate": 0.00012791519434628975,
437
- "loss": 0.9656,
438
- "mean_token_accuracy": 0.7520670473575592,
439
- "num_tokens": 222614.0,
440
  "step": 215
441
  },
442
  {
443
- "entropy": 0.9324562966823577,
444
- "epoch": 0.3822762814943527,
445
- "grad_norm": 0.6336882710456848,
446
- "learning_rate": 0.00012614840989399296,
447
- "loss": 0.8669,
448
- "mean_token_accuracy": 0.7735882341861725,
449
- "num_tokens": 228379.0,
450
  "step": 220
451
  },
452
  {
453
- "entropy": 1.1699727356433869,
454
- "epoch": 0.39096437880104257,
455
- "grad_norm": 0.5986983776092529,
456
- "learning_rate": 0.00012438162544169612,
457
- "loss": 1.1787,
458
- "mean_token_accuracy": 0.7107881784439087,
459
- "num_tokens": 233915.0,
460
  "step": 225
461
  },
462
  {
463
- "entropy": 1.048863458633423,
464
- "epoch": 0.3996524761077324,
465
- "grad_norm": 0.4969522953033447,
466
- "learning_rate": 0.0001226148409893993,
467
- "loss": 0.9199,
468
- "mean_token_accuracy": 0.7468161523342133,
469
- "num_tokens": 240469.0,
470
  "step": 230
471
  },
472
  {
473
- "entropy": 1.111632490158081,
474
- "epoch": 0.40834057341442226,
475
- "grad_norm": 0.7042287588119507,
476
- "learning_rate": 0.00012084805653710247,
477
- "loss": 0.9822,
478
- "mean_token_accuracy": 0.7367093205451966,
479
- "num_tokens": 244564.0,
480
  "step": 235
481
  },
482
  {
483
- "entropy": 1.0202743291854859,
484
- "epoch": 0.41702867072111205,
485
- "grad_norm": 1.0184236764907837,
486
- "learning_rate": 0.00011908127208480566,
487
- "loss": 0.9372,
488
- "mean_token_accuracy": 0.7477603435516358,
489
- "num_tokens": 248941.0,
490
  "step": 240
491
  },
492
  {
493
- "entropy": 0.9523225128650665,
494
- "epoch": 0.4257167680278019,
495
- "grad_norm": 0.8121697902679443,
496
- "learning_rate": 0.00011731448763250883,
497
- "loss": 0.8185,
498
- "mean_token_accuracy": 0.7734134078025818,
499
- "num_tokens": 253828.0,
500
  "step": 245
501
  },
502
  {
503
- "entropy": 0.9181910157203674,
504
- "epoch": 0.43440486533449174,
505
- "grad_norm": 0.8086223006248474,
506
- "learning_rate": 0.00011554770318021201,
507
- "loss": 0.8587,
508
- "mean_token_accuracy": 0.7629885494709014,
509
- "num_tokens": 258346.0,
510
  "step": 250
511
  },
512
  {
513
- "entropy": 1.027286982536316,
514
- "epoch": 0.4430929626411816,
515
- "grad_norm": 0.9183114171028137,
516
- "learning_rate": 0.00011378091872791521,
517
- "loss": 0.9793,
518
- "mean_token_accuracy": 0.746444970369339,
519
- "num_tokens": 263412.0,
520
  "step": 255
521
  },
522
  {
523
- "entropy": 1.1210540890693665,
524
- "epoch": 0.45178105994787143,
525
- "grad_norm": 0.6462275385856628,
526
- "learning_rate": 0.00011201413427561838,
527
- "loss": 1.0733,
528
- "mean_token_accuracy": 0.7240573465824127,
529
- "num_tokens": 269265.0,
530
  "step": 260
531
  },
532
  {
533
- "entropy": 1.0752209186553956,
534
- "epoch": 0.4604691572545613,
535
- "grad_norm": 0.5868723392486572,
536
- "learning_rate": 0.00011024734982332157,
537
- "loss": 1.035,
538
- "mean_token_accuracy": 0.7398930370807648,
539
- "num_tokens": 275181.0,
540
  "step": 265
541
  },
542
  {
543
- "entropy": 1.0981005787849427,
544
- "epoch": 0.46915725456125107,
545
- "grad_norm": 0.6965936422348022,
546
- "learning_rate": 0.00010848056537102473,
547
- "loss": 0.9941,
548
- "mean_token_accuracy": 0.7382525444030762,
549
- "num_tokens": 281653.0,
550
  "step": 270
551
  },
552
  {
553
- "entropy": 1.2062179446220398,
554
- "epoch": 0.4778453518679409,
555
- "grad_norm": 0.5782831311225891,
556
- "learning_rate": 0.00010671378091872792,
557
- "loss": 1.1559,
558
- "mean_token_accuracy": 0.7252971649169921,
559
- "num_tokens": 288339.0,
560
  "step": 275
561
  },
562
  {
563
- "entropy": 0.9347725868225097,
564
- "epoch": 0.48653344917463076,
565
- "grad_norm": 0.8672428131103516,
566
- "learning_rate": 0.00010494699646643109,
567
- "loss": 0.8588,
568
- "mean_token_accuracy": 0.7657521843910218,
569
- "num_tokens": 292550.0,
570
  "step": 280
571
  },
572
  {
573
- "entropy": 0.9903082251548767,
574
- "epoch": 0.4952215464813206,
575
- "grad_norm": 0.8236942291259766,
576
- "learning_rate": 0.00010318021201413429,
577
- "loss": 0.9742,
578
- "mean_token_accuracy": 0.759516978263855,
579
- "num_tokens": 297351.0,
580
  "step": 285
581
  },
582
  {
583
- "entropy": 0.9541885316371918,
584
- "epoch": 0.5039096437880104,
585
- "grad_norm": 0.8054157495498657,
586
- "learning_rate": 0.00010141342756183747,
587
- "loss": 0.8955,
588
- "mean_token_accuracy": 0.7546023488044739,
589
- "num_tokens": 302375.0,
590
  "step": 290
591
  },
592
  {
593
- "entropy": 1.087007749080658,
594
- "epoch": 0.5125977410947002,
595
- "grad_norm": 0.7063644528388977,
596
- "learning_rate": 9.964664310954064e-05,
597
- "loss": 1.0472,
598
- "mean_token_accuracy": 0.7329276382923127,
599
- "num_tokens": 307949.0,
600
  "step": 295
601
  },
602
  {
603
- "entropy": 1.0150154650211334,
604
- "epoch": 0.5212858384013901,
605
- "grad_norm": 0.6294690370559692,
606
- "learning_rate": 9.787985865724382e-05,
607
- "loss": 0.9532,
608
- "mean_token_accuracy": 0.7574328124523163,
609
- "num_tokens": 313937.0,
610
  "step": 300
611
  },
612
  {
613
- "entropy": 0.9173893213272095,
614
- "epoch": 0.5299739357080799,
615
- "grad_norm": 0.7809085845947266,
616
- "learning_rate": 9.611307420494699e-05,
617
- "loss": 0.858,
618
- "mean_token_accuracy": 0.7763119876384735,
619
- "num_tokens": 319961.0,
620
  "step": 305
621
  },
622
  {
623
- "entropy": 0.9054305255413055,
624
- "epoch": 0.5386620330147698,
625
- "grad_norm": 0.6364769339561462,
626
- "learning_rate": 9.434628975265019e-05,
627
- "loss": 0.8299,
628
- "mean_token_accuracy": 0.7723350107669831,
629
- "num_tokens": 325308.0,
630
  "step": 310
631
  },
632
  {
633
- "entropy": 1.1210152804851532,
634
- "epoch": 0.5473501303214596,
635
- "grad_norm": 0.5350526571273804,
636
- "learning_rate": 9.257950530035337e-05,
637
- "loss": 1.0869,
638
- "mean_token_accuracy": 0.7202603399753571,
639
- "num_tokens": 330646.0,
640
  "step": 315
641
  },
642
  {
643
- "entropy": 1.2845207929611206,
644
- "epoch": 0.5560382276281495,
645
- "grad_norm": 0.7305357456207275,
646
- "learning_rate": 9.081272084805655e-05,
647
- "loss": 1.2433,
648
- "mean_token_accuracy": 0.6946423172950744,
649
- "num_tokens": 335970.0,
650
  "step": 320
651
  },
652
  {
653
- "entropy": 1.1076598703861236,
654
- "epoch": 0.5647263249348393,
655
- "grad_norm": 0.7619346380233765,
656
- "learning_rate": 8.904593639575972e-05,
657
- "loss": 1.0772,
658
- "mean_token_accuracy": 0.7353523135185241,
659
- "num_tokens": 340882.0,
660
  "step": 325
661
  },
662
  {
663
- "entropy": 1.0865317761898041,
664
- "epoch": 0.573414422241529,
665
- "grad_norm": 0.7432613968849182,
666
- "learning_rate": 8.72791519434629e-05,
667
- "loss": 0.9719,
668
- "mean_token_accuracy": 0.7420823752880097,
669
- "num_tokens": 347322.0,
670
  "step": 330
671
  },
672
  {
673
- "entropy": 1.0196545660495757,
674
- "epoch": 0.5821025195482189,
675
- "grad_norm": 0.8102108240127563,
676
- "learning_rate": 8.551236749116608e-05,
677
- "loss": 1.0188,
678
- "mean_token_accuracy": 0.737479317188263,
679
- "num_tokens": 352255.0,
680
  "step": 335
681
  },
682
  {
683
- "entropy": 0.9597744405269623,
684
- "epoch": 0.5907906168549087,
685
- "grad_norm": 0.5975119471549988,
686
- "learning_rate": 8.374558303886925e-05,
687
- "loss": 0.9507,
688
- "mean_token_accuracy": 0.7520730376243592,
689
- "num_tokens": 358224.0,
690
  "step": 340
691
  },
692
  {
693
- "entropy": 1.0088558495044708,
694
- "epoch": 0.5994787141615986,
695
- "grad_norm": 0.6641438603401184,
696
- "learning_rate": 8.197879858657245e-05,
697
- "loss": 0.8871,
698
- "mean_token_accuracy": 0.7555708646774292,
699
- "num_tokens": 363628.0,
700
  "step": 345
701
  },
702
  {
703
- "entropy": 0.8582567512989044,
704
- "epoch": 0.6081668114682884,
705
- "grad_norm": 0.6896267533302307,
706
- "learning_rate": 8.021201413427563e-05,
707
- "loss": 0.7596,
708
- "mean_token_accuracy": 0.7876878619194031,
709
- "num_tokens": 368584.0,
710
  "step": 350
711
  },
712
  {
713
- "entropy": 1.0959418714046478,
714
- "epoch": 0.6168549087749783,
715
- "grad_norm": 0.7268755435943604,
716
- "learning_rate": 7.844522968197881e-05,
717
- "loss": 1.0818,
718
- "mean_token_accuracy": 0.7426620662212372,
719
- "num_tokens": 375123.0,
720
  "step": 355
721
  },
722
  {
723
- "entropy": 0.939378696680069,
724
- "epoch": 0.6255430060816681,
725
- "grad_norm": 0.6950759887695312,
726
- "learning_rate": 7.667844522968198e-05,
727
- "loss": 0.8691,
728
- "mean_token_accuracy": 0.7680276036262512,
729
- "num_tokens": 379749.0,
730
  "step": 360
731
  },
732
  {
733
- "entropy": 0.9272566437721252,
734
- "epoch": 0.634231103388358,
735
- "grad_norm": 0.7108306884765625,
736
- "learning_rate": 7.491166077738516e-05,
737
- "loss": 0.874,
738
- "mean_token_accuracy": 0.7642870903015136,
739
- "num_tokens": 384217.0,
740
  "step": 365
741
  },
742
  {
743
- "entropy": 0.966171669960022,
744
- "epoch": 0.6429192006950478,
745
- "grad_norm": 0.7516181468963623,
746
- "learning_rate": 7.314487632508834e-05,
747
- "loss": 0.9207,
748
- "mean_token_accuracy": 0.7558039426803589,
749
- "num_tokens": 389545.0,
750
  "step": 370
751
  },
752
  {
753
- "entropy": 0.9690821290016174,
754
- "epoch": 0.6516072980017377,
755
- "grad_norm": 0.8500149846076965,
756
- "learning_rate": 7.137809187279151e-05,
757
- "loss": 0.8986,
758
- "mean_token_accuracy": 0.7697801053524017,
759
- "num_tokens": 395014.0,
760
  "step": 375
761
  },
762
  {
763
- "entropy": 0.9449369788169861,
764
- "epoch": 0.6602953953084274,
765
- "grad_norm": 0.7869531512260437,
766
- "learning_rate": 6.96113074204947e-05,
767
- "loss": 0.8541,
768
- "mean_token_accuracy": 0.7647368013858795,
769
- "num_tokens": 399347.0,
770
  "step": 380
771
  },
772
  {
773
- "entropy": 1.011014348268509,
774
- "epoch": 0.6689834926151172,
775
- "grad_norm": 0.7705890536308289,
776
- "learning_rate": 6.784452296819789e-05,
777
- "loss": 1.0432,
778
- "mean_token_accuracy": 0.734080308675766,
779
- "num_tokens": 404755.0,
780
  "step": 385
781
  },
782
  {
783
- "entropy": 0.9402169823646546,
784
- "epoch": 0.6776715899218071,
785
- "grad_norm": 0.7126161456108093,
786
- "learning_rate": 6.607773851590107e-05,
787
- "loss": 0.8886,
788
- "mean_token_accuracy": 0.764913672208786,
789
- "num_tokens": 409721.0,
790
  "step": 390
791
  },
792
  {
793
- "entropy": 1.0488521814346314,
794
- "epoch": 0.6863596872284969,
795
- "grad_norm": 0.8272379040718079,
796
- "learning_rate": 6.431095406360424e-05,
797
- "loss": 0.9588,
798
- "mean_token_accuracy": 0.748864209651947,
799
- "num_tokens": 415299.0,
800
  "step": 395
801
  },
802
  {
803
- "entropy": 1.1054470241069794,
804
- "epoch": 0.6950477845351868,
805
- "grad_norm": 0.7811095118522644,
806
- "learning_rate": 6.254416961130742e-05,
807
- "loss": 1.0601,
808
- "mean_token_accuracy": 0.7348082840442658,
809
- "num_tokens": 419272.0,
810
  "step": 400
811
  },
812
  {
813
- "entropy": 1.2347867608070373,
814
- "epoch": 0.7037358818418766,
815
- "grad_norm": 0.496404230594635,
816
- "learning_rate": 6.07773851590106e-05,
817
- "loss": 1.2019,
818
- "mean_token_accuracy": 0.7068372428417206,
819
- "num_tokens": 425168.0,
820
  "step": 405
821
  },
822
  {
823
- "entropy": 1.0330038726329804,
824
- "epoch": 0.7124239791485665,
825
- "grad_norm": 0.874782145023346,
826
- "learning_rate": 5.901060070671378e-05,
827
- "loss": 0.8853,
828
- "mean_token_accuracy": 0.7520223379135131,
829
- "num_tokens": 428786.0,
830
  "step": 410
831
  },
832
  {
833
- "entropy": 0.9399411380290985,
834
- "epoch": 0.7211120764552563,
835
- "grad_norm": 0.5931876301765442,
836
- "learning_rate": 5.724381625441696e-05,
837
- "loss": 0.8794,
838
- "mean_token_accuracy": 0.7625872433185578,
839
- "num_tokens": 434022.0,
840
  "step": 415
841
  },
842
  {
843
- "entropy": 0.9859249532222748,
844
- "epoch": 0.7298001737619462,
845
- "grad_norm": 0.8090170621871948,
846
- "learning_rate": 5.547703180212014e-05,
847
- "loss": 1.0024,
848
- "mean_token_accuracy": 0.7403703987598419,
849
- "num_tokens": 439523.0,
850
  "step": 420
851
  },
852
  {
853
- "entropy": 1.0710622251033783,
854
- "epoch": 0.738488271068636,
855
- "grad_norm": 0.7571151852607727,
856
- "learning_rate": 5.371024734982333e-05,
857
- "loss": 1.0121,
858
- "mean_token_accuracy": 0.7399603426456451,
859
- "num_tokens": 445332.0,
860
  "step": 425
861
  },
862
  {
863
- "entropy": 1.1122443795204162,
864
- "epoch": 0.7471763683753258,
865
- "grad_norm": 0.842430055141449,
866
- "learning_rate": 5.194346289752651e-05,
867
- "loss": 1.0777,
868
- "mean_token_accuracy": 0.734097695350647,
869
- "num_tokens": 450418.0,
870
  "step": 430
871
  },
872
  {
873
- "entropy": 0.9799317717552185,
874
- "epoch": 0.7558644656820156,
875
- "grad_norm": 0.7271431088447571,
876
- "learning_rate": 5.0176678445229686e-05,
877
- "loss": 0.9345,
878
- "mean_token_accuracy": 0.757810401916504,
879
- "num_tokens": 455068.0,
880
  "step": 435
881
  },
882
  {
883
- "entropy": 1.0104888319969176,
884
- "epoch": 0.7645525629887054,
885
- "grad_norm": 0.7085432410240173,
886
- "learning_rate": 4.840989399293286e-05,
887
- "loss": 0.9445,
888
- "mean_token_accuracy": 0.7545992910861969,
889
- "num_tokens": 460628.0,
890
  "step": 440
891
  },
892
  {
893
- "entropy": 0.9711742639541626,
894
- "epoch": 0.7732406602953953,
895
- "grad_norm": 0.7456789016723633,
896
- "learning_rate": 4.664310954063604e-05,
897
- "loss": 0.8805,
898
- "mean_token_accuracy": 0.7588825583457947,
899
- "num_tokens": 466763.0,
900
  "step": 445
901
  },
902
  {
903
- "entropy": 1.0628331184387207,
904
- "epoch": 0.7819287576020851,
905
- "grad_norm": 0.7692188620567322,
906
- "learning_rate": 4.4876325088339225e-05,
907
- "loss": 1.0096,
908
- "mean_token_accuracy": 0.7396968126296997,
909
- "num_tokens": 472278.0,
910
  "step": 450
911
  },
912
  {
913
- "entropy": 0.9426181256771088,
914
- "epoch": 0.790616854908775,
915
- "grad_norm": 1.063111424446106,
916
- "learning_rate": 4.310954063604241e-05,
917
- "loss": 0.8868,
918
- "mean_token_accuracy": 0.7644854426383972,
919
- "num_tokens": 476684.0,
920
  "step": 455
921
  },
922
  {
923
- "entropy": 0.9162843346595764,
924
- "epoch": 0.7993049522154648,
925
- "grad_norm": 0.594411313533783,
926
- "learning_rate": 4.134275618374558e-05,
927
- "loss": 0.8938,
928
- "mean_token_accuracy": 0.7581522405147553,
929
- "num_tokens": 481714.0,
930
  "step": 460
931
  },
932
  {
933
- "entropy": 0.9312973737716674,
934
- "epoch": 0.8079930495221547,
935
- "grad_norm": 0.749118983745575,
936
- "learning_rate": 3.9575971731448765e-05,
937
- "loss": 0.8893,
938
- "mean_token_accuracy": 0.7571979582309722,
939
- "num_tokens": 486388.0,
940
  "step": 465
941
  },
942
  {
943
- "entropy": 0.9682565927505493,
944
- "epoch": 0.8166811468288445,
945
- "grad_norm": 0.7657246589660645,
946
- "learning_rate": 3.780918727915195e-05,
947
- "loss": 0.8794,
948
- "mean_token_accuracy": 0.7685398876667022,
949
- "num_tokens": 491228.0,
950
  "step": 470
951
  },
952
  {
953
- "entropy": 1.1628508567810059,
954
- "epoch": 0.8253692441355344,
955
- "grad_norm": 0.6811634302139282,
956
- "learning_rate": 3.604240282685513e-05,
957
- "loss": 1.0503,
958
- "mean_token_accuracy": 0.736689418554306,
959
- "num_tokens": 496097.0,
960
  "step": 475
961
  },
962
  {
963
- "entropy": 1.0763611137866973,
964
- "epoch": 0.8340573414422241,
965
- "grad_norm": 0.6886923909187317,
966
- "learning_rate": 3.4275618374558305e-05,
967
- "loss": 1.0801,
968
- "mean_token_accuracy": 0.7297431588172912,
969
- "num_tokens": 502110.0,
970
  "step": 480
971
  },
972
  {
973
- "entropy": 1.0470020353794098,
974
- "epoch": 0.8427454387489139,
975
- "grad_norm": 0.7665517926216125,
976
- "learning_rate": 3.250883392226148e-05,
977
- "loss": 0.9489,
978
- "mean_token_accuracy": 0.7415068805217743,
979
- "num_tokens": 507210.0,
980
  "step": 485
981
  },
982
  {
983
- "entropy": 1.0041724681854247,
984
- "epoch": 0.8514335360556038,
985
- "grad_norm": 0.7603459358215332,
986
- "learning_rate": 3.074204946996467e-05,
987
- "loss": 0.9377,
988
- "mean_token_accuracy": 0.753987443447113,
989
- "num_tokens": 511823.0,
990
  "step": 490
991
  },
992
  {
993
- "entropy": 0.9953260302543641,
994
- "epoch": 0.8601216333622936,
995
- "grad_norm": 0.733259916305542,
996
- "learning_rate": 2.8975265017667848e-05,
997
- "loss": 0.9727,
998
- "mean_token_accuracy": 0.7504080295562744,
999
- "num_tokens": 516499.0,
1000
  "step": 495
1001
  },
1002
  {
1003
- "entropy": 0.973844712972641,
1004
- "epoch": 0.8688097306689835,
1005
- "grad_norm": 0.7084928750991821,
1006
- "learning_rate": 2.7208480565371023e-05,
1007
- "loss": 0.8981,
1008
- "mean_token_accuracy": 0.7571015357971191,
1009
- "num_tokens": 520718.0,
1010
  "step": 500
1011
  },
1012
  {
1013
- "entropy": 1.0785621047019958,
1014
- "epoch": 0.8774978279756733,
1015
- "grad_norm": 0.658848226070404,
1016
- "learning_rate": 2.5441696113074202e-05,
1017
- "loss": 1.0496,
1018
- "mean_token_accuracy": 0.7314722836017609,
1019
- "num_tokens": 526895.0,
1020
  "step": 505
1021
  },
1022
  {
1023
- "entropy": 0.9542811274528503,
1024
- "epoch": 0.8861859252823632,
1025
- "grad_norm": 0.7599547505378723,
1026
- "learning_rate": 2.3674911660777384e-05,
1027
- "loss": 0.8773,
1028
- "mean_token_accuracy": 0.764108294248581,
1029
- "num_tokens": 532482.0,
1030
  "step": 510
1031
  },
1032
  {
1033
- "entropy": 1.0099429547786714,
1034
- "epoch": 0.894874022589053,
1035
- "grad_norm": 0.7237294316291809,
1036
- "learning_rate": 2.1908127208480567e-05,
1037
- "loss": 0.9315,
1038
- "mean_token_accuracy": 0.7524258732795716,
1039
- "num_tokens": 538357.0,
1040
  "step": 515
1041
  },
1042
  {
1043
- "entropy": 0.9563346862792969,
1044
- "epoch": 0.9035621198957429,
1045
- "grad_norm": 0.7831231951713562,
1046
- "learning_rate": 2.0141342756183745e-05,
1047
- "loss": 0.8473,
1048
- "mean_token_accuracy": 0.7614937841892242,
1049
- "num_tokens": 542903.0,
1050
  "step": 520
1051
  },
1052
  {
1053
- "entropy": 1.1394842267036438,
1054
- "epoch": 0.9122502172024327,
1055
- "grad_norm": 0.7991846203804016,
1056
- "learning_rate": 1.8374558303886928e-05,
1057
- "loss": 1.0732,
1058
- "mean_token_accuracy": 0.7291809320449829,
1059
- "num_tokens": 548280.0,
1060
  "step": 525
1061
  },
1062
  {
1063
- "entropy": 1.0433153212070465,
1064
- "epoch": 0.9209383145091226,
1065
- "grad_norm": 0.8753973245620728,
1066
- "learning_rate": 1.6607773851590106e-05,
1067
- "loss": 1.0023,
1068
- "mean_token_accuracy": 0.7437108099460602,
1069
- "num_tokens": 554270.0,
1070
  "step": 530
1071
  },
1072
  {
1073
- "entropy": 1.0431060135364532,
1074
- "epoch": 0.9296264118158123,
1075
- "grad_norm": 0.8111518025398254,
1076
- "learning_rate": 1.4840989399293287e-05,
1077
- "loss": 1.0001,
1078
- "mean_token_accuracy": 0.7341843962669372,
1079
- "num_tokens": 558806.0,
1080
  "step": 535
1081
  },
1082
  {
1083
- "entropy": 0.9852993071079255,
1084
- "epoch": 0.9383145091225021,
1085
- "grad_norm": 0.8921456336975098,
1086
- "learning_rate": 1.3074204946996469e-05,
1087
- "loss": 0.8971,
1088
- "mean_token_accuracy": 0.7646882355213165,
1089
- "num_tokens": 563251.0,
1090
  "step": 540
1091
  },
1092
  {
1093
- "entropy": 0.9689121782779694,
1094
- "epoch": 0.947002606429192,
1095
- "grad_norm": 0.6307923793792725,
1096
- "learning_rate": 1.1307420494699646e-05,
1097
- "loss": 0.9439,
1098
- "mean_token_accuracy": 0.7631863057613373,
1099
- "num_tokens": 568462.0,
1100
  "step": 545
1101
  },
1102
  {
1103
- "entropy": 0.8768801867961884,
1104
- "epoch": 0.9556907037358818,
1105
- "grad_norm": 0.6335266828536987,
1106
- "learning_rate": 9.540636042402827e-06,
1107
- "loss": 0.8548,
1108
- "mean_token_accuracy": 0.773711097240448,
1109
- "num_tokens": 573579.0,
1110
  "step": 550
1111
  },
1112
  {
1113
- "entropy": 0.8275218904018402,
1114
- "epoch": 0.9643788010425717,
1115
- "grad_norm": 0.9174020886421204,
1116
- "learning_rate": 7.773851590106007e-06,
1117
- "loss": 0.7051,
1118
- "mean_token_accuracy": 0.7934583604335785,
1119
- "num_tokens": 577791.0,
1120
  "step": 555
1121
  },
1122
  {
1123
- "entropy": 1.0373184978961945,
1124
- "epoch": 0.9730668983492615,
1125
- "grad_norm": 0.9197268486022949,
1126
- "learning_rate": 6.007067137809187e-06,
1127
- "loss": 0.9661,
1128
- "mean_token_accuracy": 0.7519765794277191,
1129
- "num_tokens": 582350.0,
1130
  "step": 560
1131
  },
1132
  {
1133
- "entropy": 1.0240559220314025,
1134
- "epoch": 0.9817549956559514,
1135
- "grad_norm": 0.7955470681190491,
1136
- "learning_rate": 4.240282685512368e-06,
1137
- "loss": 0.9094,
1138
- "mean_token_accuracy": 0.756653618812561,
1139
- "num_tokens": 588229.0,
1140
  "step": 565
1141
  },
1142
  {
1143
- "entropy": 0.9118911564350128,
1144
- "epoch": 0.9904430929626412,
1145
- "grad_norm": 0.6962121725082397,
1146
- "learning_rate": 2.473498233215548e-06,
1147
- "loss": 0.8338,
1148
- "mean_token_accuracy": 0.7678785204887391,
1149
- "num_tokens": 593015.0,
1150
  "step": 570
1151
- },
1152
- {
1153
- "entropy": 1.070508062839508,
1154
- "epoch": 0.9991311902693311,
1155
- "grad_norm": 0.6025918126106262,
1156
- "learning_rate": 7.067137809187279e-07,
1157
- "loss": 1.0566,
1158
- "mean_token_accuracy": 0.7361152768135071,
1159
- "num_tokens": 598486.0,
1160
- "step": 575
1161
  }
1162
  ],
1163
  "logging_steps": 5,
1164
- "max_steps": 576,
1165
  "num_input_tokens_seen": 0,
1166
  "num_train_epochs": 1,
1167
  "save_steps": 500,
@@ -1177,7 +1167,7 @@
1177
  "attributes": {}
1178
  }
1179
  },
1180
- "total_flos": 2.712525380517888e+16,
1181
  "train_batch_size": 1,
1182
  "trial_name": null,
1183
  "trial_params": null
 
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
+ "eval_steps": 115,
7
+ "global_step": 572,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3924612522125244,
14
+ "epoch": 0.008741258741258742,
15
+ "grad_norm": 0.9623214602470398,
16
  "learning_rate": 8e-05,
17
+ "loss": 1.3775,
18
+ "mean_token_accuracy": 0.6655579686164856,
19
+ "num_tokens": 4268.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.3508465528488158,
24
+ "epoch": 0.017482517482517484,
25
+ "grad_norm": 0.9386249780654907,
26
  "learning_rate": 0.00018,
27
+ "loss": 1.1938,
28
+ "mean_token_accuracy": 0.7008997738361359,
29
+ "num_tokens": 9206.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.4163436055183412,
34
+ "epoch": 0.026223776223776224,
35
+ "grad_norm": 1.047428011894226,
36
+ "learning_rate": 0.00019857651245551604,
37
+ "loss": 1.2855,
38
+ "mean_token_accuracy": 0.6798348546028137,
39
+ "num_tokens": 13655.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.3434713006019592,
44
+ "epoch": 0.03496503496503497,
45
+ "grad_norm": 0.7252029180526733,
46
+ "learning_rate": 0.00019679715302491104,
47
+ "loss": 1.2563,
48
+ "mean_token_accuracy": 0.6956509709358215,
49
+ "num_tokens": 19321.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.1038120150566102,
54
+ "epoch": 0.043706293706293704,
55
+ "grad_norm": 1.1018619537353516,
56
+ "learning_rate": 0.00019501779359430604,
57
+ "loss": 1.0152,
58
+ "mean_token_accuracy": 0.7334934413433075,
59
+ "num_tokens": 24098.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.1559201896190643,
64
+ "epoch": 0.05244755244755245,
65
+ "grad_norm": 0.7375137209892273,
66
+ "learning_rate": 0.0001932384341637011,
67
+ "loss": 1.0187,
68
+ "mean_token_accuracy": 0.736066859960556,
69
+ "num_tokens": 28311.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.251962125301361,
74
+ "epoch": 0.06118881118881119,
75
+ "grad_norm": 0.9549528956413269,
76
+ "learning_rate": 0.0001914590747330961,
77
+ "loss": 1.1877,
78
+ "mean_token_accuracy": 0.7039336919784546,
79
+ "num_tokens": 33549.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.2189816296100617,
84
+ "epoch": 0.06993006993006994,
85
+ "grad_norm": 0.7660185694694519,
86
+ "learning_rate": 0.00018967971530249112,
87
+ "loss": 1.1063,
88
+ "mean_token_accuracy": 0.714855033159256,
89
+ "num_tokens": 38668.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.2883310735225677,
94
+ "epoch": 0.07867132867132867,
95
+ "grad_norm": 0.9150952696800232,
96
+ "learning_rate": 0.00018790035587188612,
97
+ "loss": 1.2873,
98
+ "mean_token_accuracy": 0.6970557630062103,
99
+ "num_tokens": 44197.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.1909499704837798,
104
+ "epoch": 0.08741258741258741,
105
+ "grad_norm": 0.6662327647209167,
106
+ "learning_rate": 0.00018612099644128114,
107
+ "loss": 1.1576,
108
+ "mean_token_accuracy": 0.7179031908512116,
109
+ "num_tokens": 49908.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.3348850965499879,
114
+ "epoch": 0.09615384615384616,
115
+ "grad_norm": 0.8849563002586365,
116
+ "learning_rate": 0.00018434163701067617,
117
+ "loss": 1.1603,
118
+ "mean_token_accuracy": 0.7022291779518127,
119
+ "num_tokens": 54191.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.2066528439521789,
124
+ "epoch": 0.1048951048951049,
125
+ "grad_norm": 0.8784617781639099,
126
+ "learning_rate": 0.0001825622775800712,
127
+ "loss": 1.1539,
128
+ "mean_token_accuracy": 0.7269383847713471,
129
+ "num_tokens": 59562.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.129963719844818,
134
+ "epoch": 0.11363636363636363,
135
+ "grad_norm": 0.774686336517334,
136
+ "learning_rate": 0.0001807829181494662,
137
+ "loss": 1.0373,
138
+ "mean_token_accuracy": 0.7341378688812256,
139
+ "num_tokens": 65018.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.276548171043396,
144
+ "epoch": 0.12237762237762238,
145
+ "grad_norm": 0.7157500386238098,
146
+ "learning_rate": 0.0001790035587188612,
147
+ "loss": 1.1742,
148
+ "mean_token_accuracy": 0.7058017492294312,
149
+ "num_tokens": 70812.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.0836508870124817,
154
+ "epoch": 0.13111888111888112,
155
+ "grad_norm": 0.7531887292861938,
156
+ "learning_rate": 0.00017722419928825625,
157
+ "loss": 0.9649,
158
+ "mean_token_accuracy": 0.748905599117279,
159
+ "num_tokens": 76106.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.0430119216442109,
164
+ "epoch": 0.13986013986013987,
165
+ "grad_norm": 0.6007382869720459,
166
+ "learning_rate": 0.00017544483985765125,
167
+ "loss": 0.9445,
168
+ "mean_token_accuracy": 0.7515589416027069,
169
+ "num_tokens": 81557.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.1489889979362489,
174
+ "epoch": 0.1486013986013986,
175
+ "grad_norm": 0.7831665277481079,
176
+ "learning_rate": 0.00017366548042704627,
177
+ "loss": 1.0943,
178
+ "mean_token_accuracy": 0.7280911147594452,
179
+ "num_tokens": 86442.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.0606273770332337,
184
+ "epoch": 0.15734265734265734,
185
+ "grad_norm": 0.7512551546096802,
186
+ "learning_rate": 0.00017188612099644127,
187
+ "loss": 0.9434,
188
+ "mean_token_accuracy": 0.7480818212032319,
189
+ "num_tokens": 90404.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.1492775142192841,
194
+ "epoch": 0.1660839160839161,
195
+ "grad_norm": 0.5179319381713867,
196
+ "learning_rate": 0.00017010676156583633,
197
+ "loss": 1.0696,
198
+ "mean_token_accuracy": 0.7424242496490479,
199
+ "num_tokens": 97002.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.194596391916275,
204
+ "epoch": 0.17482517482517482,
205
+ "grad_norm": 0.7215603590011597,
206
+ "learning_rate": 0.00016832740213523133,
207
+ "loss": 1.1231,
208
+ "mean_token_accuracy": 0.7197276711463928,
209
+ "num_tokens": 101935.0,
210
  "step": 100
211
  },
212
  {
213
+ "entropy": 1.048801952600479,
214
+ "epoch": 0.18356643356643357,
215
+ "grad_norm": 0.9170930981636047,
216
+ "learning_rate": 0.00016654804270462633,
217
+ "loss": 0.9718,
218
+ "mean_token_accuracy": 0.7438604295253753,
219
+ "num_tokens": 107692.0,
220
  "step": 105
221
  },
222
  {
223
+ "entropy": 1.2630416095256805,
224
+ "epoch": 0.19230769230769232,
225
+ "grad_norm": 0.6975880861282349,
226
+ "learning_rate": 0.00016476868327402135,
227
+ "loss": 1.1672,
228
+ "mean_token_accuracy": 0.7042996108531951,
229
+ "num_tokens": 113041.0,
230
  "step": 110
231
  },
232
  {
233
+ "entropy": 1.170883482694626,
234
+ "epoch": 0.20104895104895104,
235
+ "grad_norm": 1.2549158334732056,
236
+ "learning_rate": 0.00016298932384341638,
237
+ "loss": 1.1129,
238
+ "mean_token_accuracy": 0.7294944524765015,
239
+ "num_tokens": 118065.0,
240
  "step": 115
241
  },
242
  {
243
+ "entropy": 1.2757395565509797,
244
+ "epoch": 0.2097902097902098,
245
+ "grad_norm": 0.7007513046264648,
246
+ "learning_rate": 0.0001612099644128114,
247
+ "loss": 1.2333,
248
+ "mean_token_accuracy": 0.7045675635337829,
249
+ "num_tokens": 123502.0,
250
  "step": 120
251
  },
252
  {
253
+ "entropy": 1.102766215801239,
254
+ "epoch": 0.21853146853146854,
255
+ "grad_norm": 0.6966100931167603,
256
+ "learning_rate": 0.0001594306049822064,
257
+ "loss": 0.9903,
258
+ "mean_token_accuracy": 0.7430883646011353,
259
+ "num_tokens": 128417.0,
260
  "step": 125
261
  },
262
  {
263
+ "entropy": 1.0878133654594422,
264
+ "epoch": 0.22727272727272727,
265
+ "grad_norm": 0.5765619277954102,
266
+ "learning_rate": 0.00015765124555160143,
267
+ "loss": 1.0547,
268
+ "mean_token_accuracy": 0.7235966801643372,
269
+ "num_tokens": 134955.0,
270
  "step": 130
271
  },
272
  {
273
+ "entropy": 1.059841650724411,
274
+ "epoch": 0.23601398601398602,
275
+ "grad_norm": 0.6278873085975647,
276
+ "learning_rate": 0.00015587188612099646,
277
+ "loss": 1.0116,
278
+ "mean_token_accuracy": 0.7435504794120789,
279
+ "num_tokens": 140785.0,
280
  "step": 135
281
  },
282
  {
283
+ "entropy": 1.0601991772651673,
284
+ "epoch": 0.24475524475524477,
285
+ "grad_norm": 0.7335526943206787,
286
+ "learning_rate": 0.00015409252669039148,
287
+ "loss": 0.9323,
288
+ "mean_token_accuracy": 0.7478764116764068,
289
+ "num_tokens": 145727.0,
290
  "step": 140
291
  },
292
  {
293
+ "entropy": 1.1874103128910065,
294
+ "epoch": 0.2534965034965035,
295
+ "grad_norm": 0.5723336338996887,
296
+ "learning_rate": 0.00015231316725978648,
297
+ "loss": 1.0757,
298
+ "mean_token_accuracy": 0.7186325311660766,
299
+ "num_tokens": 151867.0,
300
  "step": 145
301
  },
302
  {
303
+ "entropy": 0.9424997448921204,
304
+ "epoch": 0.26223776223776224,
305
+ "grad_norm": 0.8389629125595093,
306
+ "learning_rate": 0.00015053380782918148,
307
+ "loss": 0.8844,
308
+ "mean_token_accuracy": 0.7715938806533813,
309
+ "num_tokens": 155296.0,
310
  "step": 150
311
  },
312
  {
313
+ "entropy": 1.0623292565345763,
314
+ "epoch": 0.270979020979021,
315
+ "grad_norm": 0.7301695942878723,
316
+ "learning_rate": 0.00014875444839857654,
317
+ "loss": 0.9899,
318
+ "mean_token_accuracy": 0.7323084354400635,
319
+ "num_tokens": 160654.0,
320
  "step": 155
321
  },
322
  {
323
+ "entropy": 1.065491944551468,
324
+ "epoch": 0.27972027972027974,
325
+ "grad_norm": 0.7877907156944275,
326
+ "learning_rate": 0.00014697508896797153,
327
+ "loss": 0.9466,
328
+ "mean_token_accuracy": 0.7487669110298156,
329
+ "num_tokens": 165603.0,
330
  "step": 160
331
  },
332
  {
333
+ "entropy": 1.1675564229488373,
334
+ "epoch": 0.28846153846153844,
335
+ "grad_norm": 0.8203403949737549,
336
+ "learning_rate": 0.00014519572953736656,
337
+ "loss": 1.0758,
338
+ "mean_token_accuracy": 0.7346278429031372,
339
+ "num_tokens": 171119.0,
340
  "step": 165
341
  },
342
  {
343
+ "entropy": 1.0618612051010132,
344
+ "epoch": 0.2972027972027972,
345
+ "grad_norm": 0.8218940496444702,
346
+ "learning_rate": 0.00014341637010676156,
347
+ "loss": 1.0296,
348
+ "mean_token_accuracy": 0.7328690826892853,
349
+ "num_tokens": 176227.0,
350
  "step": 170
351
  },
352
  {
353
+ "entropy": 1.193172001838684,
354
+ "epoch": 0.30594405594405594,
355
+ "grad_norm": 0.9550092816352844,
356
+ "learning_rate": 0.0001416370106761566,
357
+ "loss": 1.1211,
358
+ "mean_token_accuracy": 0.7121898174285889,
359
+ "num_tokens": 181621.0,
360
  "step": 175
361
  },
362
  {
363
+ "entropy": 1.136334627866745,
364
+ "epoch": 0.3146853146853147,
365
+ "grad_norm": 0.6639471650123596,
366
+ "learning_rate": 0.0001398576512455516,
367
+ "loss": 1.0134,
368
+ "mean_token_accuracy": 0.7396367609500885,
369
+ "num_tokens": 188262.0,
370
  "step": 180
371
  },
372
  {
373
+ "entropy": 1.1639393329620362,
374
+ "epoch": 0.32342657342657344,
375
+ "grad_norm": 0.6816486120223999,
376
+ "learning_rate": 0.0001380782918149466,
377
+ "loss": 1.0736,
378
+ "mean_token_accuracy": 0.7224510788917542,
379
+ "num_tokens": 192956.0,
380
  "step": 185
381
  },
382
  {
383
+ "entropy": 1.146146583557129,
384
+ "epoch": 0.3321678321678322,
385
+ "grad_norm": 0.786189079284668,
386
+ "learning_rate": 0.00013629893238434164,
387
+ "loss": 1.0364,
388
+ "mean_token_accuracy": 0.7313450872898102,
389
+ "num_tokens": 197213.0,
390
  "step": 190
391
  },
392
  {
393
+ "entropy": 1.10458744764328,
394
+ "epoch": 0.3409090909090909,
395
+ "grad_norm": 1.0277358293533325,
396
+ "learning_rate": 0.00013451957295373666,
397
+ "loss": 1.0431,
398
+ "mean_token_accuracy": 0.7269207119941712,
399
+ "num_tokens": 201735.0,
400
  "step": 195
401
  },
402
  {
403
+ "entropy": 1.0827986776828766,
404
+ "epoch": 0.34965034965034963,
405
+ "grad_norm": 0.7654422521591187,
406
+ "learning_rate": 0.0001327402135231317,
407
+ "loss": 1.0355,
408
+ "mean_token_accuracy": 0.738901925086975,
409
+ "num_tokens": 206600.0,
410
  "step": 200
411
  },
412
  {
413
+ "entropy": 1.0413719892501831,
414
+ "epoch": 0.3583916083916084,
415
+ "grad_norm": 0.8267967700958252,
416
+ "learning_rate": 0.0001309608540925267,
417
+ "loss": 0.9717,
418
+ "mean_token_accuracy": 0.7459078669548035,
419
+ "num_tokens": 211319.0,
420
  "step": 205
421
  },
422
  {
423
+ "entropy": 0.9956618547439575,
424
+ "epoch": 0.36713286713286714,
425
+ "grad_norm": 0.7114885449409485,
426
+ "learning_rate": 0.00012918149466192172,
427
+ "loss": 0.8975,
428
+ "mean_token_accuracy": 0.7587344646453857,
429
+ "num_tokens": 216407.0,
430
  "step": 210
431
  },
432
  {
433
+ "entropy": 1.201312917470932,
434
+ "epoch": 0.3758741258741259,
435
+ "grad_norm": 0.5830783843994141,
436
+ "learning_rate": 0.00012740213523131672,
437
+ "loss": 1.1477,
438
+ "mean_token_accuracy": 0.7168383121490478,
439
+ "num_tokens": 222016.0,
440
  "step": 215
441
  },
442
  {
443
+ "entropy": 1.1125480353832244,
444
+ "epoch": 0.38461538461538464,
445
+ "grad_norm": 0.6842811107635498,
446
+ "learning_rate": 0.00012562277580071177,
447
+ "loss": 0.982,
448
+ "mean_token_accuracy": 0.7435801923274994,
449
+ "num_tokens": 226748.0,
450
  "step": 220
451
  },
452
  {
453
+ "entropy": 1.1250860214233398,
454
+ "epoch": 0.39335664335664333,
455
+ "grad_norm": 1.392675757408142,
456
+ "learning_rate": 0.00012384341637010677,
457
+ "loss": 1.0523,
458
+ "mean_token_accuracy": 0.7364842057228088,
459
+ "num_tokens": 231513.0,
460
  "step": 225
461
  },
462
  {
463
+ "entropy": 0.9735329568386077,
464
+ "epoch": 0.4020979020979021,
465
+ "grad_norm": 0.8255024552345276,
466
+ "learning_rate": 0.00012206405693950178,
467
+ "loss": 0.9083,
468
+ "mean_token_accuracy": 0.7571049571037293,
469
+ "num_tokens": 235433.0,
470
  "step": 230
471
  },
472
  {
473
+ "entropy": 1.0409073889255525,
474
+ "epoch": 0.41083916083916083,
475
+ "grad_norm": 0.6322015523910522,
476
+ "learning_rate": 0.0001202846975088968,
477
+ "loss": 0.9712,
478
+ "mean_token_accuracy": 0.7544535756111145,
479
+ "num_tokens": 240991.0,
480
  "step": 235
481
  },
482
  {
483
+ "entropy": 0.9808995604515076,
484
+ "epoch": 0.4195804195804196,
485
+ "grad_norm": 0.693168044090271,
486
+ "learning_rate": 0.00011850533807829183,
487
+ "loss": 0.9637,
488
+ "mean_token_accuracy": 0.7572705090045929,
489
+ "num_tokens": 245361.0,
490
  "step": 240
491
  },
492
  {
493
+ "entropy": 1.1916967630386353,
494
+ "epoch": 0.42832167832167833,
495
+ "grad_norm": 0.7691939473152161,
496
+ "learning_rate": 0.00011672597864768685,
497
+ "loss": 1.1378,
498
+ "mean_token_accuracy": 0.7057560324668884,
499
+ "num_tokens": 249896.0,
500
  "step": 245
501
  },
502
  {
503
+ "entropy": 0.9713864088058471,
504
+ "epoch": 0.4370629370629371,
505
+ "grad_norm": 0.6049178838729858,
506
+ "learning_rate": 0.00011494661921708185,
507
+ "loss": 0.9592,
508
+ "mean_token_accuracy": 0.7560720384120941,
509
+ "num_tokens": 255682.0,
510
  "step": 250
511
  },
512
  {
513
+ "entropy": 1.260662978887558,
514
+ "epoch": 0.4458041958041958,
515
+ "grad_norm": 0.7776870131492615,
516
+ "learning_rate": 0.00011316725978647686,
517
+ "loss": 1.1992,
518
+ "mean_token_accuracy": 0.6990963518619537,
519
+ "num_tokens": 261698.0,
520
  "step": 255
521
  },
522
  {
523
+ "entropy": 1.0263409852981566,
524
+ "epoch": 0.45454545454545453,
525
+ "grad_norm": 0.5895385146141052,
526
+ "learning_rate": 0.0001113879003558719,
527
+ "loss": 1.0182,
528
+ "mean_token_accuracy": 0.7378697097301483,
529
+ "num_tokens": 266624.0,
530
  "step": 260
531
  },
532
  {
533
+ "entropy": 1.0448009312152862,
534
+ "epoch": 0.4632867132867133,
535
+ "grad_norm": 0.7714991569519043,
536
+ "learning_rate": 0.00010960854092526691,
537
+ "loss": 0.9675,
538
+ "mean_token_accuracy": 0.7545935451984406,
539
+ "num_tokens": 272155.0,
540
  "step": 265
541
  },
542
  {
543
+ "entropy": 1.009095060825348,
544
+ "epoch": 0.47202797202797203,
545
+ "grad_norm": 0.7107412219047546,
546
+ "learning_rate": 0.00010782918149466192,
547
+ "loss": 0.9022,
548
+ "mean_token_accuracy": 0.7640557646751404,
549
+ "num_tokens": 277590.0,
550
  "step": 270
551
  },
552
  {
553
+ "entropy": 1.085400366783142,
554
+ "epoch": 0.4807692307692308,
555
+ "grad_norm": 0.6840293407440186,
556
+ "learning_rate": 0.00010604982206405694,
557
+ "loss": 1.101,
558
+ "mean_token_accuracy": 0.7363012135028839,
559
+ "num_tokens": 282989.0,
560
  "step": 275
561
  },
562
  {
563
+ "entropy": 1.209915179014206,
564
+ "epoch": 0.48951048951048953,
565
+ "grad_norm": 0.7322263121604919,
566
+ "learning_rate": 0.00010427046263345198,
567
+ "loss": 1.0632,
568
+ "mean_token_accuracy": 0.7248473286628723,
569
+ "num_tokens": 288148.0,
570
  "step": 280
571
  },
572
  {
573
+ "entropy": 1.1313316702842713,
574
+ "epoch": 0.4982517482517482,
575
+ "grad_norm": 0.8790935277938843,
576
+ "learning_rate": 0.00010249110320284699,
577
+ "loss": 1.0362,
578
+ "mean_token_accuracy": 0.7234691977500916,
579
+ "num_tokens": 293421.0,
580
  "step": 285
581
  },
582
  {
583
+ "entropy": 1.0769161105155944,
584
+ "epoch": 0.506993006993007,
585
+ "grad_norm": 0.742671012878418,
586
+ "learning_rate": 0.00010071174377224199,
587
+ "loss": 1.0596,
588
+ "mean_token_accuracy": 0.7369856536388397,
589
+ "num_tokens": 299197.0,
590
  "step": 290
591
  },
592
  {
593
+ "entropy": 1.1410824477672576,
594
+ "epoch": 0.5157342657342657,
595
+ "grad_norm": 0.6181492209434509,
596
+ "learning_rate": 9.893238434163702e-05,
597
+ "loss": 1.165,
598
+ "mean_token_accuracy": 0.7148903965950012,
599
+ "num_tokens": 305681.0,
600
  "step": 295
601
  },
602
  {
603
+ "entropy": 1.1295619785785675,
604
+ "epoch": 0.5244755244755245,
605
+ "grad_norm": 0.6285997033119202,
606
+ "learning_rate": 9.715302491103203e-05,
607
+ "loss": 1.0482,
608
+ "mean_token_accuracy": 0.723493081331253,
609
+ "num_tokens": 312074.0,
610
  "step": 300
611
  },
612
  {
613
+ "entropy": 1.0108375370502471,
614
+ "epoch": 0.5332167832167832,
615
+ "grad_norm": 0.9831832647323608,
616
+ "learning_rate": 9.537366548042705e-05,
617
+ "loss": 0.8795,
618
+ "mean_token_accuracy": 0.7591509163379669,
619
+ "num_tokens": 316386.0,
620
  "step": 305
621
  },
622
  {
623
+ "entropy": 1.0078293979167938,
624
+ "epoch": 0.541958041958042,
625
+ "grad_norm": 0.7532368302345276,
626
+ "learning_rate": 9.359430604982207e-05,
627
+ "loss": 0.9584,
628
+ "mean_token_accuracy": 0.7491445183753968,
629
+ "num_tokens": 322246.0,
630
  "step": 310
631
  },
632
  {
633
+ "entropy": 0.940712821483612,
634
+ "epoch": 0.5506993006993007,
635
+ "grad_norm": 0.8640061020851135,
636
+ "learning_rate": 9.18149466192171e-05,
637
+ "loss": 0.9253,
638
+ "mean_token_accuracy": 0.7581913948059082,
639
+ "num_tokens": 328041.0,
640
  "step": 315
641
  },
642
  {
643
+ "entropy": 0.9539014101028442,
644
+ "epoch": 0.5594405594405595,
645
+ "grad_norm": 0.5698885321617126,
646
+ "learning_rate": 9.00355871886121e-05,
647
+ "loss": 0.8867,
648
+ "mean_token_accuracy": 0.7597615242004394,
649
+ "num_tokens": 332751.0,
650
  "step": 320
651
  },
652
  {
653
+ "entropy": 1.08140572309494,
654
+ "epoch": 0.5681818181818182,
655
+ "grad_norm": 0.5825881361961365,
656
+ "learning_rate": 8.825622775800713e-05,
657
+ "loss": 1.0597,
658
+ "mean_token_accuracy": 0.7322126507759095,
659
+ "num_tokens": 338448.0,
660
  "step": 325
661
  },
662
  {
663
+ "entropy": 1.0642346262931823,
664
+ "epoch": 0.5769230769230769,
665
+ "grad_norm": 0.8457391858100891,
666
+ "learning_rate": 8.647686832740213e-05,
667
+ "loss": 1.0298,
668
+ "mean_token_accuracy": 0.7364085793495179,
669
+ "num_tokens": 343508.0,
670
  "step": 330
671
  },
672
  {
673
+ "entropy": 1.0377025127410888,
674
+ "epoch": 0.5856643356643356,
675
+ "grad_norm": 0.7959486842155457,
676
+ "learning_rate": 8.469750889679716e-05,
677
+ "loss": 0.9248,
678
+ "mean_token_accuracy": 0.757226413488388,
679
+ "num_tokens": 347840.0,
680
  "step": 335
681
  },
682
  {
683
+ "entropy": 1.0676892161369325,
684
+ "epoch": 0.5944055944055944,
685
+ "grad_norm": 0.9492782950401306,
686
+ "learning_rate": 8.291814946619217e-05,
687
+ "loss": 0.9644,
688
+ "mean_token_accuracy": 0.7350347638130188,
689
+ "num_tokens": 353004.0,
690
  "step": 340
691
  },
692
  {
693
+ "entropy": 1.2051751494407654,
694
+ "epoch": 0.6031468531468531,
695
+ "grad_norm": 0.6062285304069519,
696
+ "learning_rate": 8.11387900355872e-05,
697
+ "loss": 1.1306,
698
+ "mean_token_accuracy": 0.71878741979599,
699
+ "num_tokens": 358355.0,
700
  "step": 345
701
  },
702
  {
703
+ "entropy": 0.9939802944660187,
704
+ "epoch": 0.6118881118881119,
705
+ "grad_norm": 0.6014482378959656,
706
+ "learning_rate": 7.935943060498221e-05,
707
+ "loss": 0.9206,
708
+ "mean_token_accuracy": 0.7534485578536987,
709
+ "num_tokens": 363815.0,
710
  "step": 350
711
  },
712
  {
713
+ "entropy": 0.9838183641433715,
714
+ "epoch": 0.6206293706293706,
715
+ "grad_norm": 0.6233981251716614,
716
+ "learning_rate": 7.758007117437722e-05,
717
+ "loss": 0.9557,
718
+ "mean_token_accuracy": 0.7579984903335572,
719
+ "num_tokens": 370209.0,
720
  "step": 355
721
  },
722
  {
723
+ "entropy": 1.1523795008659363,
724
+ "epoch": 0.6293706293706294,
725
+ "grad_norm": 0.9388852119445801,
726
+ "learning_rate": 7.580071174377225e-05,
727
+ "loss": 1.1244,
728
+ "mean_token_accuracy": 0.7127670645713806,
729
+ "num_tokens": 375178.0,
730
  "step": 360
731
  },
732
  {
733
+ "entropy": 1.1256710410118103,
734
+ "epoch": 0.6381118881118881,
735
+ "grad_norm": 0.7773574590682983,
736
+ "learning_rate": 7.402135231316726e-05,
737
+ "loss": 1.199,
738
+ "mean_token_accuracy": 0.7347433745861054,
739
+ "num_tokens": 380359.0,
740
  "step": 365
741
  },
742
  {
743
+ "entropy": 1.0246877193450927,
744
+ "epoch": 0.6468531468531469,
745
+ "grad_norm": 0.7057833671569824,
746
+ "learning_rate": 7.224199288256229e-05,
747
+ "loss": 0.9349,
748
+ "mean_token_accuracy": 0.7434077799320221,
749
+ "num_tokens": 386251.0,
750
  "step": 370
751
  },
752
  {
753
+ "entropy": 0.9082993268966675,
754
+ "epoch": 0.6555944055944056,
755
+ "grad_norm": 0.7693665027618408,
756
+ "learning_rate": 7.046263345195729e-05,
757
+ "loss": 0.8317,
758
+ "mean_token_accuracy": 0.7674221277236939,
759
+ "num_tokens": 391273.0,
760
  "step": 375
761
  },
762
  {
763
+ "entropy": 1.0551639199256897,
764
+ "epoch": 0.6643356643356644,
765
+ "grad_norm": 0.6118054986000061,
766
+ "learning_rate": 6.868327402135231e-05,
767
+ "loss": 0.9564,
768
+ "mean_token_accuracy": 0.7505346298217773,
769
+ "num_tokens": 396405.0,
770
  "step": 380
771
  },
772
  {
773
+ "entropy": 0.856031060218811,
774
+ "epoch": 0.6730769230769231,
775
+ "grad_norm": 0.7436105608940125,
776
+ "learning_rate": 6.690391459074733e-05,
777
+ "loss": 0.7753,
778
+ "mean_token_accuracy": 0.7836384952068329,
779
+ "num_tokens": 401417.0,
780
  "step": 385
781
  },
782
  {
783
+ "entropy": 1.1769568383693696,
784
+ "epoch": 0.6818181818181818,
785
+ "grad_norm": 0.5364604592323303,
786
+ "learning_rate": 6.512455516014235e-05,
787
+ "loss": 1.1369,
788
+ "mean_token_accuracy": 0.7138187170028687,
789
+ "num_tokens": 408045.0,
790
  "step": 390
791
  },
792
  {
793
+ "entropy": 0.9055932879447937,
794
+ "epoch": 0.6905594405594405,
795
+ "grad_norm": 0.7993744015693665,
796
+ "learning_rate": 6.334519572953737e-05,
797
+ "loss": 0.8238,
798
+ "mean_token_accuracy": 0.7695916533470154,
799
+ "num_tokens": 412408.0,
800
  "step": 395
801
  },
802
  {
803
+ "entropy": 1.067290061712265,
804
+ "epoch": 0.6993006993006993,
805
+ "grad_norm": 0.5611645579338074,
806
+ "learning_rate": 6.156583629893239e-05,
807
+ "loss": 1.0754,
808
+ "mean_token_accuracy": 0.7374713003635407,
809
+ "num_tokens": 417539.0,
810
  "step": 400
811
  },
812
  {
813
+ "entropy": 0.9325143158435821,
814
+ "epoch": 0.708041958041958,
815
+ "grad_norm": 0.8282243609428406,
816
+ "learning_rate": 5.97864768683274e-05,
817
+ "loss": 0.8287,
818
+ "mean_token_accuracy": 0.7693089723587037,
819
+ "num_tokens": 421587.0,
820
  "step": 405
821
  },
822
  {
823
+ "entropy": 0.9437564730644226,
824
+ "epoch": 0.7167832167832168,
825
+ "grad_norm": 0.8528610467910767,
826
+ "learning_rate": 5.8007117437722425e-05,
827
+ "loss": 0.8851,
828
+ "mean_token_accuracy": 0.7588753461837768,
829
+ "num_tokens": 425118.0,
830
  "step": 410
831
  },
832
  {
833
+ "entropy": 0.9383285760879516,
834
+ "epoch": 0.7255244755244755,
835
+ "grad_norm": 0.9912576079368591,
836
+ "learning_rate": 5.622775800711744e-05,
837
+ "loss": 0.8777,
838
+ "mean_token_accuracy": 0.7649032652378083,
839
+ "num_tokens": 429766.0,
840
  "step": 415
841
  },
842
  {
843
+ "entropy": 0.9844208836555481,
844
+ "epoch": 0.7342657342657343,
845
+ "grad_norm": 0.8838147521018982,
846
+ "learning_rate": 5.4448398576512464e-05,
847
+ "loss": 0.9286,
848
+ "mean_token_accuracy": 0.7666606605052948,
849
+ "num_tokens": 434826.0,
850
  "step": 420
851
  },
852
  {
853
+ "entropy": 1.0472073316574098,
854
+ "epoch": 0.743006993006993,
855
+ "grad_norm": 0.9893532991409302,
856
+ "learning_rate": 5.266903914590747e-05,
857
+ "loss": 0.9453,
858
+ "mean_token_accuracy": 0.7458884060382843,
859
+ "num_tokens": 439219.0,
860
  "step": 425
861
  },
862
  {
863
+ "entropy": 1.059507966041565,
864
+ "epoch": 0.7517482517482518,
865
+ "grad_norm": 0.7243296504020691,
866
+ "learning_rate": 5.0889679715302496e-05,
867
+ "loss": 0.9485,
868
+ "mean_token_accuracy": 0.7473999261856079,
869
+ "num_tokens": 444496.0,
870
  "step": 430
871
  },
872
  {
873
+ "entropy": 0.96737100481987,
874
+ "epoch": 0.7604895104895105,
875
+ "grad_norm": 0.7511352300643921,
876
+ "learning_rate": 4.911032028469751e-05,
877
+ "loss": 0.9112,
878
+ "mean_token_accuracy": 0.7562202334403991,
879
+ "num_tokens": 449115.0,
880
  "step": 435
881
  },
882
  {
883
+ "entropy": 1.0681302666664123,
884
+ "epoch": 0.7692307692307693,
885
+ "grad_norm": 0.6476220488548279,
886
+ "learning_rate": 4.733096085409253e-05,
887
+ "loss": 1.1169,
888
+ "mean_token_accuracy": 0.7343231618404389,
889
+ "num_tokens": 454151.0,
890
  "step": 440
891
  },
892
  {
893
+ "entropy": 0.9483801007270813,
894
+ "epoch": 0.777972027972028,
895
+ "grad_norm": 0.7808278799057007,
896
+ "learning_rate": 4.555160142348754e-05,
897
+ "loss": 0.9041,
898
+ "mean_token_accuracy": 0.7763189613819123,
899
+ "num_tokens": 458892.0,
900
  "step": 445
901
  },
902
  {
903
+ "entropy": 0.9629013359546661,
904
+ "epoch": 0.7867132867132867,
905
+ "grad_norm": 0.7341641187667847,
906
+ "learning_rate": 4.377224199288256e-05,
907
+ "loss": 0.8238,
908
+ "mean_token_accuracy": 0.765246057510376,
909
+ "num_tokens": 463856.0,
910
  "step": 450
911
  },
912
  {
913
+ "entropy": 1.180522269010544,
914
+ "epoch": 0.7954545454545454,
915
+ "grad_norm": 0.8312517404556274,
916
+ "learning_rate": 4.199288256227758e-05,
917
+ "loss": 1.1042,
918
+ "mean_token_accuracy": 0.7128246188163757,
919
+ "num_tokens": 470112.0,
920
  "step": 455
921
  },
922
  {
923
+ "entropy": 1.004443597793579,
924
+ "epoch": 0.8041958041958042,
925
+ "grad_norm": 0.9074130654335022,
926
+ "learning_rate": 4.02135231316726e-05,
927
+ "loss": 0.9222,
928
+ "mean_token_accuracy": 0.7539559602737427,
929
+ "num_tokens": 475012.0,
930
  "step": 460
931
  },
932
  {
933
+ "entropy": 1.0228057682514191,
934
+ "epoch": 0.8129370629370629,
935
+ "grad_norm": 0.920925498008728,
936
+ "learning_rate": 3.843416370106761e-05,
937
+ "loss": 0.9035,
938
+ "mean_token_accuracy": 0.7569567143917084,
939
+ "num_tokens": 480558.0,
940
  "step": 465
941
  },
942
  {
943
+ "entropy": 0.949072140455246,
944
+ "epoch": 0.8216783216783217,
945
+ "grad_norm": 0.6804259419441223,
946
+ "learning_rate": 3.665480427046263e-05,
947
+ "loss": 0.8606,
948
+ "mean_token_accuracy": 0.7625180125236511,
949
+ "num_tokens": 486294.0,
950
  "step": 470
951
  },
952
  {
953
+ "entropy": 1.0250387787818909,
954
+ "epoch": 0.8304195804195804,
955
+ "grad_norm": 0.6318123936653137,
956
+ "learning_rate": 3.487544483985765e-05,
957
+ "loss": 0.9913,
958
+ "mean_token_accuracy": 0.7425659537315369,
959
+ "num_tokens": 492617.0,
960
  "step": 475
961
  },
962
  {
963
+ "entropy": 0.8904710471630096,
964
+ "epoch": 0.8391608391608392,
965
+ "grad_norm": 0.6852394342422485,
966
+ "learning_rate": 3.309608540925267e-05,
967
+ "loss": 0.8392,
968
+ "mean_token_accuracy": 0.7645678043365478,
969
+ "num_tokens": 497070.0,
970
  "step": 480
971
  },
972
  {
973
+ "entropy": 0.9813799023628235,
974
+ "epoch": 0.8479020979020979,
975
+ "grad_norm": 0.6071293950080872,
976
+ "learning_rate": 3.1316725978647684e-05,
977
+ "loss": 0.8984,
978
+ "mean_token_accuracy": 0.7646778285503387,
979
+ "num_tokens": 502298.0,
980
  "step": 485
981
  },
982
  {
983
+ "entropy": 1.0262552201747894,
984
+ "epoch": 0.8566433566433567,
985
+ "grad_norm": 0.8407160043716431,
986
+ "learning_rate": 2.9537366548042704e-05,
987
+ "loss": 0.9343,
988
+ "mean_token_accuracy": 0.7484920144081115,
989
+ "num_tokens": 507261.0,
990
  "step": 490
991
  },
992
  {
993
+ "entropy": 0.9773908019065857,
994
+ "epoch": 0.8653846153846154,
995
+ "grad_norm": 0.6108224987983704,
996
+ "learning_rate": 2.7758007117437723e-05,
997
+ "loss": 0.8876,
998
+ "mean_token_accuracy": 0.7593122482299804,
999
+ "num_tokens": 512933.0,
1000
  "step": 495
1001
  },
1002
  {
1003
+ "entropy": 1.143789404630661,
1004
+ "epoch": 0.8741258741258742,
1005
+ "grad_norm": 0.6079063415527344,
1006
+ "learning_rate": 2.597864768683274e-05,
1007
+ "loss": 1.0861,
1008
+ "mean_token_accuracy": 0.7239168882369995,
1009
+ "num_tokens": 518867.0,
1010
  "step": 500
1011
  },
1012
  {
1013
+ "entropy": 0.9865677416324615,
1014
+ "epoch": 0.8828671328671329,
1015
+ "grad_norm": 0.8393223285675049,
1016
+ "learning_rate": 2.419928825622776e-05,
1017
+ "loss": 0.9208,
1018
+ "mean_token_accuracy": 0.7588137328624726,
1019
+ "num_tokens": 523197.0,
1020
  "step": 505
1021
  },
1022
  {
1023
+ "entropy": 1.0429059386253356,
1024
+ "epoch": 0.8916083916083916,
1025
+ "grad_norm": 0.7288678288459778,
1026
+ "learning_rate": 2.2419928825622775e-05,
1027
+ "loss": 1.0118,
1028
+ "mean_token_accuracy": 0.7459483563899993,
1029
+ "num_tokens": 528553.0,
1030
  "step": 510
1031
  },
1032
  {
1033
+ "entropy": 0.936554628610611,
1034
+ "epoch": 0.9003496503496503,
1035
+ "grad_norm": 1.026867151260376,
1036
+ "learning_rate": 2.0640569395017795e-05,
1037
+ "loss": 0.8488,
1038
+ "mean_token_accuracy": 0.7743270337581635,
1039
+ "num_tokens": 533175.0,
1040
  "step": 515
1041
  },
1042
  {
1043
+ "entropy": 1.0927321076393128,
1044
+ "epoch": 0.9090909090909091,
1045
+ "grad_norm": 0.8070006370544434,
1046
+ "learning_rate": 1.8861209964412814e-05,
1047
+ "loss": 1.0321,
1048
+ "mean_token_accuracy": 0.7340759754180908,
1049
+ "num_tokens": 537923.0,
1050
  "step": 520
1051
  },
1052
  {
1053
+ "entropy": 0.923135507106781,
1054
+ "epoch": 0.9178321678321678,
1055
+ "grad_norm": 0.7885546684265137,
1056
+ "learning_rate": 1.708185053380783e-05,
1057
+ "loss": 0.8886,
1058
+ "mean_token_accuracy": 0.7669959187507629,
1059
+ "num_tokens": 543086.0,
1060
  "step": 525
1061
  },
1062
  {
1063
+ "entropy": 0.803551995754242,
1064
+ "epoch": 0.9265734265734266,
1065
+ "grad_norm": 0.5133217573165894,
1066
+ "learning_rate": 1.530249110320285e-05,
1067
+ "loss": 0.7201,
1068
+ "mean_token_accuracy": 0.7979816317558288,
1069
+ "num_tokens": 547920.0,
1070
  "step": 530
1071
  },
1072
  {
1073
+ "entropy": 1.0683785855770112,
1074
+ "epoch": 0.9353146853146853,
1075
+ "grad_norm": 1.0883749723434448,
1076
+ "learning_rate": 1.3523131672597866e-05,
1077
+ "loss": 0.979,
1078
+ "mean_token_accuracy": 0.7476417005062104,
1079
+ "num_tokens": 553743.0,
1080
  "step": 535
1081
  },
1082
  {
1083
+ "entropy": 1.0017572939395905,
1084
+ "epoch": 0.9440559440559441,
1085
+ "grad_norm": 0.8225399851799011,
1086
+ "learning_rate": 1.1743772241992882e-05,
1087
+ "loss": 0.8984,
1088
+ "mean_token_accuracy": 0.761442244052887,
1089
+ "num_tokens": 558414.0,
1090
  "step": 540
1091
  },
1092
  {
1093
+ "entropy": 1.0132792532444,
1094
+ "epoch": 0.9527972027972028,
1095
+ "grad_norm": 0.9049685001373291,
1096
+ "learning_rate": 9.9644128113879e-06,
1097
+ "loss": 0.9703,
1098
+ "mean_token_accuracy": 0.7527327954769134,
1099
+ "num_tokens": 563295.0,
1100
  "step": 545
1101
  },
1102
  {
1103
+ "entropy": 0.972287380695343,
1104
+ "epoch": 0.9615384615384616,
1105
+ "grad_norm": 0.657630980014801,
1106
+ "learning_rate": 8.185053380782918e-06,
1107
+ "loss": 0.8971,
1108
+ "mean_token_accuracy": 0.7535503268241882,
1109
+ "num_tokens": 568925.0,
1110
  "step": 550
1111
  },
1112
  {
1113
+ "entropy": 1.0074927151203155,
1114
+ "epoch": 0.9702797202797203,
1115
+ "grad_norm": 0.5989683866500854,
1116
+ "learning_rate": 6.405693950177937e-06,
1117
+ "loss": 0.9767,
1118
+ "mean_token_accuracy": 0.7365618705749511,
1119
+ "num_tokens": 574965.0,
1120
  "step": 555
1121
  },
1122
  {
1123
+ "entropy": 1.2086752831935883,
1124
+ "epoch": 0.9790209790209791,
1125
+ "grad_norm": 0.6988089084625244,
1126
+ "learning_rate": 4.626334519572954e-06,
1127
+ "loss": 1.1787,
1128
+ "mean_token_accuracy": 0.7035641133785248,
1129
+ "num_tokens": 580877.0,
1130
  "step": 560
1131
  },
1132
  {
1133
+ "entropy": 0.9592096865177154,
1134
+ "epoch": 0.9877622377622378,
1135
+ "grad_norm": 1.0358166694641113,
1136
+ "learning_rate": 2.8469750889679713e-06,
1137
+ "loss": 0.8782,
1138
+ "mean_token_accuracy": 0.76033256649971,
1139
+ "num_tokens": 585093.0,
1140
  "step": 565
1141
  },
1142
  {
1143
+ "entropy": 0.8759162247180938,
1144
+ "epoch": 0.9965034965034965,
1145
+ "grad_norm": 0.6450009942054749,
1146
+ "learning_rate": 1.0676156583629894e-06,
1147
+ "loss": 0.7832,
1148
+ "mean_token_accuracy": 0.7865857958793641,
1149
+ "num_tokens": 590558.0,
1150
  "step": 570
 
 
 
 
 
 
 
 
 
 
1151
  }
1152
  ],
1153
  "logging_steps": 5,
1154
+ "max_steps": 572,
1155
  "num_input_tokens_seen": 0,
1156
  "num_train_epochs": 1,
1157
  "save_steps": 500,
 
1167
  "attributes": {}
1168
  }
1169
  },
1170
+ "total_flos": 2.684206514115379e+16,
1171
  "train_batch_size": 1,
1172
  "trial_name": null,
1173
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8ea3ba75a359efb29ad1e55e0aa4877049a0fc5d9f4d1850f8c2eac9197e24f
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58293d1261da0e67c9bdcabfa9d91110498e1d28ff6f6e0d9d07cd560a155972
3
  size 5816