nt-van-khanh commited on
Commit
865ac54
·
verified ·
1 Parent(s): ec41953

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "up_proj",
33
- "q_proj",
34
  "v_proj",
35
- "gate_proj",
36
- "down_proj",
37
  "k_proj",
38
- "o_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "down_proj",
33
  "up_proj",
 
34
  "v_proj",
 
 
35
  "k_proj",
36
+ "gate_proj",
37
+ "o_proj",
38
+ "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3db570fd27876879cc33103ca8933604745e61f4019c1d836c15c7fe2de9457
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b1fff680a543298f99a02a59b07fdd20779d45d76050ae4657d7155dba7881
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:beeda5e00e3f2a251a26931469526d6fefa8f6cc35fc3926826851fecb38c416
3
  size 85728342
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d57468746ed225d196506f39da49e6e801e88bb9404cd3efc0e75e0398a685ef
3
  size 85728342
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0738d3f7bc7a3d09895ccd1699afea3beb6076cacca6285d9dff68b3826864fb
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2955c699f5b1504e9840700f4d5dd5648f18dbf48dd8849c32859b9338c9c1b6
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bea4c55977be70b1134031e6a8b57e36f8f593b2249c6d9d6b94a16db34cae2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2fc7fde6a8810f2aaee4f51ad7b6ecee188721a4bad00d143c4300ff3122b5
3
  size 1064
trainer_state.json CHANGED
@@ -10,1142 +10,1142 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.333236712217331,
14
  "epoch": 0.008726003490401396,
15
- "grad_norm": 1.05680251121521,
16
- "learning_rate": 8e-05,
17
- "loss": 1.3485,
18
- "mean_token_accuracy": 0.6778763651847839,
19
  "num_tokens": 4689.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 1.3305357813835144,
24
  "epoch": 0.017452006980802792,
25
- "grad_norm": 0.872683584690094,
26
- "learning_rate": 0.00018,
27
- "loss": 1.2321,
28
- "mean_token_accuracy": 0.6924330353736877,
29
  "num_tokens": 9076.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.1033322036266326,
34
  "epoch": 0.02617801047120419,
35
- "grad_norm": 0.7987897992134094,
36
- "learning_rate": 0.0001985790408525755,
37
- "loss": 1.0149,
38
- "mean_token_accuracy": 0.7376092612743378,
39
  "num_tokens": 13468.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.3769255757331849,
44
  "epoch": 0.034904013961605584,
45
- "grad_norm": 0.5818138718605042,
46
- "learning_rate": 0.00019680284191829486,
47
- "loss": 1.3507,
48
- "mean_token_accuracy": 0.6773534297943116,
49
  "num_tokens": 19687.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.3704544663429261,
54
  "epoch": 0.04363001745200698,
55
- "grad_norm": 0.7997947335243225,
56
- "learning_rate": 0.00019502664298401423,
57
- "loss": 1.2011,
58
- "mean_token_accuracy": 0.7000263512134552,
59
  "num_tokens": 25321.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.1999043464660644,
64
  "epoch": 0.05235602094240838,
65
- "grad_norm": 0.7836564779281616,
66
- "learning_rate": 0.00019325044404973357,
67
- "loss": 1.1702,
68
- "mean_token_accuracy": 0.7157157480716705,
69
  "num_tokens": 29969.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.1485023498535156,
74
  "epoch": 0.06108202443280977,
75
- "grad_norm": 0.8341999650001526,
76
- "learning_rate": 0.00019147424511545294,
77
- "loss": 1.0127,
78
- "mean_token_accuracy": 0.736984384059906,
79
  "num_tokens": 34779.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.175552135705948,
84
  "epoch": 0.06980802792321117,
85
- "grad_norm": 0.8367146849632263,
86
- "learning_rate": 0.0001896980461811723,
87
- "loss": 1.081,
88
- "mean_token_accuracy": 0.7145774185657501,
89
  "num_tokens": 40454.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.2580669283866883,
94
  "epoch": 0.07853403141361257,
95
- "grad_norm": 0.7274704575538635,
96
- "learning_rate": 0.00018792184724689167,
97
- "loss": 1.1622,
98
- "mean_token_accuracy": 0.7037679016590118,
99
  "num_tokens": 45435.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.3289404153823852,
104
  "epoch": 0.08726003490401396,
105
- "grad_norm": 0.9459134936332703,
106
- "learning_rate": 0.00018614564831261103,
107
- "loss": 1.2887,
108
- "mean_token_accuracy": 0.7032808780670166,
109
  "num_tokens": 50161.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.206754869222641,
114
  "epoch": 0.09598603839441536,
115
- "grad_norm": 0.8248263001441956,
116
- "learning_rate": 0.00018436944937833037,
117
- "loss": 1.0185,
118
- "mean_token_accuracy": 0.7304032206535339,
119
  "num_tokens": 55082.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.3467580556869507,
124
  "epoch": 0.10471204188481675,
125
- "grad_norm": 0.7025023698806763,
126
- "learning_rate": 0.00018259325044404974,
127
- "loss": 1.3245,
128
- "mean_token_accuracy": 0.6774280846118927,
129
  "num_tokens": 61109.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.1657752752304078,
134
  "epoch": 0.11343804537521815,
135
- "grad_norm": 0.7866821885108948,
136
- "learning_rate": 0.0001808170515097691,
137
- "loss": 1.0342,
138
- "mean_token_accuracy": 0.7379155635833741,
139
  "num_tokens": 65130.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.3768277764320374,
144
  "epoch": 0.12216404886561955,
145
- "grad_norm": 0.6452690958976746,
146
- "learning_rate": 0.00017904085257548847,
147
- "loss": 1.3499,
148
- "mean_token_accuracy": 0.6878371357917785,
149
  "num_tokens": 71720.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.2285258889198303,
154
  "epoch": 0.13089005235602094,
155
- "grad_norm": 0.8868134617805481,
156
- "learning_rate": 0.00017726465364120784,
157
- "loss": 1.1203,
158
- "mean_token_accuracy": 0.7103672683238983,
159
  "num_tokens": 76475.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.142468798160553,
164
  "epoch": 0.13961605584642234,
165
- "grad_norm": 0.7537686228752136,
166
- "learning_rate": 0.00017548845470692718,
167
- "loss": 1.0207,
168
- "mean_token_accuracy": 0.7329977452754974,
169
  "num_tokens": 82239.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.30864217877388,
174
  "epoch": 0.14834205933682373,
175
- "grad_norm": 0.9109086394309998,
176
- "learning_rate": 0.00017371225577264654,
177
- "loss": 1.2256,
178
- "mean_token_accuracy": 0.6924388945102692,
179
  "num_tokens": 86033.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.279932165145874,
184
  "epoch": 0.15706806282722513,
185
- "grad_norm": 0.7983659505844116,
186
- "learning_rate": 0.0001719360568383659,
187
- "loss": 1.1764,
188
- "mean_token_accuracy": 0.7101370930671692,
189
  "num_tokens": 90170.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.1692178070545196,
194
  "epoch": 0.16579406631762653,
195
- "grad_norm": 0.8946067690849304,
196
- "learning_rate": 0.00017015985790408525,
197
- "loss": 1.0826,
198
- "mean_token_accuracy": 0.7317939043045044,
199
  "num_tokens": 95473.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.025848913192749,
204
  "epoch": 0.17452006980802792,
205
- "grad_norm": 0.8327645063400269,
206
- "learning_rate": 0.00016838365896980464,
207
- "loss": 0.9294,
208
- "mean_token_accuracy": 0.7514408528804779,
209
  "num_tokens": 99423.0,
210
  "step": 100
211
  },
212
  {
213
- "entropy": 1.0799501717090607,
214
  "epoch": 0.18324607329842932,
215
- "grad_norm": 0.7194784283638,
216
- "learning_rate": 0.00016660746003552398,
217
- "loss": 1.0222,
218
- "mean_token_accuracy": 0.7337860226631164,
219
  "num_tokens": 104249.0,
220
  "step": 105
221
  },
222
  {
223
- "entropy": 1.1033223390579223,
224
  "epoch": 0.19197207678883071,
225
- "grad_norm": 0.7712328433990479,
226
- "learning_rate": 0.00016483126110124335,
227
- "loss": 0.9856,
228
- "mean_token_accuracy": 0.7449049592018128,
229
  "num_tokens": 109205.0,
230
  "step": 110
231
  },
232
  {
233
- "entropy": 1.1388230919837952,
234
  "epoch": 0.2006980802792321,
235
- "grad_norm": 0.6309220194816589,
236
- "learning_rate": 0.00016305506216696272,
237
- "loss": 1.1354,
238
- "mean_token_accuracy": 0.724005150794983,
239
  "num_tokens": 115207.0,
240
  "step": 115
241
  },
242
  {
243
- "entropy": 1.0293731987476349,
244
  "epoch": 0.2094240837696335,
245
- "grad_norm": 1.0027621984481812,
246
- "learning_rate": 0.00016127886323268206,
247
- "loss": 0.9218,
248
- "mean_token_accuracy": 0.7559137165546417,
249
  "num_tokens": 120323.0,
250
  "step": 120
251
  },
252
  {
253
- "entropy": 1.1900119483470917,
254
  "epoch": 0.2181500872600349,
255
- "grad_norm": 0.8019612431526184,
256
- "learning_rate": 0.00015950266429840145,
257
- "loss": 1.106,
258
- "mean_token_accuracy": 0.7178053438663483,
259
  "num_tokens": 125253.0,
260
  "step": 125
261
  },
262
  {
263
- "entropy": 1.0218496084213258,
264
  "epoch": 0.2268760907504363,
265
- "grad_norm": 0.699367105960846,
266
- "learning_rate": 0.0001577264653641208,
267
- "loss": 0.931,
268
- "mean_token_accuracy": 0.7488141357898712,
269
  "num_tokens": 130360.0,
270
  "step": 130
271
  },
272
  {
273
- "entropy": 1.1080122888088226,
274
  "epoch": 0.2356020942408377,
275
- "grad_norm": 0.7124127745628357,
276
- "learning_rate": 0.00015595026642984015,
277
- "loss": 1.0557,
278
- "mean_token_accuracy": 0.7226514399051667,
279
  "num_tokens": 135538.0,
280
  "step": 135
281
  },
282
  {
283
- "entropy": 1.173432421684265,
284
  "epoch": 0.2443280977312391,
285
- "grad_norm": 0.794236421585083,
286
- "learning_rate": 0.00015417406749555952,
287
- "loss": 1.056,
288
- "mean_token_accuracy": 0.7334702372550964,
289
  "num_tokens": 140532.0,
290
  "step": 140
291
  },
292
  {
293
- "entropy": 1.0574114263057708,
294
  "epoch": 0.2530541012216405,
295
- "grad_norm": 0.6696324944496155,
296
- "learning_rate": 0.00015239786856127886,
297
- "loss": 0.9361,
298
- "mean_token_accuracy": 0.7482443630695343,
299
  "num_tokens": 145908.0,
300
  "step": 145
301
  },
302
  {
303
- "entropy": 1.086327201128006,
304
  "epoch": 0.2617801047120419,
305
- "grad_norm": 0.5255310535430908,
306
- "learning_rate": 0.00015062166962699825,
307
- "loss": 1.0768,
308
- "mean_token_accuracy": 0.7292326390743256,
309
  "num_tokens": 151148.0,
310
  "step": 150
311
  },
312
  {
313
- "entropy": 1.092069786787033,
314
  "epoch": 0.2705061082024433,
315
- "grad_norm": 0.6275709271430969,
316
- "learning_rate": 0.0001488454706927176,
317
- "loss": 1.0778,
318
- "mean_token_accuracy": 0.7255069613456726,
319
  "num_tokens": 157506.0,
320
  "step": 155
321
  },
322
  {
323
- "entropy": 1.1596343219280243,
324
  "epoch": 0.2792321116928447,
325
- "grad_norm": 0.9472619295120239,
326
- "learning_rate": 0.00014706927175843693,
327
- "loss": 1.1003,
328
- "mean_token_accuracy": 0.7315803647041321,
329
  "num_tokens": 162992.0,
330
  "step": 160
331
  },
332
  {
333
- "entropy": 1.0481273233890533,
334
  "epoch": 0.2879581151832461,
335
- "grad_norm": 0.6921494007110596,
336
- "learning_rate": 0.00014529307282415633,
337
- "loss": 0.8895,
338
- "mean_token_accuracy": 0.7529896676540375,
339
  "num_tokens": 167640.0,
340
  "step": 165
341
  },
342
  {
343
- "entropy": 1.0518691539764404,
344
  "epoch": 0.29668411867364747,
345
- "grad_norm": 0.6654248237609863,
346
- "learning_rate": 0.00014351687388987566,
347
- "loss": 1.018,
348
- "mean_token_accuracy": 0.7503870785236358,
349
  "num_tokens": 173423.0,
350
  "step": 170
351
  },
352
  {
353
- "entropy": 1.1176642417907714,
354
  "epoch": 0.3054101221640489,
355
- "grad_norm": 0.7743102312088013,
356
- "learning_rate": 0.00014174067495559503,
357
- "loss": 1.0807,
358
- "mean_token_accuracy": 0.7225248873233795,
359
  "num_tokens": 178986.0,
360
  "step": 175
361
  },
362
  {
363
- "entropy": 0.9516431629657746,
364
  "epoch": 0.31413612565445026,
365
- "grad_norm": 1.0389933586120605,
366
- "learning_rate": 0.0001399644760213144,
367
- "loss": 0.8189,
368
- "mean_token_accuracy": 0.7752299129962921,
369
  "num_tokens": 183459.0,
370
  "step": 180
371
  },
372
  {
373
- "entropy": 1.1684755861759186,
374
  "epoch": 0.3228621291448517,
375
- "grad_norm": 1.4807476997375488,
376
- "learning_rate": 0.00013818827708703374,
377
- "loss": 1.1822,
378
- "mean_token_accuracy": 0.7197710394859314,
379
  "num_tokens": 187614.0,
380
  "step": 185
381
  },
382
  {
383
- "entropy": 1.099220609664917,
384
  "epoch": 0.33158813263525305,
385
- "grad_norm": 0.7266477346420288,
386
- "learning_rate": 0.00013641207815275313,
387
- "loss": 1.0095,
388
- "mean_token_accuracy": 0.7297711133956909,
389
  "num_tokens": 192316.0,
390
  "step": 190
391
  },
392
  {
393
- "entropy": 1.0837588012218475,
394
  "epoch": 0.3403141361256545,
395
- "grad_norm": 0.696660041809082,
396
- "learning_rate": 0.00013463587921847247,
397
- "loss": 0.9739,
398
- "mean_token_accuracy": 0.7354932248592376,
399
  "num_tokens": 197728.0,
400
  "step": 195
401
  },
402
  {
403
- "entropy": 1.1696858763694764,
404
  "epoch": 0.34904013961605584,
405
- "grad_norm": 0.5466914772987366,
406
- "learning_rate": 0.00013285968028419184,
407
- "loss": 1.1444,
408
- "mean_token_accuracy": 0.7138558447360992,
409
  "num_tokens": 204502.0,
410
  "step": 200
411
  },
412
  {
413
- "entropy": 1.147382140159607,
414
  "epoch": 0.35776614310645727,
415
- "grad_norm": 0.8311446905136108,
416
- "learning_rate": 0.0001310834813499112,
417
- "loss": 1.1093,
418
- "mean_token_accuracy": 0.7309025764465332,
419
  "num_tokens": 209069.0,
420
  "step": 205
421
  },
422
  {
423
- "entropy": 1.2201330184936523,
424
  "epoch": 0.36649214659685864,
425
- "grad_norm": 0.6816751956939697,
426
- "learning_rate": 0.00012930728241563054,
427
- "loss": 1.2094,
428
- "mean_token_accuracy": 0.7130683898925781,
429
  "num_tokens": 214185.0,
430
  "step": 210
431
  },
432
  {
433
- "entropy": 1.152731454372406,
434
  "epoch": 0.37521815008726006,
435
- "grad_norm": 0.6387792825698853,
436
- "learning_rate": 0.00012753108348134993,
437
- "loss": 1.0565,
438
- "mean_token_accuracy": 0.7268509924411773,
439
  "num_tokens": 219312.0,
440
  "step": 215
441
  },
442
  {
443
- "entropy": 1.1504864931106566,
444
  "epoch": 0.38394415357766143,
445
- "grad_norm": 0.7773131728172302,
446
- "learning_rate": 0.00012575488454706927,
447
- "loss": 1.0913,
448
- "mean_token_accuracy": 0.7241075754165649,
449
  "num_tokens": 225616.0,
450
  "step": 220
451
  },
452
  {
453
- "entropy": 1.0282553434371948,
454
  "epoch": 0.39267015706806285,
455
- "grad_norm": 0.8763700723648071,
456
- "learning_rate": 0.00012397868561278864,
457
- "loss": 0.9342,
458
- "mean_token_accuracy": 0.7502905786037445,
459
  "num_tokens": 230696.0,
460
  "step": 225
461
  },
462
  {
463
- "entropy": 1.0895283699035645,
464
  "epoch": 0.4013961605584642,
465
- "grad_norm": 0.8293470740318298,
466
- "learning_rate": 0.000122202486678508,
467
- "loss": 1.067,
468
- "mean_token_accuracy": 0.7364717125892639,
469
  "num_tokens": 236685.0,
470
  "step": 230
471
  },
472
  {
473
- "entropy": 1.172694307565689,
474
  "epoch": 0.41012216404886565,
475
- "grad_norm": 0.8818181753158569,
476
- "learning_rate": 0.00012042628774422735,
477
- "loss": 1.0149,
478
- "mean_token_accuracy": 0.7262615323066711,
479
  "num_tokens": 241211.0,
480
  "step": 235
481
  },
482
  {
483
- "entropy": 1.2173514723777772,
484
  "epoch": 0.418848167539267,
485
- "grad_norm": 0.5635867714881897,
486
- "learning_rate": 0.00011865008880994673,
487
- "loss": 1.1783,
488
- "mean_token_accuracy": 0.7147055625915527,
489
  "num_tokens": 246360.0,
490
  "step": 240
491
  },
492
  {
493
- "entropy": 1.188833224773407,
494
  "epoch": 0.42757417102966844,
495
- "grad_norm": 0.6060160398483276,
496
- "learning_rate": 0.00011687388987566608,
497
- "loss": 1.1545,
498
- "mean_token_accuracy": 0.717083477973938,
499
  "num_tokens": 252717.0,
500
  "step": 245
501
  },
502
  {
503
- "entropy": 1.0905582129955291,
504
  "epoch": 0.4363001745200698,
505
- "grad_norm": 0.6812947988510132,
506
- "learning_rate": 0.00011509769094138544,
507
- "loss": 0.9922,
508
- "mean_token_accuracy": 0.7299255549907684,
509
  "num_tokens": 257249.0,
510
  "step": 250
511
  },
512
  {
513
- "entropy": 0.8695837318897247,
514
  "epoch": 0.44502617801047123,
515
- "grad_norm": 0.8577454090118408,
516
- "learning_rate": 0.0001133214920071048,
517
- "loss": 0.8209,
518
- "mean_token_accuracy": 0.7762204229831695,
519
  "num_tokens": 262381.0,
520
  "step": 255
521
  },
522
  {
523
- "entropy": 0.9932888269424438,
524
  "epoch": 0.4537521815008726,
525
- "grad_norm": 0.697665810585022,
526
- "learning_rate": 0.00011154529307282415,
527
- "loss": 1.0232,
528
- "mean_token_accuracy": 0.7427519500255585,
529
  "num_tokens": 267410.0,
530
  "step": 260
531
  },
532
  {
533
- "entropy": 0.8414939880371094,
534
  "epoch": 0.462478184991274,
535
- "grad_norm": 0.789999783039093,
536
- "learning_rate": 0.00010976909413854353,
537
- "loss": 0.7225,
538
- "mean_token_accuracy": 0.7937956035137177,
539
  "num_tokens": 272109.0,
540
  "step": 265
541
  },
542
  {
543
- "entropy": 1.0776531934738158,
544
  "epoch": 0.4712041884816754,
545
- "grad_norm": 0.6461851000785828,
546
- "learning_rate": 0.00010799289520426288,
547
- "loss": 1.0389,
548
- "mean_token_accuracy": 0.7343196094036102,
549
  "num_tokens": 276623.0,
550
  "step": 270
551
  },
552
  {
553
- "entropy": 1.1227709293365478,
554
  "epoch": 0.4799301919720768,
555
- "grad_norm": 0.6017542481422424,
556
- "learning_rate": 0.00010621669626998225,
557
- "loss": 1.0346,
558
- "mean_token_accuracy": 0.7320161819458008,
559
  "num_tokens": 283256.0,
560
  "step": 275
561
  },
562
  {
563
- "entropy": 0.9767000675201416,
564
  "epoch": 0.4886561954624782,
565
- "grad_norm": 0.7064502835273743,
566
- "learning_rate": 0.0001044404973357016,
567
- "loss": 0.9051,
568
- "mean_token_accuracy": 0.7693962216377258,
569
  "num_tokens": 288780.0,
570
  "step": 280
571
  },
572
  {
573
- "entropy": 0.9595549941062927,
574
  "epoch": 0.4973821989528796,
575
- "grad_norm": 0.7622601985931396,
576
- "learning_rate": 0.00010266429840142096,
577
- "loss": 0.8922,
578
- "mean_token_accuracy": 0.767174756526947,
579
  "num_tokens": 293775.0,
580
  "step": 285
581
  },
582
  {
583
- "entropy": 0.9456490218639374,
584
  "epoch": 0.506108202443281,
585
- "grad_norm": 0.7910531163215637,
586
- "learning_rate": 0.00010088809946714034,
587
- "loss": 0.8845,
588
- "mean_token_accuracy": 0.7625713229179383,
589
  "num_tokens": 299667.0,
590
  "step": 290
591
  },
592
  {
593
- "entropy": 0.9972454011440277,
594
  "epoch": 0.5148342059336823,
595
- "grad_norm": 0.8077422976493835,
596
- "learning_rate": 9.911190053285967e-05,
597
- "loss": 0.9629,
598
- "mean_token_accuracy": 0.7550196409225464,
599
  "num_tokens": 304401.0,
600
  "step": 295
601
  },
602
  {
603
- "entropy": 1.0132270872592926,
604
  "epoch": 0.5235602094240838,
605
- "grad_norm": 0.5776278972625732,
606
- "learning_rate": 9.733570159857904e-05,
607
- "loss": 0.9083,
608
- "mean_token_accuracy": 0.7645319044589997,
609
  "num_tokens": 310983.0,
610
  "step": 300
611
  },
612
  {
613
- "entropy": 1.1321196973323822,
614
  "epoch": 0.5322862129144852,
615
- "grad_norm": 0.765808641910553,
616
- "learning_rate": 9.555950266429841e-05,
617
- "loss": 1.0364,
618
- "mean_token_accuracy": 0.7226320803165436,
619
  "num_tokens": 315721.0,
620
  "step": 305
621
  },
622
  {
623
- "entropy": 1.0132107377052306,
624
  "epoch": 0.5410122164048866,
625
- "grad_norm": 0.5765398144721985,
626
- "learning_rate": 9.378330373001777e-05,
627
- "loss": 0.9858,
628
- "mean_token_accuracy": 0.7562039911746978,
629
  "num_tokens": 321834.0,
630
  "step": 310
631
  },
632
  {
633
- "entropy": 1.097977089881897,
634
  "epoch": 0.5497382198952879,
635
- "grad_norm": 0.7264753580093384,
636
- "learning_rate": 9.200710479573713e-05,
637
- "loss": 1.0686,
638
- "mean_token_accuracy": 0.7291842579841614,
639
  "num_tokens": 327063.0,
640
  "step": 315
641
  },
642
  {
643
- "entropy": 1.2174109816551208,
644
  "epoch": 0.5584642233856894,
645
- "grad_norm": 0.7541456818580627,
646
- "learning_rate": 9.023090586145648e-05,
647
- "loss": 1.1817,
648
- "mean_token_accuracy": 0.7097965478897095,
649
  "num_tokens": 332900.0,
650
  "step": 320
651
  },
652
  {
653
- "entropy": 1.0044541895389556,
654
  "epoch": 0.5671902268760908,
655
- "grad_norm": 0.5834890604019165,
656
- "learning_rate": 8.845470692717585e-05,
657
- "loss": 0.9467,
658
- "mean_token_accuracy": 0.7500465452671051,
659
  "num_tokens": 337508.0,
660
  "step": 325
661
  },
662
  {
663
- "entropy": 1.0295350253582,
664
  "epoch": 0.5759162303664922,
665
- "grad_norm": 0.8909983038902283,
666
- "learning_rate": 8.667850799289521e-05,
667
- "loss": 0.9113,
668
- "mean_token_accuracy": 0.7476867496967315,
669
  "num_tokens": 342644.0,
670
  "step": 330
671
  },
672
  {
673
- "entropy": 1.0791299104690553,
674
  "epoch": 0.5846422338568935,
675
- "grad_norm": 1.0385737419128418,
676
- "learning_rate": 8.490230905861456e-05,
677
- "loss": 1.1175,
678
- "mean_token_accuracy": 0.7305109918117523,
679
  "num_tokens": 347547.0,
680
  "step": 335
681
  },
682
  {
683
- "entropy": 1.0213176369667054,
684
  "epoch": 0.5933682373472949,
685
- "grad_norm": 0.943204402923584,
686
- "learning_rate": 8.312611012433393e-05,
687
- "loss": 0.9055,
688
- "mean_token_accuracy": 0.7596513092517853,
689
  "num_tokens": 351932.0,
690
  "step": 340
691
  },
692
  {
693
- "entropy": 1.0257258594036103,
694
  "epoch": 0.6020942408376964,
695
- "grad_norm": 0.7949322462081909,
696
- "learning_rate": 8.134991119005328e-05,
697
- "loss": 0.9098,
698
- "mean_token_accuracy": 0.7553630173206329,
699
  "num_tokens": 357045.0,
700
  "step": 345
701
  },
702
  {
703
- "entropy": 1.0372248589992523,
704
  "epoch": 0.6108202443280978,
705
- "grad_norm": 0.8405324220657349,
706
- "learning_rate": 7.957371225577265e-05,
707
- "loss": 0.9929,
708
- "mean_token_accuracy": 0.7452831089496612,
709
  "num_tokens": 362284.0,
710
  "step": 350
711
  },
712
  {
713
- "entropy": 0.9565088748931885,
714
  "epoch": 0.6195462478184991,
715
- "grad_norm": 0.6379778981208801,
716
- "learning_rate": 7.779751332149202e-05,
717
- "loss": 0.9219,
718
- "mean_token_accuracy": 0.7565369844436646,
719
  "num_tokens": 367217.0,
720
  "step": 355
721
  },
722
  {
723
- "entropy": 1.0628814578056336,
724
  "epoch": 0.6282722513089005,
725
- "grad_norm": 0.6335421204566956,
726
- "learning_rate": 7.602131438721137e-05,
727
- "loss": 1.0041,
728
- "mean_token_accuracy": 0.7395376443862915,
729
  "num_tokens": 372678.0,
730
  "step": 360
731
  },
732
  {
733
- "entropy": 0.9448712587356567,
734
  "epoch": 0.6369982547993019,
735
- "grad_norm": 0.737162172794342,
736
- "learning_rate": 7.424511545293074e-05,
737
- "loss": 0.8211,
738
- "mean_token_accuracy": 0.771143788099289,
739
  "num_tokens": 377750.0,
740
  "step": 365
741
  },
742
  {
743
- "entropy": 0.9797238111495972,
744
  "epoch": 0.6457242582897034,
745
- "grad_norm": 0.5577957034111023,
746
- "learning_rate": 7.246891651865009e-05,
747
- "loss": 0.9415,
748
- "mean_token_accuracy": 0.7499814212322236,
749
  "num_tokens": 383406.0,
750
  "step": 370
751
  },
752
  {
753
- "entropy": 1.1891680419445039,
754
  "epoch": 0.6544502617801047,
755
- "grad_norm": 0.48097750544548035,
756
- "learning_rate": 7.069271758436945e-05,
757
- "loss": 1.1327,
758
- "mean_token_accuracy": 0.7193056166172027,
759
  "num_tokens": 389696.0,
760
  "step": 375
761
  },
762
  {
763
- "entropy": 1.0238433182239532,
764
  "epoch": 0.6631762652705061,
765
- "grad_norm": 0.5823986530303955,
766
- "learning_rate": 6.891651865008881e-05,
767
- "loss": 0.9708,
768
- "mean_token_accuracy": 0.7535522282123566,
769
  "num_tokens": 394688.0,
770
  "step": 380
771
  },
772
  {
773
- "entropy": 1.162860244512558,
774
  "epoch": 0.6719022687609075,
775
- "grad_norm": 0.6299170255661011,
776
- "learning_rate": 6.714031971580817e-05,
777
- "loss": 1.1866,
778
- "mean_token_accuracy": 0.710206264257431,
779
  "num_tokens": 400319.0,
780
  "step": 385
781
  },
782
  {
783
- "entropy": 1.0206872344017028,
784
  "epoch": 0.680628272251309,
785
- "grad_norm": 0.7722362875938416,
786
- "learning_rate": 6.536412078152754e-05,
787
- "loss": 0.9289,
788
- "mean_token_accuracy": 0.7554251432418824,
789
  "num_tokens": 404918.0,
790
  "step": 390
791
  },
792
  {
793
- "entropy": 1.0980794131755829,
794
  "epoch": 0.6893542757417103,
795
- "grad_norm": 0.9234552979469299,
796
- "learning_rate": 6.358792184724689e-05,
797
- "loss": 0.9551,
798
- "mean_token_accuracy": 0.7426558673381806,
799
  "num_tokens": 410635.0,
800
  "step": 395
801
  },
802
  {
803
- "entropy": 1.0166767477989196,
804
  "epoch": 0.6980802792321117,
805
- "grad_norm": 0.9343558549880981,
806
- "learning_rate": 6.181172291296625e-05,
807
- "loss": 0.9624,
808
- "mean_token_accuracy": 0.7539155185222626,
809
  "num_tokens": 415005.0,
810
  "step": 400
811
  },
812
  {
813
- "entropy": 1.0832793176174165,
814
  "epoch": 0.7068062827225131,
815
- "grad_norm": 0.7815644145011902,
816
- "learning_rate": 6.003552397868561e-05,
817
- "loss": 1.0316,
818
- "mean_token_accuracy": 0.7289174854755401,
819
  "num_tokens": 419347.0,
820
  "step": 405
821
  },
822
  {
823
- "entropy": 1.0699054658412934,
824
  "epoch": 0.7155322862129145,
825
- "grad_norm": 0.7760159373283386,
826
- "learning_rate": 5.825932504440498e-05,
827
- "loss": 1.0357,
828
- "mean_token_accuracy": 0.7321902751922608,
829
  "num_tokens": 424588.0,
830
  "step": 410
831
  },
832
  {
833
- "entropy": 0.966323298215866,
834
  "epoch": 0.7242582897033158,
835
- "grad_norm": 0.805746853351593,
836
- "learning_rate": 5.648312611012434e-05,
837
- "loss": 0.9306,
838
- "mean_token_accuracy": 0.7569182515144348,
839
  "num_tokens": 428943.0,
840
  "step": 415
841
  },
842
  {
843
- "entropy": 0.9721911072731018,
844
  "epoch": 0.7329842931937173,
845
- "grad_norm": 0.6620533466339111,
846
- "learning_rate": 5.470692717584369e-05,
847
- "loss": 0.9465,
848
- "mean_token_accuracy": 0.7597197592258453,
849
  "num_tokens": 435326.0,
850
  "step": 420
851
  },
852
  {
853
- "entropy": 0.9292757451534271,
854
  "epoch": 0.7417102966841187,
855
- "grad_norm": 0.7177068591117859,
856
- "learning_rate": 5.293072824156306e-05,
857
- "loss": 0.858,
858
- "mean_token_accuracy": 0.7738080501556397,
859
  "num_tokens": 441702.0,
860
  "step": 425
861
  },
862
  {
863
- "entropy": 1.0638712823390961,
864
  "epoch": 0.7504363001745201,
865
- "grad_norm": 0.5912255048751831,
866
- "learning_rate": 5.115452930728242e-05,
867
- "loss": 1.0654,
868
- "mean_token_accuracy": 0.747636479139328,
869
  "num_tokens": 446862.0,
870
  "step": 430
871
  },
872
  {
873
- "entropy": 0.9203409194946289,
874
  "epoch": 0.7591623036649214,
875
- "grad_norm": 0.8877400159835815,
876
- "learning_rate": 4.9378330373001777e-05,
877
- "loss": 0.8225,
878
- "mean_token_accuracy": 0.7788766026496887,
879
  "num_tokens": 451024.0,
880
  "step": 435
881
  },
882
  {
883
- "entropy": 1.0310194969177247,
884
  "epoch": 0.7678883071553229,
885
- "grad_norm": 0.593137800693512,
886
- "learning_rate": 4.7602131438721136e-05,
887
- "loss": 1.0058,
888
- "mean_token_accuracy": 0.7474644720554352,
889
  "num_tokens": 457528.0,
890
  "step": 440
891
  },
892
  {
893
- "entropy": 0.9218507647514343,
894
  "epoch": 0.7766143106457243,
895
- "grad_norm": 0.8034109473228455,
896
- "learning_rate": 4.58259325044405e-05,
897
- "loss": 0.8161,
898
- "mean_token_accuracy": 0.773482757806778,
899
  "num_tokens": 462267.0,
900
  "step": 445
901
  },
902
  {
903
- "entropy": 1.0368493318557739,
904
  "epoch": 0.7853403141361257,
905
- "grad_norm": 0.9129230380058289,
906
- "learning_rate": 4.404973357015986e-05,
907
- "loss": 1.0042,
908
- "mean_token_accuracy": 0.7518712699413299,
909
  "num_tokens": 467337.0,
910
  "step": 450
911
  },
912
  {
913
- "entropy": 0.8776600241661072,
914
  "epoch": 0.794066317626527,
915
- "grad_norm": 0.5392698645591736,
916
- "learning_rate": 4.227353463587922e-05,
917
- "loss": 0.7964,
918
- "mean_token_accuracy": 0.773613715171814,
919
  "num_tokens": 472361.0,
920
  "step": 455
921
  },
922
  {
923
- "entropy": 0.9013674080371856,
924
  "epoch": 0.8027923211169284,
925
- "grad_norm": 0.731060266494751,
926
- "learning_rate": 4.049733570159858e-05,
927
- "loss": 0.9098,
928
- "mean_token_accuracy": 0.7663923025131225,
929
  "num_tokens": 477324.0,
930
  "step": 460
931
  },
932
  {
933
- "entropy": 1.0141965687274932,
934
  "epoch": 0.8115183246073299,
935
- "grad_norm": 0.6941847205162048,
936
- "learning_rate": 3.872113676731794e-05,
937
- "loss": 1.0052,
938
- "mean_token_accuracy": 0.747931432723999,
939
  "num_tokens": 483192.0,
940
  "step": 465
941
  },
942
  {
943
- "entropy": 0.9370434999465942,
944
  "epoch": 0.8202443280977313,
945
- "grad_norm": 0.7024611830711365,
946
- "learning_rate": 3.69449378330373e-05,
947
- "loss": 0.9472,
948
- "mean_token_accuracy": 0.7648843646049499,
949
  "num_tokens": 488771.0,
950
  "step": 470
951
  },
952
  {
953
- "entropy": 1.2138389825820923,
954
  "epoch": 0.8289703315881326,
955
- "grad_norm": 0.6181853413581848,
956
- "learning_rate": 3.516873889875667e-05,
957
- "loss": 1.1913,
958
- "mean_token_accuracy": 0.7199933648109436,
959
  "num_tokens": 495594.0,
960
  "step": 475
961
  },
962
  {
963
- "entropy": 0.9946802318096161,
964
  "epoch": 0.837696335078534,
965
- "grad_norm": 0.8392300009727478,
966
- "learning_rate": 3.339253996447602e-05,
967
- "loss": 0.8846,
968
- "mean_token_accuracy": 0.7601681053638458,
969
  "num_tokens": 501431.0,
970
  "step": 480
971
  },
972
  {
973
- "entropy": 1.0851561069488525,
974
  "epoch": 0.8464223385689355,
975
- "grad_norm": 0.7538084983825684,
976
- "learning_rate": 3.1616341030195386e-05,
977
- "loss": 1.0112,
978
- "mean_token_accuracy": 0.7339279770851135,
979
  "num_tokens": 506603.0,
980
  "step": 485
981
  },
982
  {
983
- "entropy": 0.9791876435279846,
984
  "epoch": 0.8551483420593369,
985
- "grad_norm": 0.6512478590011597,
986
- "learning_rate": 2.9840142095914742e-05,
987
- "loss": 0.9047,
988
- "mean_token_accuracy": 0.7637781441211701,
989
  "num_tokens": 511657.0,
990
  "step": 490
991
  },
992
  {
993
- "entropy": 0.8807009816169739,
994
  "epoch": 0.8638743455497382,
995
- "grad_norm": 1.0381275415420532,
996
- "learning_rate": 2.8063943161634105e-05,
997
- "loss": 0.7989,
998
- "mean_token_accuracy": 0.778300940990448,
999
  "num_tokens": 516346.0,
1000
  "step": 495
1001
  },
1002
  {
1003
- "entropy": 0.9706099390983581,
1004
  "epoch": 0.8726003490401396,
1005
- "grad_norm": 0.7503977417945862,
1006
- "learning_rate": 2.6287744227353468e-05,
1007
- "loss": 0.8633,
1008
- "mean_token_accuracy": 0.7602749288082122,
1009
  "num_tokens": 521118.0,
1010
  "step": 500
1011
  },
1012
  {
1013
- "entropy": 0.9830702662467956,
1014
  "epoch": 0.881326352530541,
1015
- "grad_norm": 0.7824010252952576,
1016
- "learning_rate": 2.4511545293072824e-05,
1017
- "loss": 0.8701,
1018
- "mean_token_accuracy": 0.7697367370128632,
1019
  "num_tokens": 525785.0,
1020
  "step": 505
1021
  },
1022
  {
1023
- "entropy": 1.0895603597164154,
1024
  "epoch": 0.8900523560209425,
1025
- "grad_norm": 0.6201509237289429,
1026
- "learning_rate": 2.2735346358792187e-05,
1027
- "loss": 0.999,
1028
- "mean_token_accuracy": 0.7415844857692718,
1029
  "num_tokens": 531296.0,
1030
  "step": 510
1031
  },
1032
  {
1033
- "entropy": 1.0094242215156555,
1034
  "epoch": 0.8987783595113438,
1035
- "grad_norm": 0.6755935549736023,
1036
- "learning_rate": 2.0959147424511547e-05,
1037
- "loss": 0.9283,
1038
- "mean_token_accuracy": 0.7551429510116577,
1039
  "num_tokens": 536703.0,
1040
  "step": 515
1041
  },
1042
  {
1043
- "entropy": 0.9846092760562897,
1044
  "epoch": 0.9075043630017452,
1045
- "grad_norm": 1.0709046125411987,
1046
- "learning_rate": 1.9182948490230906e-05,
1047
- "loss": 0.9426,
1048
- "mean_token_accuracy": 0.7431533575057984,
1049
  "num_tokens": 541044.0,
1050
  "step": 520
1051
  },
1052
  {
1053
- "entropy": 0.9527219116687775,
1054
  "epoch": 0.9162303664921466,
1055
- "grad_norm": 0.6978484392166138,
1056
- "learning_rate": 1.7406749555950266e-05,
1057
- "loss": 0.8911,
1058
- "mean_token_accuracy": 0.7653971970081329,
1059
  "num_tokens": 546836.0,
1060
  "step": 525
1061
  },
1062
  {
1063
- "entropy": 0.8855733275413513,
1064
  "epoch": 0.924956369982548,
1065
- "grad_norm": 0.9127820134162903,
1066
- "learning_rate": 1.563055062166963e-05,
1067
- "loss": 0.8139,
1068
- "mean_token_accuracy": 0.7775610208511352,
1069
  "num_tokens": 551666.0,
1070
  "step": 530
1071
  },
1072
  {
1073
- "entropy": 0.9590709805488586,
1074
  "epoch": 0.9336823734729494,
1075
- "grad_norm": 0.7010323405265808,
1076
- "learning_rate": 1.3854351687388988e-05,
1077
- "loss": 0.9334,
1078
- "mean_token_accuracy": 0.759455144405365,
1079
  "num_tokens": 556932.0,
1080
  "step": 535
1081
  },
1082
  {
1083
- "entropy": 0.9646609544754028,
1084
  "epoch": 0.9424083769633508,
1085
- "grad_norm": 0.5711817145347595,
1086
- "learning_rate": 1.2078152753108348e-05,
1087
- "loss": 0.9678,
1088
- "mean_token_accuracy": 0.7603480279445648,
1089
  "num_tokens": 562608.0,
1090
  "step": 540
1091
  },
1092
  {
1093
- "entropy": 1.0106851994991302,
1094
  "epoch": 0.9511343804537522,
1095
- "grad_norm": 0.7159616947174072,
1096
- "learning_rate": 1.030195381882771e-05,
1097
- "loss": 0.9285,
1098
- "mean_token_accuracy": 0.7571455597877502,
1099
  "num_tokens": 568591.0,
1100
  "step": 545
1101
  },
1102
  {
1103
- "entropy": 1.0865988105535507,
1104
  "epoch": 0.9598603839441536,
1105
- "grad_norm": 0.7819423079490662,
1106
- "learning_rate": 8.52575488454707e-06,
1107
- "loss": 1.1628,
1108
- "mean_token_accuracy": 0.7484253525733948,
1109
  "num_tokens": 572932.0,
1110
  "step": 550
1111
  },
1112
  {
1113
- "entropy": 0.8808701932430267,
1114
  "epoch": 0.9685863874345549,
1115
- "grad_norm": 0.6782775521278381,
1116
- "learning_rate": 6.74955595026643e-06,
1117
- "loss": 0.7661,
1118
- "mean_token_accuracy": 0.7774575710296631,
1119
  "num_tokens": 577818.0,
1120
  "step": 555
1121
  },
1122
  {
1123
- "entropy": 0.9287233471870422,
1124
  "epoch": 0.9773123909249564,
1125
- "grad_norm": 0.8206584453582764,
1126
- "learning_rate": 4.973357015985791e-06,
1127
- "loss": 0.7572,
1128
- "mean_token_accuracy": 0.7789205074310303,
1129
  "num_tokens": 581899.0,
1130
  "step": 560
1131
  },
1132
  {
1133
- "entropy": 0.8616821765899658,
1134
  "epoch": 0.9860383944153578,
1135
- "grad_norm": 0.6403858661651611,
1136
- "learning_rate": 3.197158081705151e-06,
1137
- "loss": 0.7855,
1138
- "mean_token_accuracy": 0.7908896625041961,
1139
  "num_tokens": 587864.0,
1140
  "step": 565
1141
  },
1142
  {
1143
- "entropy": 1.0378393054008483,
1144
  "epoch": 0.9947643979057592,
1145
- "grad_norm": 0.7347800731658936,
1146
- "learning_rate": 1.4209591474245117e-06,
1147
- "loss": 1.0399,
1148
- "mean_token_accuracy": 0.7526734173297882,
1149
  "num_tokens": 592990.0,
1150
  "step": 570
1151
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3178111910820007,
14
  "epoch": 0.008726003490401396,
15
+ "grad_norm": 1.3144103288650513,
16
+ "learning_rate": 2e-05,
17
+ "loss": 1.3891,
18
+ "mean_token_accuracy": 0.6749979019165039,
19
  "num_tokens": 4689.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.3977092504501343,
24
  "epoch": 0.017452006980802792,
25
+ "grad_norm": 0.9799396991729736,
26
+ "learning_rate": 4.5e-05,
27
+ "loss": 1.3155,
28
+ "mean_token_accuracy": 0.672690337896347,
29
  "num_tokens": 9076.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.1513380348682403,
34
  "epoch": 0.02617801047120419,
35
+ "grad_norm": 0.903432309627533,
36
+ "learning_rate": 7e-05,
37
+ "loss": 1.061,
38
+ "mean_token_accuracy": 0.728508323431015,
39
  "num_tokens": 13468.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.4041914224624634,
44
  "epoch": 0.034904013961605584,
45
+ "grad_norm": 0.6405840516090393,
46
+ "learning_rate": 9.5e-05,
47
+ "loss": 1.3689,
48
+ "mean_token_accuracy": 0.675159877538681,
49
  "num_tokens": 19687.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.3952594161033631,
54
  "epoch": 0.04363001745200698,
55
+ "grad_norm": 0.8190271854400635,
56
+ "learning_rate": 9.927667269439421e-05,
57
+ "loss": 1.2273,
58
+ "mean_token_accuracy": 0.6897247910499573,
59
  "num_tokens": 25321.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.2674797415733337,
64
  "epoch": 0.05235602094240838,
65
+ "grad_norm": 1.1967648267745972,
66
+ "learning_rate": 9.837251356238698e-05,
67
+ "loss": 1.199,
68
+ "mean_token_accuracy": 0.7063600897789002,
69
  "num_tokens": 29969.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.1735109210014343,
74
  "epoch": 0.06108202443280977,
75
+ "grad_norm": 0.8456419706344604,
76
+ "learning_rate": 9.746835443037975e-05,
77
+ "loss": 1.0482,
78
+ "mean_token_accuracy": 0.7267675757408142,
79
  "num_tokens": 34779.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.2172421634197235,
84
  "epoch": 0.06980802792321117,
85
+ "grad_norm": 0.8677243590354919,
86
+ "learning_rate": 9.656419529837252e-05,
87
+ "loss": 1.1052,
88
+ "mean_token_accuracy": 0.7135568201541901,
89
  "num_tokens": 40454.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.2951707720756531,
94
  "epoch": 0.07853403141361257,
95
+ "grad_norm": 0.7614730596542358,
96
+ "learning_rate": 9.566003616636529e-05,
97
+ "loss": 1.1822,
98
+ "mean_token_accuracy": 0.6976100087165833,
99
  "num_tokens": 45435.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.369252896308899,
104
  "epoch": 0.08726003490401396,
105
+ "grad_norm": 1.0491780042648315,
106
+ "learning_rate": 9.475587703435806e-05,
107
+ "loss": 1.303,
108
+ "mean_token_accuracy": 0.6986614286899566,
109
  "num_tokens": 50161.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.2288368880748748,
114
  "epoch": 0.09598603839441536,
115
+ "grad_norm": 0.8575630784034729,
116
+ "learning_rate": 9.385171790235083e-05,
117
+ "loss": 1.0351,
118
+ "mean_token_accuracy": 0.7264712870121002,
119
  "num_tokens": 55082.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.3860582947731017,
124
  "epoch": 0.10471204188481675,
125
+ "grad_norm": 0.7137913703918457,
126
+ "learning_rate": 9.29475587703436e-05,
127
+ "loss": 1.3276,
128
+ "mean_token_accuracy": 0.6737604200839996,
129
  "num_tokens": 61109.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.186289870738983,
134
  "epoch": 0.11343804537521815,
135
+ "grad_norm": 0.8168688416481018,
136
+ "learning_rate": 9.204339963833635e-05,
137
+ "loss": 1.0535,
138
+ "mean_token_accuracy": 0.7318423926830292,
139
  "num_tokens": 65130.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.3940527975559234,
144
  "epoch": 0.12216404886561955,
145
+ "grad_norm": 0.7069775462150574,
146
+ "learning_rate": 9.113924050632912e-05,
147
+ "loss": 1.3631,
148
+ "mean_token_accuracy": 0.6877448439598084,
149
  "num_tokens": 71720.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.248500007390976,
154
  "epoch": 0.13089005235602094,
155
+ "grad_norm": 0.9421133399009705,
156
+ "learning_rate": 9.023508137432188e-05,
157
+ "loss": 1.1336,
158
+ "mean_token_accuracy": 0.7071475625038147,
159
  "num_tokens": 76475.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.1854017496109008,
164
  "epoch": 0.13961605584642234,
165
+ "grad_norm": 0.8254420757293701,
166
+ "learning_rate": 8.933092224231465e-05,
167
+ "loss": 1.0336,
168
+ "mean_token_accuracy": 0.7307356059551239,
169
  "num_tokens": 82239.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.3216606438159944,
174
  "epoch": 0.14834205933682373,
175
+ "grad_norm": 0.9723417162895203,
176
+ "learning_rate": 8.842676311030742e-05,
177
+ "loss": 1.2206,
178
+ "mean_token_accuracy": 0.6910631775856018,
179
  "num_tokens": 86033.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.2633710026741027,
184
  "epoch": 0.15706806282722513,
185
+ "grad_norm": 0.8575649857521057,
186
+ "learning_rate": 8.752260397830019e-05,
187
+ "loss": 1.1781,
188
+ "mean_token_accuracy": 0.711302649974823,
189
  "num_tokens": 90170.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.1930577754974365,
194
  "epoch": 0.16579406631762653,
195
+ "grad_norm": 0.9084628224372864,
196
+ "learning_rate": 8.661844484629296e-05,
197
+ "loss": 1.0925,
198
+ "mean_token_accuracy": 0.7296910464763642,
199
  "num_tokens": 95473.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.097593402862549,
204
  "epoch": 0.17452006980802792,
205
+ "grad_norm": 0.9089357852935791,
206
+ "learning_rate": 8.571428571428571e-05,
207
+ "loss": 0.9541,
208
+ "mean_token_accuracy": 0.7425937652587891,
209
  "num_tokens": 99423.0,
210
  "step": 100
211
  },
212
  {
213
+ "entropy": 1.1105962693691254,
214
  "epoch": 0.18324607329842932,
215
+ "grad_norm": 0.8107589483261108,
216
+ "learning_rate": 8.481012658227848e-05,
217
+ "loss": 1.0277,
218
+ "mean_token_accuracy": 0.7366280615329742,
219
  "num_tokens": 104249.0,
220
  "step": 105
221
  },
222
  {
223
+ "entropy": 1.121857112646103,
224
  "epoch": 0.19197207678883071,
225
+ "grad_norm": 0.8907290101051331,
226
+ "learning_rate": 8.390596745027125e-05,
227
+ "loss": 0.9944,
228
+ "mean_token_accuracy": 0.7407838046550751,
229
  "num_tokens": 109205.0,
230
  "step": 110
231
  },
232
  {
233
+ "entropy": 1.1668152272701264,
234
  "epoch": 0.2006980802792321,
235
+ "grad_norm": 0.7008240222930908,
236
+ "learning_rate": 8.300180831826402e-05,
237
+ "loss": 1.1461,
238
+ "mean_token_accuracy": 0.7233073830604553,
239
  "num_tokens": 115207.0,
240
  "step": 115
241
  },
242
  {
243
+ "entropy": 1.0229185461997985,
244
  "epoch": 0.2094240837696335,
245
+ "grad_norm": 1.1023997068405151,
246
+ "learning_rate": 8.209764918625679e-05,
247
+ "loss": 0.9365,
248
+ "mean_token_accuracy": 0.7564353287220001,
249
  "num_tokens": 120323.0,
250
  "step": 120
251
  },
252
  {
253
+ "entropy": 1.205068576335907,
254
  "epoch": 0.2181500872600349,
255
+ "grad_norm": 0.9460663199424744,
256
+ "learning_rate": 8.119349005424956e-05,
257
+ "loss": 1.1133,
258
+ "mean_token_accuracy": 0.7165132164955139,
259
  "num_tokens": 125253.0,
260
  "step": 125
261
  },
262
  {
263
+ "entropy": 1.0808123528957367,
264
  "epoch": 0.2268760907504363,
265
+ "grad_norm": 0.9484661221504211,
266
+ "learning_rate": 8.028933092224232e-05,
267
+ "loss": 0.9489,
268
+ "mean_token_accuracy": 0.7440634310245514,
269
  "num_tokens": 130360.0,
270
  "step": 130
271
  },
272
  {
273
+ "entropy": 1.152905023097992,
274
  "epoch": 0.2356020942408377,
275
+ "grad_norm": 0.8243975043296814,
276
+ "learning_rate": 7.938517179023509e-05,
277
+ "loss": 1.0735,
278
+ "mean_token_accuracy": 0.7156176209449768,
279
  "num_tokens": 135538.0,
280
  "step": 135
281
  },
282
  {
283
+ "entropy": 1.2132245182991028,
284
  "epoch": 0.2443280977312391,
285
+ "grad_norm": 0.8641315698623657,
286
+ "learning_rate": 7.848101265822784e-05,
287
+ "loss": 1.0773,
288
+ "mean_token_accuracy": 0.7272770285606385,
289
  "num_tokens": 140532.0,
290
  "step": 140
291
  },
292
  {
293
+ "entropy": 1.0758103907108307,
294
  "epoch": 0.2530541012216405,
295
+ "grad_norm": 0.7532796859741211,
296
+ "learning_rate": 7.757685352622061e-05,
297
+ "loss": 0.9522,
298
+ "mean_token_accuracy": 0.7446901857852936,
299
  "num_tokens": 145908.0,
300
  "step": 145
301
  },
302
  {
303
+ "entropy": 1.1179533541202544,
304
  "epoch": 0.2617801047120419,
305
+ "grad_norm": 0.5681455135345459,
306
+ "learning_rate": 7.667269439421338e-05,
307
+ "loss": 1.0841,
308
+ "mean_token_accuracy": 0.730886948108673,
309
  "num_tokens": 151148.0,
310
  "step": 150
311
  },
312
  {
313
+ "entropy": 1.1392421841621398,
314
  "epoch": 0.2705061082024433,
315
+ "grad_norm": 0.6956934332847595,
316
+ "learning_rate": 7.576853526220615e-05,
317
+ "loss": 1.0957,
318
+ "mean_token_accuracy": 0.7271894216537476,
319
  "num_tokens": 157506.0,
320
  "step": 155
321
  },
322
  {
323
+ "entropy": 1.1840439975261687,
324
  "epoch": 0.2792321116928447,
325
+ "grad_norm": 1.076908826828003,
326
+ "learning_rate": 7.486437613019892e-05,
327
+ "loss": 1.1233,
328
+ "mean_token_accuracy": 0.7299613058567047,
329
  "num_tokens": 162992.0,
330
  "step": 160
331
  },
332
  {
333
+ "entropy": 1.0738611757755279,
334
  "epoch": 0.2879581151832461,
335
+ "grad_norm": 0.7862851023674011,
336
+ "learning_rate": 7.396021699819169e-05,
337
+ "loss": 0.9063,
338
+ "mean_token_accuracy": 0.7484164655208587,
339
  "num_tokens": 167640.0,
340
  "step": 165
341
  },
342
  {
343
+ "entropy": 1.0928088903427124,
344
  "epoch": 0.29668411867364747,
345
+ "grad_norm": 0.7506266236305237,
346
+ "learning_rate": 7.305605786618446e-05,
347
+ "loss": 1.0254,
348
+ "mean_token_accuracy": 0.751311433315277,
349
  "num_tokens": 173423.0,
350
  "step": 170
351
  },
352
  {
353
+ "entropy": 1.1541666328907012,
354
  "epoch": 0.3054101221640489,
355
+ "grad_norm": 0.8343175053596497,
356
+ "learning_rate": 7.215189873417722e-05,
357
+ "loss": 1.0956,
358
+ "mean_token_accuracy": 0.7194818913936615,
359
  "num_tokens": 178986.0,
360
  "step": 175
361
  },
362
  {
363
+ "entropy": 0.9744765520095825,
364
  "epoch": 0.31413612565445026,
365
+ "grad_norm": 1.1534874439239502,
366
+ "learning_rate": 7.124773960216999e-05,
367
+ "loss": 0.8336,
368
+ "mean_token_accuracy": 0.772490268945694,
369
  "num_tokens": 183459.0,
370
  "step": 180
371
  },
372
  {
373
+ "entropy": 1.1847937881946564,
374
  "epoch": 0.3228621291448517,
375
+ "grad_norm": 1.7201915979385376,
376
+ "learning_rate": 7.034358047016275e-05,
377
+ "loss": 1.1898,
378
+ "mean_token_accuracy": 0.7138440608978271,
379
  "num_tokens": 187614.0,
380
  "step": 185
381
  },
382
  {
383
+ "entropy": 1.1254307150840759,
384
  "epoch": 0.33158813263525305,
385
+ "grad_norm": 0.8325474262237549,
386
+ "learning_rate": 6.943942133815552e-05,
387
+ "loss": 1.0264,
388
+ "mean_token_accuracy": 0.7251651823520661,
389
  "num_tokens": 192316.0,
390
  "step": 190
391
  },
392
  {
393
+ "entropy": 1.1271761000156402,
394
  "epoch": 0.3403141361256545,
395
+ "grad_norm": 0.7783677577972412,
396
+ "learning_rate": 6.85352622061483e-05,
397
+ "loss": 0.9858,
398
+ "mean_token_accuracy": 0.7338063836097717,
399
  "num_tokens": 197728.0,
400
  "step": 195
401
  },
402
  {
403
+ "entropy": 1.1959772825241088,
404
  "epoch": 0.34904013961605584,
405
+ "grad_norm": 0.6065697073936462,
406
+ "learning_rate": 6.763110307414105e-05,
407
+ "loss": 1.1579,
408
+ "mean_token_accuracy": 0.7148371398448944,
409
  "num_tokens": 204502.0,
410
  "step": 200
411
  },
412
  {
413
+ "entropy": 1.1796980381011963,
414
  "epoch": 0.35776614310645727,
415
+ "grad_norm": 0.9461066722869873,
416
+ "learning_rate": 6.672694394213382e-05,
417
+ "loss": 1.1302,
418
+ "mean_token_accuracy": 0.7260191440582275,
419
  "num_tokens": 209069.0,
420
  "step": 205
421
  },
422
  {
423
+ "entropy": 1.2325600683689117,
424
  "epoch": 0.36649214659685864,
425
+ "grad_norm": 0.8394590020179749,
426
+ "learning_rate": 6.582278481012658e-05,
427
+ "loss": 1.2181,
428
+ "mean_token_accuracy": 0.7085430741310119,
429
  "num_tokens": 214185.0,
430
  "step": 210
431
  },
432
  {
433
+ "entropy": 1.177025467157364,
434
  "epoch": 0.37521815008726006,
435
+ "grad_norm": 0.7099196910858154,
436
+ "learning_rate": 6.491862567811935e-05,
437
+ "loss": 1.0698,
438
+ "mean_token_accuracy": 0.7251012861728668,
439
  "num_tokens": 219312.0,
440
  "step": 215
441
  },
442
  {
443
+ "entropy": 1.1909004271030426,
444
  "epoch": 0.38394415357766143,
445
+ "grad_norm": 0.8594046831130981,
446
+ "learning_rate": 6.401446654611211e-05,
447
+ "loss": 1.1085,
448
+ "mean_token_accuracy": 0.7182255506515502,
449
  "num_tokens": 225616.0,
450
  "step": 220
451
  },
452
  {
453
+ "entropy": 1.0699391067028046,
454
  "epoch": 0.39267015706806285,
455
+ "grad_norm": 0.9555573463439941,
456
+ "learning_rate": 6.311030741410488e-05,
457
+ "loss": 0.9639,
458
+ "mean_token_accuracy": 0.7441750049591065,
459
  "num_tokens": 230696.0,
460
  "step": 225
461
  },
462
  {
463
+ "entropy": 1.1240986049175263,
464
  "epoch": 0.4013961605584642,
465
+ "grad_norm": 0.9631731510162354,
466
+ "learning_rate": 6.220614828209765e-05,
467
+ "loss": 1.083,
468
+ "mean_token_accuracy": 0.7323605120182037,
469
  "num_tokens": 236685.0,
470
  "step": 230
471
  },
472
  {
473
+ "entropy": 1.1956682801246643,
474
  "epoch": 0.41012216404886565,
475
+ "grad_norm": 1.0035083293914795,
476
+ "learning_rate": 6.130198915009042e-05,
477
+ "loss": 1.0327,
478
+ "mean_token_accuracy": 0.7194140493869782,
479
  "num_tokens": 241211.0,
480
  "step": 235
481
  },
482
  {
483
+ "entropy": 1.2125337064266204,
484
  "epoch": 0.418848167539267,
485
+ "grad_norm": 0.6551855802536011,
486
+ "learning_rate": 6.039783001808319e-05,
487
+ "loss": 1.1878,
488
+ "mean_token_accuracy": 0.7105167448520661,
489
  "num_tokens": 246360.0,
490
  "step": 240
491
  },
492
  {
493
+ "entropy": 1.1998135149478912,
494
  "epoch": 0.42757417102966844,
495
+ "grad_norm": 0.6873531341552734,
496
+ "learning_rate": 5.949367088607595e-05,
497
+ "loss": 1.1622,
498
+ "mean_token_accuracy": 0.7172465562820435,
499
  "num_tokens": 252717.0,
500
  "step": 245
501
  },
502
  {
503
+ "entropy": 1.1167365849018096,
504
  "epoch": 0.4363001745200698,
505
+ "grad_norm": 0.8184595108032227,
506
+ "learning_rate": 5.858951175406872e-05,
507
+ "loss": 1.0069,
508
+ "mean_token_accuracy": 0.7279561281204223,
509
  "num_tokens": 257249.0,
510
  "step": 250
511
  },
512
  {
513
+ "entropy": 0.9268409907817841,
514
  "epoch": 0.44502617801047123,
515
+ "grad_norm": 1.001150131225586,
516
+ "learning_rate": 5.768535262206148e-05,
517
+ "loss": 0.8418,
518
+ "mean_token_accuracy": 0.7676967799663543,
519
  "num_tokens": 262381.0,
520
  "step": 255
521
  },
522
  {
523
+ "entropy": 1.0302452743053436,
524
  "epoch": 0.4537521815008726,
525
+ "grad_norm": 0.8092007040977478,
526
+ "learning_rate": 5.678119349005425e-05,
527
+ "loss": 1.0422,
528
+ "mean_token_accuracy": 0.7399900496006012,
529
  "num_tokens": 267410.0,
530
  "step": 260
531
  },
532
  {
533
+ "entropy": 0.8591890454292297,
534
  "epoch": 0.462478184991274,
535
+ "grad_norm": 0.9062550067901611,
536
+ "learning_rate": 5.587703435804702e-05,
537
+ "loss": 0.7452,
538
+ "mean_token_accuracy": 0.7874986886978149,
539
  "num_tokens": 272109.0,
540
  "step": 265
541
  },
542
  {
543
+ "entropy": 1.0930480301380157,
544
  "epoch": 0.4712041884816754,
545
+ "grad_norm": 0.7568405270576477,
546
+ "learning_rate": 5.497287522603979e-05,
547
+ "loss": 1.0479,
548
+ "mean_token_accuracy": 0.7330868363380432,
549
  "num_tokens": 276623.0,
550
  "step": 270
551
  },
552
  {
553
+ "entropy": 1.1516533672809601,
554
  "epoch": 0.4799301919720768,
555
+ "grad_norm": 0.6892510652542114,
556
+ "learning_rate": 5.406871609403256e-05,
557
+ "loss": 1.0581,
558
+ "mean_token_accuracy": 0.7289695620536805,
559
  "num_tokens": 283256.0,
560
  "step": 275
561
  },
562
  {
563
+ "entropy": 1.0084587812423706,
564
  "epoch": 0.4886561954624782,
565
+ "grad_norm": 0.8208179473876953,
566
+ "learning_rate": 5.3164556962025316e-05,
567
+ "loss": 0.9185,
568
+ "mean_token_accuracy": 0.7670749068260193,
569
  "num_tokens": 288780.0,
570
  "step": 280
571
  },
572
  {
573
+ "entropy": 0.9972082734107971,
574
  "epoch": 0.4973821989528796,
575
+ "grad_norm": 0.917007565498352,
576
+ "learning_rate": 5.2260397830018085e-05,
577
+ "loss": 0.9135,
578
+ "mean_token_accuracy": 0.7594508528709412,
579
  "num_tokens": 293775.0,
580
  "step": 285
581
  },
582
  {
583
+ "entropy": 0.9781757235527039,
584
  "epoch": 0.506108202443281,
585
+ "grad_norm": 0.8585111498832703,
586
+ "learning_rate": 5.135623869801085e-05,
587
+ "loss": 0.9062,
588
+ "mean_token_accuracy": 0.7560348510742188,
589
  "num_tokens": 299667.0,
590
  "step": 290
591
  },
592
  {
593
+ "entropy": 1.0113820374011993,
594
  "epoch": 0.5148342059336823,
595
+ "grad_norm": 0.9319034814834595,
596
+ "learning_rate": 5.045207956600362e-05,
597
+ "loss": 0.9816,
598
+ "mean_token_accuracy": 0.7550252437591553,
599
  "num_tokens": 304401.0,
600
  "step": 295
601
  },
602
  {
603
+ "entropy": 1.0391628086566924,
604
  "epoch": 0.5235602094240838,
605
+ "grad_norm": 0.6635801196098328,
606
+ "learning_rate": 4.954792043399639e-05,
607
+ "loss": 0.921,
608
+ "mean_token_accuracy": 0.7602339625358582,
609
  "num_tokens": 310983.0,
610
  "step": 300
611
  },
612
  {
613
+ "entropy": 1.1690425276756287,
614
  "epoch": 0.5322862129144852,
615
+ "grad_norm": 0.8804797530174255,
616
+ "learning_rate": 4.864376130198916e-05,
617
+ "loss": 1.0541,
618
+ "mean_token_accuracy": 0.7173005819320679,
619
  "num_tokens": 315721.0,
620
  "step": 305
621
  },
622
  {
623
+ "entropy": 1.0556828916072845,
624
  "epoch": 0.5410122164048866,
625
+ "grad_norm": 0.7090078592300415,
626
+ "learning_rate": 4.773960216998192e-05,
627
+ "loss": 1.0045,
628
+ "mean_token_accuracy": 0.7494379639625549,
629
  "num_tokens": 321834.0,
630
  "step": 310
631
  },
632
  {
633
+ "entropy": 1.1323361456394196,
634
  "epoch": 0.5497382198952879,
635
+ "grad_norm": 0.8676872253417969,
636
+ "learning_rate": 4.683544303797468e-05,
637
+ "loss": 1.0833,
638
+ "mean_token_accuracy": 0.7251208662986756,
639
  "num_tokens": 327063.0,
640
  "step": 315
641
  },
642
  {
643
+ "entropy": 1.2482255697250366,
644
  "epoch": 0.5584642233856894,
645
+ "grad_norm": 0.8192646503448486,
646
+ "learning_rate": 4.593128390596745e-05,
647
+ "loss": 1.2016,
648
+ "mean_token_accuracy": 0.7059409141540527,
649
  "num_tokens": 332900.0,
650
  "step": 320
651
  },
652
  {
653
+ "entropy": 1.0314257562160491,
654
  "epoch": 0.5671902268760908,
655
+ "grad_norm": 0.6856659650802612,
656
+ "learning_rate": 4.5027124773960215e-05,
657
+ "loss": 0.9671,
658
+ "mean_token_accuracy": 0.7470330059528351,
659
  "num_tokens": 337508.0,
660
  "step": 325
661
  },
662
  {
663
+ "entropy": 1.060635393857956,
664
  "epoch": 0.5759162303664922,
665
+ "grad_norm": 1.0749919414520264,
666
+ "learning_rate": 4.4122965641952984e-05,
667
+ "loss": 0.9374,
668
+ "mean_token_accuracy": 0.74344761967659,
669
  "num_tokens": 342644.0,
670
  "step": 330
671
  },
672
  {
673
+ "entropy": 1.1081909716129303,
674
  "epoch": 0.5846422338568935,
675
+ "grad_norm": 1.2220737934112549,
676
+ "learning_rate": 4.3218806509945754e-05,
677
+ "loss": 1.1298,
678
+ "mean_token_accuracy": 0.7298382222652435,
679
  "num_tokens": 347547.0,
680
  "step": 335
681
  },
682
  {
683
+ "entropy": 1.0464669644832612,
684
  "epoch": 0.5933682373472949,
685
+ "grad_norm": 1.0865596532821655,
686
+ "learning_rate": 4.2314647377938523e-05,
687
+ "loss": 0.9198,
688
+ "mean_token_accuracy": 0.7569529414176941,
689
  "num_tokens": 351932.0,
690
  "step": 340
691
  },
692
  {
693
+ "entropy": 1.062287026643753,
694
  "epoch": 0.6020942408376964,
695
+ "grad_norm": 0.9347847104072571,
696
+ "learning_rate": 4.1410488245931286e-05,
697
+ "loss": 0.9329,
698
+ "mean_token_accuracy": 0.7522122919559479,
699
  "num_tokens": 357045.0,
700
  "step": 345
701
  },
702
  {
703
+ "entropy": 1.0700653612613678,
704
  "epoch": 0.6108202443280978,
705
+ "grad_norm": 0.9848562479019165,
706
+ "learning_rate": 4.050632911392405e-05,
707
+ "loss": 1.0067,
708
+ "mean_token_accuracy": 0.7463895082473755,
709
  "num_tokens": 362284.0,
710
  "step": 350
711
  },
712
  {
713
+ "entropy": 0.9772404789924621,
714
  "epoch": 0.6195462478184991,
715
+ "grad_norm": 0.7483623623847961,
716
+ "learning_rate": 3.960216998191682e-05,
717
+ "loss": 0.9363,
718
+ "mean_token_accuracy": 0.7513622403144836,
719
  "num_tokens": 367217.0,
720
  "step": 355
721
  },
722
  {
723
+ "entropy": 1.0971436262130738,
724
  "epoch": 0.6282722513089005,
725
+ "grad_norm": 0.7464447021484375,
726
+ "learning_rate": 3.869801084990959e-05,
727
+ "loss": 1.0344,
728
+ "mean_token_accuracy": 0.7349282145500183,
729
  "num_tokens": 372678.0,
730
  "step": 360
731
  },
732
  {
733
+ "entropy": 0.9754513740539551,
734
  "epoch": 0.6369982547993019,
735
+ "grad_norm": 0.8585572838783264,
736
+ "learning_rate": 3.779385171790235e-05,
737
+ "loss": 0.8401,
738
+ "mean_token_accuracy": 0.7676230728626251,
739
  "num_tokens": 377750.0,
740
  "step": 365
741
  },
742
  {
743
+ "entropy": 1.0239232301712036,
744
  "epoch": 0.6457242582897034,
745
+ "grad_norm": 0.6581646203994751,
746
+ "learning_rate": 3.688969258589512e-05,
747
+ "loss": 0.9598,
748
+ "mean_token_accuracy": 0.7424759864807129,
749
  "num_tokens": 383406.0,
750
  "step": 370
751
  },
752
  {
753
+ "entropy": 1.2168250560760498,
754
  "epoch": 0.6544502617801047,
755
+ "grad_norm": 0.5685426592826843,
756
+ "learning_rate": 3.598553345388789e-05,
757
+ "loss": 1.1598,
758
+ "mean_token_accuracy": 0.7155482172966003,
759
  "num_tokens": 389696.0,
760
  "step": 375
761
  },
762
  {
763
+ "entropy": 1.0621464610099793,
764
  "epoch": 0.6631762652705061,
765
+ "grad_norm": 0.6709677577018738,
766
+ "learning_rate": 3.508137432188065e-05,
767
+ "loss": 0.9943,
768
+ "mean_token_accuracy": 0.7490235090255737,
769
  "num_tokens": 394688.0,
770
  "step": 380
771
  },
772
  {
773
+ "entropy": 1.1910510420799256,
774
  "epoch": 0.6719022687609075,
775
+ "grad_norm": 0.7342821359634399,
776
+ "learning_rate": 3.4177215189873416e-05,
777
+ "loss": 1.2074,
778
+ "mean_token_accuracy": 0.7063271880149842,
779
  "num_tokens": 400319.0,
780
  "step": 385
781
  },
782
  {
783
+ "entropy": 1.053422886133194,
784
  "epoch": 0.680628272251309,
785
+ "grad_norm": 0.9014110565185547,
786
+ "learning_rate": 3.3273056057866185e-05,
787
+ "loss": 0.957,
788
+ "mean_token_accuracy": 0.7507923722267151,
789
  "num_tokens": 404918.0,
790
  "step": 390
791
  },
792
  {
793
+ "entropy": 1.12480788230896,
794
  "epoch": 0.6893542757417103,
795
+ "grad_norm": 1.1246176958084106,
796
+ "learning_rate": 3.2368896925858955e-05,
797
+ "loss": 0.9808,
798
+ "mean_token_accuracy": 0.735921996831894,
799
  "num_tokens": 410635.0,
800
  "step": 395
801
  },
802
  {
803
+ "entropy": 1.0456868290901185,
804
  "epoch": 0.6980802792321117,
805
+ "grad_norm": 1.0994240045547485,
806
+ "learning_rate": 3.146473779385172e-05,
807
+ "loss": 0.9792,
808
+ "mean_token_accuracy": 0.7470909178256988,
809
  "num_tokens": 415005.0,
810
  "step": 400
811
  },
812
  {
813
+ "entropy": 1.1365525662899016,
814
  "epoch": 0.7068062827225131,
815
+ "grad_norm": 0.9170509576797485,
816
+ "learning_rate": 3.056057866184449e-05,
817
+ "loss": 1.0618,
818
+ "mean_token_accuracy": 0.7218761801719665,
819
  "num_tokens": 419347.0,
820
  "step": 405
821
  },
822
  {
823
+ "entropy": 1.1176020860671998,
824
  "epoch": 0.7155322862129145,
825
+ "grad_norm": 0.9267513155937195,
826
+ "learning_rate": 2.9656419529837253e-05,
827
+ "loss": 1.057,
828
+ "mean_token_accuracy": 0.7263721942901611,
829
  "num_tokens": 424588.0,
830
  "step": 410
831
  },
832
  {
833
+ "entropy": 0.997305154800415,
834
  "epoch": 0.7242582897033158,
835
+ "grad_norm": 1.025118350982666,
836
+ "learning_rate": 2.8752260397830023e-05,
837
+ "loss": 0.9536,
838
+ "mean_token_accuracy": 0.7515883207321167,
839
  "num_tokens": 428943.0,
840
  "step": 415
841
  },
842
  {
843
+ "entropy": 1.0053665816783905,
844
  "epoch": 0.7329842931937173,
845
+ "grad_norm": 0.7526717782020569,
846
+ "learning_rate": 2.7848101265822786e-05,
847
+ "loss": 0.9632,
848
+ "mean_token_accuracy": 0.7552886366844177,
849
  "num_tokens": 435326.0,
850
  "step": 420
851
  },
852
  {
853
+ "entropy": 0.967602401971817,
854
  "epoch": 0.7417102966841187,
855
+ "grad_norm": 0.8893188834190369,
856
+ "learning_rate": 2.6943942133815552e-05,
857
+ "loss": 0.8872,
858
+ "mean_token_accuracy": 0.7692794561386108,
859
  "num_tokens": 441702.0,
860
  "step": 425
861
  },
862
  {
863
+ "entropy": 1.099077934026718,
864
  "epoch": 0.7504363001745201,
865
+ "grad_norm": 0.7207973599433899,
866
+ "learning_rate": 2.603978300180832e-05,
867
+ "loss": 1.0975,
868
+ "mean_token_accuracy": 0.7474483132362366,
869
  "num_tokens": 446862.0,
870
  "step": 430
871
  },
872
  {
873
+ "entropy": 0.9622289180755615,
874
  "epoch": 0.7591623036649214,
875
+ "grad_norm": 1.0675289630889893,
876
+ "learning_rate": 2.5135623869801084e-05,
877
+ "loss": 0.8535,
878
+ "mean_token_accuracy": 0.7668149411678314,
879
  "num_tokens": 451024.0,
880
  "step": 435
881
  },
882
  {
883
+ "entropy": 1.0761532068252564,
884
  "epoch": 0.7678883071553229,
885
+ "grad_norm": 0.7296017408370972,
886
+ "learning_rate": 2.423146473779385e-05,
887
+ "loss": 1.0343,
888
+ "mean_token_accuracy": 0.744589650630951,
889
  "num_tokens": 457528.0,
890
  "step": 440
891
  },
892
  {
893
+ "entropy": 0.9684066355228425,
894
  "epoch": 0.7766143106457243,
895
+ "grad_norm": 0.931486189365387,
896
+ "learning_rate": 2.332730560578662e-05,
897
+ "loss": 0.8462,
898
+ "mean_token_accuracy": 0.7654486835002899,
899
  "num_tokens": 462267.0,
900
  "step": 445
901
  },
902
  {
903
+ "entropy": 1.082474511861801,
904
  "epoch": 0.7853403141361257,
905
+ "grad_norm": 0.95744389295578,
906
+ "learning_rate": 2.2423146473779386e-05,
907
+ "loss": 1.0284,
908
+ "mean_token_accuracy": 0.7476698577404022,
909
  "num_tokens": 467337.0,
910
  "step": 450
911
  },
912
  {
913
+ "entropy": 0.9113831341266632,
914
  "epoch": 0.794066317626527,
915
+ "grad_norm": 0.6706241965293884,
916
+ "learning_rate": 2.1518987341772153e-05,
917
+ "loss": 0.8162,
918
+ "mean_token_accuracy": 0.7696994006633758,
919
  "num_tokens": 472361.0,
920
  "step": 455
921
  },
922
  {
923
+ "entropy": 0.932946115732193,
924
  "epoch": 0.8027923211169284,
925
+ "grad_norm": 0.8883486390113831,
926
+ "learning_rate": 2.061482820976492e-05,
927
+ "loss": 0.9311,
928
+ "mean_token_accuracy": 0.761142897605896,
929
  "num_tokens": 477324.0,
930
  "step": 460
931
  },
932
  {
933
+ "entropy": 1.0308326423168181,
934
  "epoch": 0.8115183246073299,
935
+ "grad_norm": 0.8458265066146851,
936
+ "learning_rate": 1.971066907775769e-05,
937
+ "loss": 1.0271,
938
+ "mean_token_accuracy": 0.74349946975708,
939
  "num_tokens": 483192.0,
940
  "step": 465
941
  },
942
  {
943
+ "entropy": 0.957154580950737,
944
  "epoch": 0.8202443280977313,
945
+ "grad_norm": 0.8496681451797485,
946
+ "learning_rate": 1.8806509945750454e-05,
947
+ "loss": 0.9683,
948
+ "mean_token_accuracy": 0.7599529087543487,
949
  "num_tokens": 488771.0,
950
  "step": 470
951
  },
952
  {
953
+ "entropy": 1.2366232931613923,
954
  "epoch": 0.8289703315881326,
955
+ "grad_norm": 0.736077606678009,
956
+ "learning_rate": 1.7902350813743217e-05,
957
+ "loss": 1.2093,
958
+ "mean_token_accuracy": 0.7199373066425323,
959
  "num_tokens": 495594.0,
960
  "step": 475
961
  },
962
  {
963
+ "entropy": 1.0375137269496917,
964
  "epoch": 0.837696335078534,
965
+ "grad_norm": 1.0149139165878296,
966
+ "learning_rate": 1.6998191681735987e-05,
967
+ "loss": 0.9121,
968
+ "mean_token_accuracy": 0.7566475987434387,
969
  "num_tokens": 501431.0,
970
  "step": 480
971
  },
972
  {
973
+ "entropy": 1.119648665189743,
974
  "epoch": 0.8464223385689355,
975
+ "grad_norm": 0.9202610850334167,
976
+ "learning_rate": 1.6094032549728753e-05,
977
+ "loss": 1.0352,
978
+ "mean_token_accuracy": 0.7305955648422241,
979
  "num_tokens": 506603.0,
980
  "step": 485
981
  },
982
  {
983
+ "entropy": 1.023408180475235,
984
  "epoch": 0.8551483420593369,
985
+ "grad_norm": 0.793682873249054,
986
+ "learning_rate": 1.5189873417721521e-05,
987
+ "loss": 0.9317,
988
+ "mean_token_accuracy": 0.7577860534191132,
989
  "num_tokens": 511657.0,
990
  "step": 490
991
  },
992
  {
993
+ "entropy": 0.9122473716735839,
994
  "epoch": 0.8638743455497382,
995
+ "grad_norm": 1.3430960178375244,
996
+ "learning_rate": 1.4285714285714285e-05,
997
+ "loss": 0.829,
998
+ "mean_token_accuracy": 0.7760975360870361,
999
  "num_tokens": 516346.0,
1000
  "step": 495
1001
  },
1002
  {
1003
+ "entropy": 1.0150888204574584,
1004
  "epoch": 0.8726003490401396,
1005
+ "grad_norm": 0.8981541395187378,
1006
+ "learning_rate": 1.3381555153707053e-05,
1007
+ "loss": 0.8899,
1008
+ "mean_token_accuracy": 0.7577051818370819,
1009
  "num_tokens": 521118.0,
1010
  "step": 500
1011
  },
1012
  {
1013
+ "entropy": 1.0423671543598174,
1014
  "epoch": 0.881326352530541,
1015
+ "grad_norm": 0.9293156862258911,
1016
+ "learning_rate": 1.247739602169982e-05,
1017
+ "loss": 0.9083,
1018
+ "mean_token_accuracy": 0.7593627750873566,
1019
  "num_tokens": 525785.0,
1020
  "step": 505
1021
  },
1022
  {
1023
+ "entropy": 1.1302937150001526,
1024
  "epoch": 0.8900523560209425,
1025
+ "grad_norm": 0.7647919058799744,
1026
+ "learning_rate": 1.1573236889692586e-05,
1027
+ "loss": 1.0402,
1028
+ "mean_token_accuracy": 0.7308323442935943,
1029
  "num_tokens": 531296.0,
1030
  "step": 510
1031
  },
1032
  {
1033
+ "entropy": 1.0633261859416963,
1034
  "epoch": 0.8987783595113438,
1035
+ "grad_norm": 0.8269041776657104,
1036
+ "learning_rate": 1.0669077757685354e-05,
1037
+ "loss": 0.9716,
1038
+ "mean_token_accuracy": 0.7478925228118897,
1039
  "num_tokens": 536703.0,
1040
  "step": 515
1041
  },
1042
  {
1043
+ "entropy": 1.024457675218582,
1044
  "epoch": 0.9075043630017452,
1045
+ "grad_norm": 1.324475646018982,
1046
+ "learning_rate": 9.76491862567812e-06,
1047
+ "loss": 0.9741,
1048
+ "mean_token_accuracy": 0.7305623233318329,
1049
  "num_tokens": 541044.0,
1050
  "step": 520
1051
  },
1052
  {
1053
+ "entropy": 1.0003005802631377,
1054
  "epoch": 0.9162303664921466,
1055
+ "grad_norm": 0.9192453622817993,
1056
+ "learning_rate": 8.860759493670886e-06,
1057
+ "loss": 0.9253,
1058
+ "mean_token_accuracy": 0.7594391465187073,
1059
  "num_tokens": 546836.0,
1060
  "step": 525
1061
  },
1062
  {
1063
+ "entropy": 0.9267178893089294,
1064
  "epoch": 0.924956369982548,
1065
+ "grad_norm": 1.1192840337753296,
1066
+ "learning_rate": 7.956600361663654e-06,
1067
+ "loss": 0.849,
1068
+ "mean_token_accuracy": 0.7724036395549774,
1069
  "num_tokens": 551666.0,
1070
  "step": 530
1071
  },
1072
  {
1073
+ "entropy": 0.9956534147262573,
1074
  "epoch": 0.9336823734729494,
1075
+ "grad_norm": 0.8383934497833252,
1076
+ "learning_rate": 7.05244122965642e-06,
1077
+ "loss": 0.9602,
1078
+ "mean_token_accuracy": 0.7577574133872986,
1079
  "num_tokens": 556932.0,
1080
  "step": 535
1081
  },
1082
  {
1083
+ "entropy": 1.0035830855369567,
1084
  "epoch": 0.9424083769633508,
1085
+ "grad_norm": 0.6870363354682922,
1086
+ "learning_rate": 6.148282097649186e-06,
1087
+ "loss": 0.9922,
1088
+ "mean_token_accuracy": 0.753506338596344,
1089
  "num_tokens": 562608.0,
1090
  "step": 540
1091
  },
1092
  {
1093
+ "entropy": 1.0436702132225038,
1094
  "epoch": 0.9511343804537522,
1095
+ "grad_norm": 0.8812151551246643,
1096
+ "learning_rate": 5.244122965641953e-06,
1097
+ "loss": 0.9626,
1098
+ "mean_token_accuracy": 0.7489932656288147,
1099
  "num_tokens": 568591.0,
1100
  "step": 545
1101
  },
1102
  {
1103
+ "entropy": 1.123832467198372,
1104
  "epoch": 0.9598603839441536,
1105
+ "grad_norm": 0.9656490683555603,
1106
+ "learning_rate": 4.33996383363472e-06,
1107
+ "loss": 1.2141,
1108
+ "mean_token_accuracy": 0.7377211570739746,
1109
  "num_tokens": 572932.0,
1110
  "step": 550
1111
  },
1112
  {
1113
+ "entropy": 0.9230763018131256,
1114
  "epoch": 0.9685863874345549,
1115
+ "grad_norm": 0.8448044657707214,
1116
+ "learning_rate": 3.435804701627487e-06,
1117
+ "loss": 0.8004,
1118
+ "mean_token_accuracy": 0.7740457057952881,
1119
  "num_tokens": 577818.0,
1120
  "step": 555
1121
  },
1122
  {
1123
+ "entropy": 0.9738658726215362,
1124
  "epoch": 0.9773123909249564,
1125
+ "grad_norm": 1.040029764175415,
1126
+ "learning_rate": 2.531645569620253e-06,
1127
+ "loss": 0.7951,
1128
+ "mean_token_accuracy": 0.7707339942455291,
1129
  "num_tokens": 581899.0,
1130
  "step": 560
1131
  },
1132
  {
1133
+ "entropy": 0.9029529750347137,
1134
  "epoch": 0.9860383944153578,
1135
+ "grad_norm": 0.777828574180603,
1136
+ "learning_rate": 1.62748643761302e-06,
1137
+ "loss": 0.8196,
1138
+ "mean_token_accuracy": 0.7824884414672851,
1139
  "num_tokens": 587864.0,
1140
  "step": 565
1141
  },
1142
  {
1143
+ "entropy": 1.0749013006687165,
1144
  "epoch": 0.9947643979057592,
1145
+ "grad_norm": 0.9190181493759155,
1146
+ "learning_rate": 7.233273056057866e-07,
1147
+ "loss": 1.0676,
1148
+ "mean_token_accuracy": 0.7512221932411194,
1149
  "num_tokens": 592990.0,
1150
  "step": 570
1151
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d49ed87fd1007ddba65a781a7a824d4db6222aa26b1008b2e988302b8cec8fab
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4982b9ed04996dd5f6e1133823637f4dc00aad549bfd88090393c4ca029c70b
3
  size 5816