nt-van-khanh commited on
Commit
56515d4
·
verified ·
1 Parent(s): 678ecec

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "down_proj",
33
- "q_proj",
34
- "up_proj",
35
  "v_proj",
36
- "k_proj",
 
 
37
  "o_proj",
38
- "gate_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "v_proj",
33
+ "gate_proj",
34
+ "up_proj",
35
+ "down_proj",
36
  "o_proj",
37
+ "k_proj",
38
+ "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd2d08d826340d149aa5513b4dd0539e973bdec29c23d4a91364b7126c19c316
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0656ea6a9b2b36d5759ece47a5b3eff68d20b48c1112eadf5ef4c6950d2cf55
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5c3f504088aead5b8c944df4ed47fcaef2fbcdf4678a6a6457ccbafe3f7bdcb
3
  size 85733654
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e98ec00b3d2f7e34b8479ef6eafbc1e1fa5efe3fb6e5d385f146df37257d394
3
  size 85733654
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69b9dab5290c195fdacbf7884c40eeca52cafe3b7ddc5ee2280f8f048155a5ab
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07d2f57764cc2ea1a146a33cfaedbb1294d4dfbe7253eb0435c96b286b35141a
3
  size 14244
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.33641715727502103,
6
  "eval_steps": 179,
7
  "global_step": 300,
8
  "is_hyper_param_search": false,
@@ -10,603 +10,603 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.696909672021866,
14
- "epoch": 0.005606952621250351,
15
- "grad_norm": 0.7663310170173645,
16
  "learning_rate": 0.00016,
17
- "loss": 2.1596,
18
- "mean_token_accuracy": 0.5812449663877487,
19
- "num_tokens": 8218.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 1.875484037399292,
24
- "epoch": 0.011213905242500702,
25
- "grad_norm": 0.8613530397415161,
26
  "learning_rate": 0.00019909808342728297,
27
- "loss": 1.6298,
28
- "mean_token_accuracy": 0.6346893429756164,
29
- "num_tokens": 19584.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.6492740571498872,
34
- "epoch": 0.01682085786375105,
35
- "grad_norm": 0.8438306450843811,
36
  "learning_rate": 0.0001979706877113867,
37
- "loss": 1.448,
38
- "mean_token_accuracy": 0.667498791217804,
39
- "num_tokens": 28392.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.5424207150936127,
44
- "epoch": 0.022427810485001403,
45
- "grad_norm": 0.5755366086959839,
46
  "learning_rate": 0.00019684329199549043,
47
- "loss": 1.5714,
48
- "mean_token_accuracy": 0.6525402277708053,
49
- "num_tokens": 40558.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.6563467800617218,
54
- "epoch": 0.02803476310625175,
55
- "grad_norm": 0.640796422958374,
56
  "learning_rate": 0.00019571589627959414,
57
- "loss": 1.4843,
58
- "mean_token_accuracy": 0.6737909287214279,
59
- "num_tokens": 50005.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.477371919155121,
64
- "epoch": 0.0336417157275021,
65
- "grad_norm": 0.7493678331375122,
66
  "learning_rate": 0.00019458850056369787,
67
- "loss": 1.3474,
68
- "mean_token_accuracy": 0.6935703039169312,
69
- "num_tokens": 58738.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.464887660741806,
74
- "epoch": 0.03924866834875245,
75
- "grad_norm": 0.6396933794021606,
76
  "learning_rate": 0.00019346110484780158,
77
- "loss": 1.3963,
78
- "mean_token_accuracy": 0.6689792603254319,
79
- "num_tokens": 68528.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.4757700502872466,
84
- "epoch": 0.044855620970002806,
85
- "grad_norm": 0.5516763925552368,
86
  "learning_rate": 0.0001923337091319053,
87
- "loss": 1.4236,
88
- "mean_token_accuracy": 0.6675747632980347,
89
- "num_tokens": 77986.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.4118095993995667,
94
- "epoch": 0.050462573591253154,
95
- "grad_norm": 0.6395580172538757,
96
  "learning_rate": 0.00019120631341600902,
97
- "loss": 1.2766,
98
- "mean_token_accuracy": 0.6935016334056854,
99
- "num_tokens": 86893.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.4825987100601197,
104
- "epoch": 0.0560695262125035,
105
- "grad_norm": 0.7649742960929871,
106
  "learning_rate": 0.00019007891770011275,
107
- "loss": 1.4255,
108
- "mean_token_accuracy": 0.6750761657953263,
109
- "num_tokens": 95692.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.3713403642177582,
114
- "epoch": 0.06167647883375386,
115
- "grad_norm": 0.6055657863616943,
116
  "learning_rate": 0.00018895152198421646,
117
- "loss": 1.2919,
118
- "mean_token_accuracy": 0.6923940628767014,
119
- "num_tokens": 104810.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.3107981920242309,
124
- "epoch": 0.0672834314550042,
125
- "grad_norm": 0.932307243347168,
126
  "learning_rate": 0.0001878241262683202,
127
- "loss": 1.2072,
128
- "mean_token_accuracy": 0.7068106323480606,
129
- "num_tokens": 112767.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.290432232618332,
134
- "epoch": 0.07289038407625456,
135
- "grad_norm": 0.657538115978241,
136
  "learning_rate": 0.00018669673055242392,
137
- "loss": 1.1683,
138
- "mean_token_accuracy": 0.714971786737442,
139
- "num_tokens": 122104.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.3310194253921508,
144
- "epoch": 0.0784973366975049,
145
- "grad_norm": 0.5447025299072266,
146
  "learning_rate": 0.00018556933483652763,
147
- "loss": 1.3566,
148
- "mean_token_accuracy": 0.6949202805757523,
149
- "num_tokens": 132347.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.3567017048597336,
154
- "epoch": 0.08410428931875526,
155
- "grad_norm": 0.6126067042350769,
156
  "learning_rate": 0.00018444193912063134,
157
- "loss": 1.2616,
158
- "mean_token_accuracy": 0.6886427521705627,
159
- "num_tokens": 140294.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.3231679052114487,
164
- "epoch": 0.08971124194000561,
165
- "grad_norm": 0.5827459096908569,
166
  "learning_rate": 0.00018331454340473507,
167
- "loss": 1.2312,
168
- "mean_token_accuracy": 0.6998110383749008,
169
- "num_tokens": 149796.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.3795920431613922,
174
- "epoch": 0.09531819456125595,
175
- "grad_norm": 0.6522558331489563,
176
  "learning_rate": 0.0001821871476888388,
177
- "loss": 1.3163,
178
- "mean_token_accuracy": 0.6797463029623032,
179
- "num_tokens": 160094.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.4354715049266815,
184
- "epoch": 0.10092514718250631,
185
- "grad_norm": 0.5437538623809814,
186
  "learning_rate": 0.0001810597519729425,
187
- "loss": 1.4219,
188
- "mean_token_accuracy": 0.6741667121648789,
189
- "num_tokens": 167520.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.3719047516584397,
194
- "epoch": 0.10653209980375666,
195
- "grad_norm": 0.6490810513496399,
196
  "learning_rate": 0.00017993235625704624,
197
- "loss": 1.3259,
198
- "mean_token_accuracy": 0.69256811439991,
199
- "num_tokens": 177631.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.381699651479721,
204
- "epoch": 0.112139052425007,
205
- "grad_norm": 0.6738480925559998,
206
  "learning_rate": 0.00017880496054114995,
207
- "loss": 1.3181,
208
- "mean_token_accuracy": 0.696441325545311,
209
- "num_tokens": 186948.0,
210
  "step": 100
211
  },
212
  {
213
- "entropy": 1.271216405928135,
214
- "epoch": 0.11774600504625736,
215
- "grad_norm": 1.1620361804962158,
216
  "learning_rate": 0.00017767756482525365,
217
- "loss": 1.2414,
218
- "mean_token_accuracy": 0.7000553667545318,
219
- "num_tokens": 196406.0,
220
  "step": 105
221
  },
222
  {
223
- "entropy": 1.3180717766284942,
224
- "epoch": 0.12335295766750771,
225
- "grad_norm": 0.5142589211463928,
226
  "learning_rate": 0.0001765501691093574,
227
- "loss": 1.2179,
228
- "mean_token_accuracy": 0.7069388717412949,
229
- "num_tokens": 207134.0,
230
  "step": 110
231
  },
232
  {
233
- "entropy": 1.3922697693109511,
234
- "epoch": 0.12895991028875806,
235
- "grad_norm": 0.5426948666572571,
236
  "learning_rate": 0.00017542277339346112,
237
- "loss": 1.3128,
238
- "mean_token_accuracy": 0.6875284522771835,
239
- "num_tokens": 218970.0,
240
  "step": 115
241
  },
242
  {
243
- "entropy": 1.3745603621006013,
244
- "epoch": 0.1345668629100084,
245
- "grad_norm": 0.6224590539932251,
246
  "learning_rate": 0.00017429537767756482,
247
- "loss": 1.3472,
248
- "mean_token_accuracy": 0.6812066495418548,
249
- "num_tokens": 227951.0,
250
  "step": 120
251
  },
252
  {
253
- "entropy": 1.2305602520704269,
254
- "epoch": 0.14017381553125877,
255
- "grad_norm": 0.6087414026260376,
256
  "learning_rate": 0.00017316798196166856,
257
- "loss": 1.1658,
258
- "mean_token_accuracy": 0.7141091674566269,
259
- "num_tokens": 237592.0,
260
  "step": 125
261
  },
262
  {
263
- "entropy": 1.138188686966896,
264
- "epoch": 0.14578076815250912,
265
- "grad_norm": 0.5676659345626831,
266
  "learning_rate": 0.0001720405862457723,
267
- "loss": 1.1253,
268
- "mean_token_accuracy": 0.7303309470415116,
269
- "num_tokens": 246718.0,
270
  "step": 130
271
  },
272
  {
273
- "entropy": 1.390087616443634,
274
- "epoch": 0.15138772077375945,
275
- "grad_norm": 0.5323399901390076,
276
  "learning_rate": 0.000170913190529876,
277
- "loss": 1.3956,
278
- "mean_token_accuracy": 0.6867286145687104,
279
- "num_tokens": 255586.0,
280
  "step": 135
281
  },
282
  {
283
- "entropy": 1.2312346011400224,
284
- "epoch": 0.1569946733950098,
285
- "grad_norm": 0.6513665318489075,
286
  "learning_rate": 0.0001697857948139797,
287
- "loss": 1.2024,
288
- "mean_token_accuracy": 0.7194372028112411,
289
- "num_tokens": 264356.0,
290
  "step": 140
291
  },
292
  {
293
- "entropy": 1.1984657406806947,
294
- "epoch": 0.16260162601626016,
295
- "grad_norm": 0.6147997379302979,
296
  "learning_rate": 0.00016865839909808344,
297
- "loss": 1.2174,
298
- "mean_token_accuracy": 0.7164656221866608,
299
- "num_tokens": 272889.0,
300
  "step": 145
301
  },
302
  {
303
- "entropy": 1.2961942851543427,
304
- "epoch": 0.16820857863751051,
305
- "grad_norm": 0.6134310364723206,
306
  "learning_rate": 0.00016753100338218714,
307
- "loss": 1.2279,
308
- "mean_token_accuracy": 0.7087068349123001,
309
- "num_tokens": 282106.0,
310
  "step": 150
311
  },
312
  {
313
- "entropy": 1.2217833191156386,
314
- "epoch": 0.17381553125876087,
315
- "grad_norm": 0.6766023635864258,
316
  "learning_rate": 0.00016640360766629087,
317
- "loss": 1.2188,
318
- "mean_token_accuracy": 0.7227874040603638,
319
- "num_tokens": 291923.0,
320
  "step": 155
321
  },
322
  {
323
- "entropy": 1.2688794553279876,
324
- "epoch": 0.17942248388001122,
325
- "grad_norm": 0.666310727596283,
326
  "learning_rate": 0.0001652762119503946,
327
- "loss": 1.2627,
328
- "mean_token_accuracy": 0.7072140723466873,
329
- "num_tokens": 299752.0,
330
  "step": 160
331
  },
332
  {
333
- "entropy": 1.3040128737688064,
334
- "epoch": 0.18502943650126155,
335
- "grad_norm": 0.581291913986206,
336
  "learning_rate": 0.0001641488162344983,
337
- "loss": 1.2133,
338
- "mean_token_accuracy": 0.6970360308885575,
339
- "num_tokens": 308737.0,
340
  "step": 165
341
  },
342
  {
343
- "entropy": 1.0922872066497802,
344
- "epoch": 0.1906363891225119,
345
- "grad_norm": 0.6826110482215881,
346
  "learning_rate": 0.00016302142051860202,
347
- "loss": 1.0315,
348
- "mean_token_accuracy": 0.7402381807565689,
349
- "num_tokens": 315852.0,
350
  "step": 170
351
  },
352
  {
353
- "entropy": 1.184697662293911,
354
- "epoch": 0.19624334174376226,
355
- "grad_norm": 0.670078694820404,
356
  "learning_rate": 0.00016189402480270578,
357
- "loss": 1.1541,
358
- "mean_token_accuracy": 0.7335967868566513,
359
- "num_tokens": 324430.0,
360
  "step": 175
361
  },
362
  {
363
- "entropy": 1.2350615233182907,
364
- "epoch": 0.20185029436501262,
365
- "grad_norm": 0.5114791989326477,
366
  "learning_rate": 0.00016076662908680949,
367
- "loss": 1.1825,
368
- "mean_token_accuracy": 0.7157844036817551,
369
- "num_tokens": 331843.0,
370
  "step": 180
371
  },
372
  {
373
- "entropy": 1.1912701576948166,
374
- "epoch": 0.20745724698626297,
375
- "grad_norm": 0.668006420135498,
376
  "learning_rate": 0.0001596392333709132,
377
- "loss": 1.1067,
378
- "mean_token_accuracy": 0.7309438616037369,
379
- "num_tokens": 340933.0,
380
  "step": 185
381
  },
382
  {
383
- "entropy": 1.0719711840152741,
384
- "epoch": 0.21306419960751333,
385
- "grad_norm": 0.5813568234443665,
386
  "learning_rate": 0.00015851183765501692,
387
- "loss": 1.0212,
388
- "mean_token_accuracy": 0.7530986189842224,
389
- "num_tokens": 349083.0,
390
  "step": 190
391
  },
392
  {
393
- "entropy": 1.3020119816064835,
394
- "epoch": 0.21867115222876365,
395
- "grad_norm": 0.6488296985626221,
396
  "learning_rate": 0.00015738444193912063,
397
- "loss": 1.2975,
398
- "mean_token_accuracy": 0.6928794890642166,
399
- "num_tokens": 358133.0,
400
  "step": 195
401
  },
402
  {
403
- "entropy": 1.1875290542840957,
404
- "epoch": 0.224278104850014,
405
- "grad_norm": 0.6314829587936401,
406
  "learning_rate": 0.00015625704622322436,
407
- "loss": 1.2064,
408
- "mean_token_accuracy": 0.7248553454875946,
409
- "num_tokens": 367701.0,
410
  "step": 200
411
  },
412
  {
413
- "entropy": 1.2919130593538284,
414
- "epoch": 0.22988505747126436,
415
- "grad_norm": 0.5402503609657288,
416
  "learning_rate": 0.0001551296505073281,
417
- "loss": 1.2512,
418
- "mean_token_accuracy": 0.7068570107221603,
419
- "num_tokens": 376039.0,
420
  "step": 205
421
  },
422
  {
423
- "entropy": 1.2941559731960297,
424
- "epoch": 0.23549201009251472,
425
- "grad_norm": 0.5794088244438171,
426
  "learning_rate": 0.0001540022547914318,
427
- "loss": 1.2339,
428
- "mean_token_accuracy": 0.7108895808458329,
429
- "num_tokens": 384327.0,
430
  "step": 210
431
  },
432
  {
433
- "entropy": 1.1999757021665574,
434
- "epoch": 0.24109896271376507,
435
- "grad_norm": 0.5067386627197266,
436
  "learning_rate": 0.0001528748590755355,
437
- "loss": 1.1892,
438
- "mean_token_accuracy": 0.713716721534729,
439
- "num_tokens": 394548.0,
440
  "step": 215
441
  },
442
  {
443
- "entropy": 1.1508339285850524,
444
- "epoch": 0.24670591533501543,
445
- "grad_norm": 0.7008835077285767,
446
  "learning_rate": 0.00015174746335963924,
447
- "loss": 1.1226,
448
- "mean_token_accuracy": 0.7230166435241699,
449
- "num_tokens": 403522.0,
450
  "step": 220
451
  },
452
  {
453
- "entropy": 1.196462707221508,
454
- "epoch": 0.2523128679562658,
455
- "grad_norm": 0.5998469591140747,
456
  "learning_rate": 0.00015062006764374297,
457
- "loss": 1.2183,
458
- "mean_token_accuracy": 0.7174128830432892,
459
- "num_tokens": 414211.0,
460
  "step": 225
461
  },
462
  {
463
- "entropy": 1.044454263150692,
464
- "epoch": 0.2579198205775161,
465
- "grad_norm": 0.5503870844841003,
466
  "learning_rate": 0.00014949267192784668,
467
- "loss": 0.9978,
468
- "mean_token_accuracy": 0.7547496408224106,
469
- "num_tokens": 424482.0,
470
  "step": 230
471
  },
472
  {
473
- "entropy": 1.2794962912797927,
474
- "epoch": 0.2635267731987665,
475
- "grad_norm": 0.5634020566940308,
476
  "learning_rate": 0.00014836527621195039,
477
- "loss": 1.2497,
478
- "mean_token_accuracy": 0.701577228307724,
479
- "num_tokens": 433911.0,
480
  "step": 235
481
  },
482
  {
483
- "entropy": 1.2079395592212676,
484
- "epoch": 0.2691337258200168,
485
- "grad_norm": 0.6672863364219666,
486
  "learning_rate": 0.00014723788049605412,
487
- "loss": 1.185,
488
- "mean_token_accuracy": 0.717086723446846,
489
- "num_tokens": 442974.0,
490
  "step": 240
491
  },
492
  {
493
- "entropy": 1.1594905465841294,
494
- "epoch": 0.27474067844126715,
495
- "grad_norm": 0.6265320181846619,
496
  "learning_rate": 0.00014611048478015785,
497
- "loss": 1.0919,
498
- "mean_token_accuracy": 0.7310166716575622,
499
- "num_tokens": 450850.0,
500
  "step": 245
501
  },
502
  {
503
- "entropy": 1.244988052546978,
504
- "epoch": 0.28034763106251753,
505
- "grad_norm": 0.5000255703926086,
506
  "learning_rate": 0.00014498308906426156,
507
- "loss": 1.2546,
508
- "mean_token_accuracy": 0.7129500776529312,
509
- "num_tokens": 461926.0,
510
  "step": 250
511
  },
512
  {
513
- "entropy": 1.0460072100162505,
514
- "epoch": 0.28595458368376786,
515
- "grad_norm": 0.6440132260322571,
516
  "learning_rate": 0.0001438556933483653,
517
- "loss": 1.0097,
518
- "mean_token_accuracy": 0.7434480965137482,
519
- "num_tokens": 469686.0,
520
  "step": 255
521
  },
522
  {
523
- "entropy": 1.1157130993902684,
524
- "epoch": 0.29156153630501824,
525
- "grad_norm": 0.6023146510124207,
526
  "learning_rate": 0.000142728297632469,
527
- "loss": 1.0708,
528
- "mean_token_accuracy": 0.7285511642694473,
529
- "num_tokens": 479081.0,
530
  "step": 260
531
  },
532
  {
533
- "entropy": 1.2074461445212363,
534
- "epoch": 0.29716848892626857,
535
- "grad_norm": 0.6504403948783875,
536
  "learning_rate": 0.0001416009019165727,
537
- "loss": 1.2055,
538
- "mean_token_accuracy": 0.7167613714933395,
539
- "num_tokens": 488026.0,
540
  "step": 265
541
  },
542
  {
543
- "entropy": 1.1629670545458795,
544
- "epoch": 0.3027754415475189,
545
- "grad_norm": 0.7767821550369263,
546
  "learning_rate": 0.00014047350620067646,
547
- "loss": 1.0925,
548
- "mean_token_accuracy": 0.7313909947872161,
549
- "num_tokens": 496200.0,
550
  "step": 270
551
  },
552
  {
553
- "entropy": 1.0763523548841476,
554
- "epoch": 0.3083823941687693,
555
- "grad_norm": 0.54071444272995,
556
  "learning_rate": 0.00013934611048478017,
557
- "loss": 1.006,
558
- "mean_token_accuracy": 0.7479322016239166,
559
- "num_tokens": 506000.0,
560
  "step": 275
561
  },
562
  {
563
- "entropy": 1.2247862741351128,
564
- "epoch": 0.3139893467900196,
565
- "grad_norm": 0.5242252945899963,
566
  "learning_rate": 0.00013821871476888388,
567
- "loss": 1.2591,
568
- "mean_token_accuracy": 0.7159645825624465,
569
- "num_tokens": 515345.0,
570
  "step": 280
571
  },
572
  {
573
- "entropy": 1.3475455969572068,
574
- "epoch": 0.31959629941127,
575
- "grad_norm": 0.5631939768791199,
576
  "learning_rate": 0.0001370913190529876,
577
- "loss": 1.3375,
578
- "mean_token_accuracy": 0.6826408416032791,
579
- "num_tokens": 524644.0,
580
  "step": 285
581
  },
582
  {
583
- "entropy": 1.0502676755189895,
584
- "epoch": 0.3252032520325203,
585
- "grad_norm": 0.611739993095398,
586
  "learning_rate": 0.00013596392333709131,
587
- "loss": 0.9899,
588
- "mean_token_accuracy": 0.7530880838632583,
589
- "num_tokens": 532559.0,
590
  "step": 290
591
  },
592
  {
593
- "entropy": 1.1604034945368766,
594
- "epoch": 0.3308102046537707,
595
- "grad_norm": 0.4446961283683777,
596
  "learning_rate": 0.00013483652762119505,
597
- "loss": 1.1227,
598
- "mean_token_accuracy": 0.7217718571424484,
599
- "num_tokens": 544302.0,
600
  "step": 295
601
  },
602
  {
603
- "entropy": 1.1056245781481266,
604
- "epoch": 0.33641715727502103,
605
- "grad_norm": 0.8371356725692749,
606
  "learning_rate": 0.00013370913190529878,
607
- "loss": 1.1266,
608
- "mean_token_accuracy": 0.7354450315237046,
609
- "num_tokens": 553700.0,
610
  "step": 300
611
  }
612
  ],
@@ -627,7 +627,7 @@
627
  "attributes": {}
628
  }
629
  },
630
- "total_flos": 2.50722039128064e+16,
631
  "train_batch_size": 1,
632
  "trial_name": null,
633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.33651149747616377,
6
  "eval_steps": 179,
7
  "global_step": 300,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.5857114374637604,
14
+ "epoch": 0.005608524957936063,
15
+ "grad_norm": 0.6155414581298828,
16
  "learning_rate": 0.00016,
17
+ "loss": 1.9706,
18
+ "mean_token_accuracy": 0.6006834208965302,
19
+ "num_tokens": 8800.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.9297356605529785,
24
+ "epoch": 0.011217049915872126,
25
+ "grad_norm": 0.7395716905593872,
26
  "learning_rate": 0.00019909808342728297,
27
+ "loss": 1.7721,
28
+ "mean_token_accuracy": 0.6421857982873916,
29
+ "num_tokens": 17002.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.619718110561371,
34
+ "epoch": 0.01682557487380819,
35
+ "grad_norm": 0.6860098838806152,
36
  "learning_rate": 0.0001979706877113867,
37
+ "loss": 1.4967,
38
+ "mean_token_accuracy": 0.6729415714740753,
39
+ "num_tokens": 25283.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.5023219525814056,
44
+ "epoch": 0.022434099831744252,
45
+ "grad_norm": 0.5842112898826599,
46
  "learning_rate": 0.00019684329199549043,
47
+ "loss": 1.4969,
48
+ "mean_token_accuracy": 0.6710207283496856,
49
+ "num_tokens": 33505.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.6429471731185914,
54
+ "epoch": 0.028042624789680313,
55
+ "grad_norm": 0.5911830067634583,
56
  "learning_rate": 0.00019571589627959414,
57
+ "loss": 1.5174,
58
+ "mean_token_accuracy": 0.6597635358572006,
59
+ "num_tokens": 43397.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.6084780812263488,
64
+ "epoch": 0.03365114974761638,
65
+ "grad_norm": 0.7406187653541565,
66
  "learning_rate": 0.00019458850056369787,
67
+ "loss": 1.4757,
68
+ "mean_token_accuracy": 0.6691249191761017,
69
+ "num_tokens": 52556.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.4238544702529907,
74
+ "epoch": 0.03925967470555244,
75
+ "grad_norm": 0.611213743686676,
76
  "learning_rate": 0.00019346110484780158,
77
+ "loss": 1.4085,
78
+ "mean_token_accuracy": 0.692791685461998,
79
+ "num_tokens": 61579.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.3825733065605164,
84
+ "epoch": 0.044868199663488505,
85
+ "grad_norm": 0.6608020663261414,
86
  "learning_rate": 0.0001923337091319053,
87
+ "loss": 1.3955,
88
+ "mean_token_accuracy": 0.6937497437000275,
89
+ "num_tokens": 68479.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.4096888184547425,
94
+ "epoch": 0.050476724621424565,
95
+ "grad_norm": 0.5221259593963623,
96
  "learning_rate": 0.00019120631341600902,
97
+ "loss": 1.2979,
98
+ "mean_token_accuracy": 0.6925529271364212,
99
+ "num_tokens": 77911.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.3391252905130386,
104
+ "epoch": 0.056085249579360626,
105
+ "grad_norm": 0.6178808212280273,
106
  "learning_rate": 0.00019007891770011275,
107
+ "loss": 1.3092,
108
+ "mean_token_accuracy": 0.704131829738617,
109
+ "num_tokens": 86382.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.3084194093942643,
114
+ "epoch": 0.06169377453729669,
115
+ "grad_norm": 0.570563554763794,
116
  "learning_rate": 0.00018895152198421646,
117
+ "loss": 1.228,
118
+ "mean_token_accuracy": 0.6969290852546692,
119
+ "num_tokens": 94306.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.418030035495758,
124
+ "epoch": 0.06730229949523275,
125
+ "grad_norm": 0.6073914766311646,
126
  "learning_rate": 0.0001878241262683202,
127
+ "loss": 1.3252,
128
+ "mean_token_accuracy": 0.6798107504844666,
129
+ "num_tokens": 103567.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.5420262813568115,
134
+ "epoch": 0.07291082445316882,
135
+ "grad_norm": 0.4949992001056671,
136
  "learning_rate": 0.00018669673055242392,
137
+ "loss": 1.4262,
138
+ "mean_token_accuracy": 0.6678558409214019,
139
+ "num_tokens": 113923.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.3928685992956162,
144
+ "epoch": 0.07851934941110487,
145
+ "grad_norm": 0.5758721828460693,
146
  "learning_rate": 0.00018556933483652763,
147
+ "loss": 1.3512,
148
+ "mean_token_accuracy": 0.6777496755123138,
149
+ "num_tokens": 126199.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.336549162864685,
154
+ "epoch": 0.08412787436904094,
155
+ "grad_norm": 0.678063154220581,
156
  "learning_rate": 0.00018444193912063134,
157
+ "loss": 1.2387,
158
+ "mean_token_accuracy": 0.6992105931043625,
159
+ "num_tokens": 135737.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.5632237881422042,
164
+ "epoch": 0.08973639932697701,
165
+ "grad_norm": 0.5325204730033875,
166
  "learning_rate": 0.00018331454340473507,
167
+ "loss": 1.479,
168
+ "mean_token_accuracy": 0.6516371637582778,
169
+ "num_tokens": 146847.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.4212194442749024,
174
+ "epoch": 0.09534492428491306,
175
+ "grad_norm": 0.8020451664924622,
176
  "learning_rate": 0.0001821871476888388,
177
+ "loss": 1.3261,
178
+ "mean_token_accuracy": 0.6801748961210251,
179
+ "num_tokens": 154755.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.2850608110427857,
184
+ "epoch": 0.10095344924284913,
185
+ "grad_norm": 0.9955788254737854,
186
  "learning_rate": 0.0001810597519729425,
187
+ "loss": 1.1832,
188
+ "mean_token_accuracy": 0.7192482769489288,
189
+ "num_tokens": 162857.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.24569151699543,
194
+ "epoch": 0.1065619742007852,
195
+ "grad_norm": 0.6132731437683105,
196
  "learning_rate": 0.00017993235625704624,
197
+ "loss": 1.1905,
198
+ "mean_token_accuracy": 0.7139606773853302,
199
+ "num_tokens": 171381.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.3500551611185074,
204
+ "epoch": 0.11217049915872125,
205
+ "grad_norm": 0.604263186454773,
206
  "learning_rate": 0.00017880496054114995,
207
+ "loss": 1.3683,
208
+ "mean_token_accuracy": 0.6883647471666337,
209
+ "num_tokens": 179672.0,
210
  "step": 100
211
  },
212
  {
213
+ "entropy": 1.4277629852294922,
214
+ "epoch": 0.11777902411665732,
215
+ "grad_norm": 0.5616147518157959,
216
  "learning_rate": 0.00017767756482525365,
217
+ "loss": 1.3609,
218
+ "mean_token_accuracy": 0.6817101955413818,
219
+ "num_tokens": 190223.0,
220
  "step": 105
221
  },
222
  {
223
+ "entropy": 1.3598074555397033,
224
+ "epoch": 0.12338754907459339,
225
+ "grad_norm": 0.49820777773857117,
226
  "learning_rate": 0.0001765501691093574,
227
+ "loss": 1.3432,
228
+ "mean_token_accuracy": 0.6918457806110382,
229
+ "num_tokens": 200003.0,
230
  "step": 110
231
  },
232
  {
233
+ "entropy": 1.3189748495817184,
234
+ "epoch": 0.12899607403252944,
235
+ "grad_norm": 0.5357916951179504,
236
  "learning_rate": 0.00017542277339346112,
237
+ "loss": 1.296,
238
+ "mean_token_accuracy": 0.6899977535009384,
239
+ "num_tokens": 210284.0,
240
  "step": 115
241
  },
242
  {
243
+ "entropy": 1.2477733016014099,
244
+ "epoch": 0.1346045989904655,
245
+ "grad_norm": 0.6869284510612488,
246
  "learning_rate": 0.00017429537767756482,
247
+ "loss": 1.1922,
248
+ "mean_token_accuracy": 0.7168257743120193,
249
+ "num_tokens": 220374.0,
250
  "step": 120
251
  },
252
  {
253
+ "entropy": 1.30469251871109,
254
+ "epoch": 0.14021312394840157,
255
+ "grad_norm": 0.6618072986602783,
256
  "learning_rate": 0.00017316798196166856,
257
+ "loss": 1.1881,
258
+ "mean_token_accuracy": 0.7009620904922486,
259
+ "num_tokens": 230717.0,
260
  "step": 125
261
  },
262
  {
263
+ "entropy": 1.2923731699585914,
264
+ "epoch": 0.14582164890633764,
265
+ "grad_norm": 0.5542452931404114,
266
  "learning_rate": 0.0001720405862457723,
267
+ "loss": 1.2582,
268
+ "mean_token_accuracy": 0.7028028458356858,
269
+ "num_tokens": 240440.0,
270
  "step": 130
271
  },
272
  {
273
+ "entropy": 1.3109561294317245,
274
+ "epoch": 0.1514301738642737,
275
+ "grad_norm": 0.7251168489456177,
276
  "learning_rate": 0.000170913190529876,
277
+ "loss": 1.2316,
278
+ "mean_token_accuracy": 0.6982571691274643,
279
+ "num_tokens": 248616.0,
280
  "step": 135
281
  },
282
  {
283
+ "entropy": 1.333225554227829,
284
+ "epoch": 0.15703869882220975,
285
+ "grad_norm": 0.6222459077835083,
286
  "learning_rate": 0.0001697857948139797,
287
+ "loss": 1.2305,
288
+ "mean_token_accuracy": 0.7053877979516983,
289
+ "num_tokens": 259137.0,
290
  "step": 140
291
  },
292
  {
293
+ "entropy": 1.2306534215807914,
294
+ "epoch": 0.16264722378014582,
295
+ "grad_norm": 0.5644901394844055,
296
  "learning_rate": 0.00016865839909808344,
297
+ "loss": 1.2349,
298
+ "mean_token_accuracy": 0.7150521993637085,
299
+ "num_tokens": 270269.0,
300
  "step": 145
301
  },
302
  {
303
+ "entropy": 1.2639500305056572,
304
+ "epoch": 0.16825574873808188,
305
+ "grad_norm": 0.6914640665054321,
306
  "learning_rate": 0.00016753100338218714,
307
+ "loss": 1.2328,
308
+ "mean_token_accuracy": 0.7164399117231369,
309
+ "num_tokens": 278900.0,
310
  "step": 150
311
  },
312
  {
313
+ "entropy": 1.2492301687598228,
314
+ "epoch": 0.17386427369601795,
315
+ "grad_norm": 0.6064640879631042,
316
  "learning_rate": 0.00016640360766629087,
317
+ "loss": 1.2309,
318
+ "mean_token_accuracy": 0.7067903488874435,
319
+ "num_tokens": 288895.0,
320
  "step": 155
321
  },
322
  {
323
+ "entropy": 1.1464111924171447,
324
+ "epoch": 0.17947279865395402,
325
+ "grad_norm": 0.5932626724243164,
326
  "learning_rate": 0.0001652762119503946,
327
+ "loss": 1.1398,
328
+ "mean_token_accuracy": 0.7307830601930618,
329
+ "num_tokens": 298947.0,
330
  "step": 160
331
  },
332
  {
333
+ "entropy": 1.1858425721526147,
334
+ "epoch": 0.1850813236118901,
335
+ "grad_norm": 0.5144683122634888,
336
  "learning_rate": 0.0001641488162344983,
337
+ "loss": 1.1402,
338
+ "mean_token_accuracy": 0.730936524271965,
339
+ "num_tokens": 307660.0,
340
  "step": 165
341
  },
342
  {
343
+ "entropy": 1.3814254194498061,
344
+ "epoch": 0.19068984856982613,
345
+ "grad_norm": 0.5210261940956116,
346
  "learning_rate": 0.00016302142051860202,
347
+ "loss": 1.3247,
348
+ "mean_token_accuracy": 0.6925279140472412,
349
+ "num_tokens": 318346.0,
350
  "step": 170
351
  },
352
  {
353
+ "entropy": 1.0947081446647644,
354
+ "epoch": 0.1962983735277622,
355
+ "grad_norm": 0.5158228278160095,
356
  "learning_rate": 0.00016189402480270578,
357
+ "loss": 1.0251,
358
+ "mean_token_accuracy": 0.7401407450437546,
359
+ "num_tokens": 327924.0,
360
  "step": 175
361
  },
362
  {
363
+ "entropy": 1.1991018429398537,
364
+ "epoch": 0.20190689848569826,
365
+ "grad_norm": 0.6816720366477966,
366
  "learning_rate": 0.00016076662908680949,
367
+ "loss": 1.19,
368
+ "mean_token_accuracy": 0.7179287821054459,
369
+ "num_tokens": 337459.0,
370
  "step": 180
371
  },
372
  {
373
+ "entropy": 1.2390994131565094,
374
+ "epoch": 0.20751542344363433,
375
+ "grad_norm": 0.5661265254020691,
376
  "learning_rate": 0.0001596392333709132,
377
+ "loss": 1.1333,
378
+ "mean_token_accuracy": 0.7147542536258698,
379
+ "num_tokens": 345281.0,
380
  "step": 185
381
  },
382
  {
383
+ "entropy": 1.1696692734956742,
384
+ "epoch": 0.2131239484015704,
385
+ "grad_norm": 0.6144779324531555,
386
  "learning_rate": 0.00015851183765501692,
387
+ "loss": 1.0103,
388
+ "mean_token_accuracy": 0.7357000023126602,
389
+ "num_tokens": 352958.0,
390
  "step": 190
391
  },
392
  {
393
+ "entropy": 1.0967671677470208,
394
+ "epoch": 0.21873247335950646,
395
+ "grad_norm": 0.724777102470398,
396
  "learning_rate": 0.00015738444193912063,
397
+ "loss": 1.0985,
398
+ "mean_token_accuracy": 0.7314315021038056,
399
+ "num_tokens": 362012.0,
400
  "step": 195
401
  },
402
  {
403
+ "entropy": 1.2297322571277618,
404
+ "epoch": 0.2243409983174425,
405
+ "grad_norm": 0.5694834589958191,
406
  "learning_rate": 0.00015625704622322436,
407
+ "loss": 1.1923,
408
+ "mean_token_accuracy": 0.7140013068914414,
409
+ "num_tokens": 371661.0,
410
  "step": 200
411
  },
412
  {
413
+ "entropy": 1.1518067017197609,
414
+ "epoch": 0.22994952327537857,
415
+ "grad_norm": 0.5081655383110046,
416
  "learning_rate": 0.0001551296505073281,
417
+ "loss": 1.0934,
418
+ "mean_token_accuracy": 0.7330572694540024,
419
+ "num_tokens": 380872.0,
420
  "step": 205
421
  },
422
  {
423
+ "entropy": 1.2965759575366973,
424
+ "epoch": 0.23555804823331464,
425
+ "grad_norm": 0.4979788362979889,
426
  "learning_rate": 0.0001540022547914318,
427
+ "loss": 1.224,
428
+ "mean_token_accuracy": 0.702433243393898,
429
+ "num_tokens": 391828.0,
430
  "step": 210
431
  },
432
  {
433
+ "entropy": 1.3438188642263413,
434
+ "epoch": 0.2411665731912507,
435
+ "grad_norm": 0.5993033647537231,
436
  "learning_rate": 0.0001528748590755355,
437
+ "loss": 1.3083,
438
+ "mean_token_accuracy": 0.6839649230241776,
439
+ "num_tokens": 399939.0,
440
  "step": 215
441
  },
442
  {
443
+ "entropy": 1.18723217099905,
444
+ "epoch": 0.24677509814918677,
445
+ "grad_norm": 0.5683300495147705,
446
  "learning_rate": 0.00015174746335963924,
447
+ "loss": 1.1751,
448
+ "mean_token_accuracy": 0.7297552257776261,
449
+ "num_tokens": 408814.0,
450
  "step": 220
451
  },
452
  {
453
+ "entropy": 1.2557217076420784,
454
+ "epoch": 0.2523836231071228,
455
+ "grad_norm": 0.6166518926620483,
456
  "learning_rate": 0.00015062006764374297,
457
+ "loss": 1.2271,
458
+ "mean_token_accuracy": 0.7061844110488892,
459
+ "num_tokens": 418295.0,
460
  "step": 225
461
  },
462
  {
463
+ "entropy": 1.084635604918003,
464
+ "epoch": 0.2579921480650589,
465
+ "grad_norm": 0.5858753323554993,
466
  "learning_rate": 0.00014949267192784668,
467
+ "loss": 1.0096,
468
+ "mean_token_accuracy": 0.7489422798156739,
469
+ "num_tokens": 426560.0,
470
  "step": 230
471
  },
472
  {
473
+ "entropy": 1.2373799532651901,
474
+ "epoch": 0.26360067302299495,
475
+ "grad_norm": 0.6692758202552795,
476
  "learning_rate": 0.00014836527621195039,
477
+ "loss": 1.2283,
478
+ "mean_token_accuracy": 0.7165287554264068,
479
+ "num_tokens": 435354.0,
480
  "step": 235
481
  },
482
  {
483
+ "entropy": 1.1109624326229095,
484
+ "epoch": 0.269209197980931,
485
+ "grad_norm": 0.5951205492019653,
486
  "learning_rate": 0.00014723788049605412,
487
+ "loss": 1.0563,
488
+ "mean_token_accuracy": 0.732743826508522,
489
+ "num_tokens": 444810.0,
490
  "step": 240
491
  },
492
  {
493
+ "entropy": 1.2168930009007455,
494
+ "epoch": 0.2748177229388671,
495
+ "grad_norm": 0.4504969120025635,
496
  "learning_rate": 0.00014611048478015785,
497
+ "loss": 1.2377,
498
+ "mean_token_accuracy": 0.7143894642591476,
499
+ "num_tokens": 454278.0,
500
  "step": 245
501
  },
502
  {
503
+ "entropy": 1.123945553600788,
504
+ "epoch": 0.28042624789680315,
505
+ "grad_norm": 0.6897888779640198,
506
  "learning_rate": 0.00014498308906426156,
507
+ "loss": 1.0974,
508
+ "mean_token_accuracy": 0.7368963181972503,
509
+ "num_tokens": 462577.0,
510
  "step": 250
511
  },
512
  {
513
+ "entropy": 1.141464115679264,
514
+ "epoch": 0.2860347728547392,
515
+ "grad_norm": 0.6041523218154907,
516
  "learning_rate": 0.0001438556933483653,
517
+ "loss": 1.1427,
518
+ "mean_token_accuracy": 0.7364778339862823,
519
+ "num_tokens": 471168.0,
520
  "step": 255
521
  },
522
  {
523
+ "entropy": 1.2550072342157363,
524
+ "epoch": 0.2916432978126753,
525
+ "grad_norm": 0.7248488068580627,
526
  "learning_rate": 0.000142728297632469,
527
+ "loss": 1.2154,
528
+ "mean_token_accuracy": 0.7032849937677383,
529
+ "num_tokens": 480745.0,
530
  "step": 260
531
  },
532
  {
533
+ "entropy": 1.1180053681135178,
534
+ "epoch": 0.29725182277061135,
535
+ "grad_norm": 0.6221792697906494,
536
  "learning_rate": 0.0001416009019165727,
537
+ "loss": 1.1121,
538
+ "mean_token_accuracy": 0.7327967584133148,
539
+ "num_tokens": 489018.0,
540
  "step": 265
541
  },
542
  {
543
+ "entropy": 1.183423639833927,
544
+ "epoch": 0.3028603477285474,
545
+ "grad_norm": 0.5834987759590149,
546
  "learning_rate": 0.00014047350620067646,
547
+ "loss": 1.1682,
548
+ "mean_token_accuracy": 0.7155926108360291,
549
+ "num_tokens": 498458.0,
550
  "step": 270
551
  },
552
  {
553
+ "entropy": 1.2206986933946609,
554
+ "epoch": 0.30846887268648343,
555
+ "grad_norm": 0.6320741176605225,
556
  "learning_rate": 0.00013934611048478017,
557
+ "loss": 1.2206,
558
+ "mean_token_accuracy": 0.7106427907943725,
559
+ "num_tokens": 508278.0,
560
  "step": 275
561
  },
562
  {
563
+ "entropy": 1.3010928213596344,
564
+ "epoch": 0.3140773976444195,
565
+ "grad_norm": 0.8451023101806641,
566
  "learning_rate": 0.00013821871476888388,
567
+ "loss": 1.2283,
568
+ "mean_token_accuracy": 0.7067124038934708,
569
+ "num_tokens": 518492.0,
570
  "step": 280
571
  },
572
  {
573
+ "entropy": 1.2411514788866043,
574
+ "epoch": 0.31968592260235557,
575
+ "grad_norm": 0.5151481032371521,
576
  "learning_rate": 0.0001370913190529876,
577
+ "loss": 1.1855,
578
+ "mean_token_accuracy": 0.7306610077619553,
579
+ "num_tokens": 528203.0,
580
  "step": 285
581
  },
582
  {
583
+ "entropy": 1.094706454873085,
584
+ "epoch": 0.32529444756029163,
585
+ "grad_norm": 0.48817235231399536,
586
  "learning_rate": 0.00013596392333709131,
587
+ "loss": 1.1335,
588
+ "mean_token_accuracy": 0.7377402186393738,
589
+ "num_tokens": 538068.0,
590
  "step": 290
591
  },
592
  {
593
+ "entropy": 1.0780605375766754,
594
+ "epoch": 0.3309029725182277,
595
+ "grad_norm": 0.5559823513031006,
596
  "learning_rate": 0.00013483652762119505,
597
+ "loss": 1.1127,
598
+ "mean_token_accuracy": 0.7499582827091217,
599
+ "num_tokens": 545879.0,
600
  "step": 295
601
  },
602
  {
603
+ "entropy": 1.3355800449848174,
604
+ "epoch": 0.33651149747616377,
605
+ "grad_norm": 0.4629262685775757,
606
  "learning_rate": 0.00013370913190529878,
607
+ "loss": 1.318,
608
+ "mean_token_accuracy": 0.6883445054292678,
609
+ "num_tokens": 555761.0,
610
  "step": 300
611
  }
612
  ],
 
627
  "attributes": {}
628
  }
629
  },
630
+ "total_flos": 2.516552847893299e+16,
631
  "train_batch_size": 1,
632
  "trial_name": null,
633
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79d41fa02013525705cb7a82d4f608a53737fdbc7baa1d76305c242ebd4e870e
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75cf331c13a33e1598e45e9013486c4013f3af1f377da7304e21a0d1c22d72cb
3
  size 5816