robertou2 commited on
Commit
5e17005
·
verified ·
1 Parent(s): 09af9dc

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,24 +12,24 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 256,
16
  "lora_bias": false,
17
- "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 128,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "o_proj",
27
- "down_proj",
28
  "k_proj",
29
- "q_proj",
30
  "gate_proj",
31
- "v_proj",
32
- "up_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 96,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 48,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "k_proj",
 
27
  "gate_proj",
28
+ "up_proj",
29
+ "o_proj",
30
+ "q_proj",
31
+ "down_proj",
32
+ "v_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f67fb93d73ef689e29fced3646888b9acac71ded7ce8bdc2e47a329b3d916111
3
- size 957942768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba0b76173da997b59cb7a5e1fcd715c9862b2f1bbf3b91d8ff2fc9c798e757d
3
+ size 359270696
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74d8c8563d3fd92da4fa183b9c5a3bef0b8fabc91f3062232d31df923404a061
3
- size 1916174411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbb604a23b99a11cbf1db6886d978776df8e140f11a8a277fc8b32f968eb15d7
3
+ size 718831691
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5b517d1b8e2b0f837c8b00170b154961d4d989feba4326ac25583df7a55c57a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3d312928d0bb60518eb9856d5ab0ae1674bcb745294bf27f615cb6d07b0463e
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c4e44404b58ce3af1b46c3d4a85a59edbbc386f340c476e894715a1199e1aed
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5418e0fc9a3a6a50ea3a7b440dfb8b2fa26686b28c8f28256150a09922035962
3
  size 1465
trainer_state.json CHANGED
@@ -11,702 +11,702 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.05063291139240506,
14
- "grad_norm": 59.950523376464844,
15
  "learning_rate": 0.0,
16
  "loss": 3.0474,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.10126582278481013,
21
- "grad_norm": 97.50016021728516,
22
- "learning_rate": 3.3333333333333333e-06,
23
  "loss": 3.2925,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1518987341772152,
28
- "grad_norm": 19.85240364074707,
29
- "learning_rate": 6.666666666666667e-06,
30
- "loss": 3.0614,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.20253164556962025,
35
- "grad_norm": 12.535090446472168,
36
- "learning_rate": 1e-05,
37
- "loss": 2.6279,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.25316455696202533,
42
- "grad_norm": 7.785061359405518,
43
- "learning_rate": 1.3333333333333333e-05,
44
- "loss": 2.1835,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3037974683544304,
49
- "grad_norm": 11.934365272521973,
50
- "learning_rate": 1.6666666666666667e-05,
51
- "loss": 2.769,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.35443037974683544,
56
- "grad_norm": 14.708230972290039,
57
- "learning_rate": 2e-05,
58
- "loss": 2.5474,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4050632911392405,
63
- "grad_norm": 9.574464797973633,
64
- "learning_rate": 2.3333333333333336e-05,
65
- "loss": 2.3995,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.45569620253164556,
70
- "grad_norm": 10.295364379882812,
71
- "learning_rate": 2.6666666666666667e-05,
72
- "loss": 2.7588,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.5063291139240507,
77
- "grad_norm": 9.11987590789795,
78
- "learning_rate": 3e-05,
79
- "loss": 2.5851,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.5569620253164557,
84
- "grad_norm": 9.329511642456055,
85
- "learning_rate": 3.3333333333333335e-05,
86
- "loss": 2.7085,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6075949367088608,
91
- "grad_norm": 169.67454528808594,
92
- "learning_rate": 3.6666666666666666e-05,
93
- "loss": 2.7721,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.6582278481012658,
98
- "grad_norm": 10.326493263244629,
99
- "learning_rate": 4e-05,
100
- "loss": 2.5592,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7088607594936709,
105
- "grad_norm": 9.250118255615234,
106
- "learning_rate": 4.3333333333333334e-05,
107
- "loss": 2.1627,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.759493670886076,
112
- "grad_norm": 10.431126594543457,
113
- "learning_rate": 4.666666666666667e-05,
114
- "loss": 2.3183,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.810126582278481,
119
- "grad_norm": 12.514673233032227,
120
- "learning_rate": 5e-05,
121
- "loss": 2.7197,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.8607594936708861,
126
- "grad_norm": 8.918922424316406,
127
- "learning_rate": 4.998292650357558e-05,
128
- "loss": 2.482,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9113924050632911,
133
- "grad_norm": 9.795475006103516,
134
- "learning_rate": 4.993172933464471e-05,
135
- "loss": 2.9793,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.9620253164556962,
140
- "grad_norm": 8.159234046936035,
141
- "learning_rate": 4.984647842238185e-05,
142
- "loss": 2.4679,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0,
147
- "grad_norm": 9.91518497467041,
148
- "learning_rate": 4.972729020927865e-05,
149
- "loss": 2.3049,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.0506329113924051,
154
- "grad_norm": 7.462806701660156,
155
- "learning_rate": 4.957432749209755e-05,
156
- "loss": 1.9516,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1012658227848102,
161
- "grad_norm": 11.39200210571289,
162
- "learning_rate": 4.938779919951092e-05,
163
- "loss": 1.7985,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.1518987341772151,
168
- "grad_norm": 38.71971130371094,
169
- "learning_rate": 4.916796010672969e-05,
170
- "loss": 1.9365,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.2025316455696202,
175
- "grad_norm": 12.706032752990723,
176
- "learning_rate": 4.891511048751102e-05,
177
- "loss": 1.9979,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.2531645569620253,
182
- "grad_norm": 7.104768753051758,
183
- "learning_rate": 4.862959570402049e-05,
184
- "loss": 1.5809,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3037974683544304,
189
- "grad_norm": 9.931644439697266,
190
- "learning_rate": 4.8311805735108894e-05,
191
- "loss": 2.2397,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.3544303797468356,
196
- "grad_norm": 12.904341697692871,
197
- "learning_rate": 4.796217464364808e-05,
198
- "loss": 2.111,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4050632911392404,
203
- "grad_norm": 9.214466094970703,
204
- "learning_rate": 4.758117998365322e-05,
205
- "loss": 1.8629,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.4556962025316456,
210
- "grad_norm": 8.815692901611328,
211
- "learning_rate": 4.716934214800155e-05,
212
- "loss": 1.8746,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.5063291139240507,
217
- "grad_norm": 6.838780879974365,
218
- "learning_rate": 4.672722365763821e-05,
219
- "loss": 1.5414,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.5569620253164556,
224
- "grad_norm": 8.139121055603027,
225
- "learning_rate": 4.625542839324036e-05,
226
- "loss": 1.6602,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.6075949367088609,
231
- "grad_norm": 7.884446620941162,
232
- "learning_rate": 4.575460077038877e-05,
233
- "loss": 1.8808,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.6582278481012658,
238
- "grad_norm": 7.442699432373047,
239
- "learning_rate": 4.522542485937369e-05,
240
- "loss": 1.6506,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.7088607594936709,
245
- "grad_norm": 8.191823959350586,
246
- "learning_rate": 4.4668623450837085e-05,
247
- "loss": 1.8345,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.759493670886076,
252
- "grad_norm": 11.364486694335938,
253
- "learning_rate": 4.408495706852758e-05,
254
- "loss": 1.9866,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.810126582278481,
259
- "grad_norm": 11.59144115447998,
260
- "learning_rate": 4.347522293051648e-05,
261
- "loss": 1.9569,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.8607594936708862,
266
- "grad_norm": 6.322240829467773,
267
- "learning_rate": 4.284025386029381e-05,
268
- "loss": 1.5203,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 1.9113924050632911,
273
- "grad_norm": 9.493250846862793,
274
- "learning_rate": 4.218091714923157e-05,
275
- "loss": 2.1347,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 1.9620253164556962,
280
- "grad_norm": 7.811888217926025,
281
- "learning_rate": 4.149811337196807e-05,
282
- "loss": 1.6241,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.0,
287
- "grad_norm": 211.09934997558594,
288
- "learning_rate": 4.079277515633127e-05,
289
- "loss": 2.5542,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.050632911392405,
294
- "grad_norm": 14.255730628967285,
295
- "learning_rate": 4.0065865909481417e-05,
296
- "loss": 1.3737,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.1012658227848102,
301
- "grad_norm": 8.713842391967773,
302
- "learning_rate": 3.931837850201263e-05,
303
- "loss": 1.4809,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.151898734177215,
308
- "grad_norm": 7.860729217529297,
309
- "learning_rate": 3.855133391181124e-05,
310
- "loss": 1.2102,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.2025316455696204,
315
- "grad_norm": 7.420332908630371,
316
- "learning_rate": 3.7765779829522675e-05,
317
- "loss": 1.1833,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.2531645569620253,
322
- "grad_norm": 10.270529747009277,
323
- "learning_rate": 3.696278922753216e-05,
324
- "loss": 1.4396,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.3037974683544302,
329
- "grad_norm": 10.930971145629883,
330
- "learning_rate": 3.6143458894413465e-05,
331
- "loss": 1.3466,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.3544303797468356,
336
- "grad_norm": 7.2761125564575195,
337
- "learning_rate": 3.5308907936847594e-05,
338
- "loss": 1.0504,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.4050632911392404,
343
- "grad_norm": 6.623189926147461,
344
- "learning_rate": 3.446027625105776e-05,
345
- "loss": 1.4782,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.4556962025316453,
350
- "grad_norm": 6.6565985679626465,
351
- "learning_rate": 3.3598722965848204e-05,
352
- "loss": 1.211,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.5063291139240507,
357
- "grad_norm": 7.586391448974609,
358
- "learning_rate": 3.272542485937369e-05,
359
- "loss": 0.99,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.5569620253164556,
364
- "grad_norm": 5.9524312019348145,
365
- "learning_rate": 3.1841574751802076e-05,
366
- "loss": 1.0821,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.607594936708861,
371
- "grad_norm": 8.896554946899414,
372
- "learning_rate": 3.094837987606547e-05,
373
- "loss": 0.9181,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.6582278481012658,
378
- "grad_norm": 6.550152778625488,
379
- "learning_rate": 3.0047060228925256e-05,
380
- "loss": 1.1579,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.708860759493671,
385
- "grad_norm": 9.528509140014648,
386
- "learning_rate": 2.913884690460325e-05,
387
- "loss": 1.5195,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.759493670886076,
392
- "grad_norm": 6.139204978942871,
393
- "learning_rate": 2.8224980413255086e-05,
394
- "loss": 1.0624,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.810126582278481,
399
- "grad_norm": 6.365853309631348,
400
- "learning_rate": 2.7306708986582553e-05,
401
- "loss": 1.0718,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 2.8607594936708862,
406
- "grad_norm": 14.753962516784668,
407
- "learning_rate": 2.638528687289925e-05,
408
- "loss": 1.0694,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 2.911392405063291,
413
- "grad_norm": 6.6717305183410645,
414
- "learning_rate": 2.5461972623978247e-05,
415
- "loss": 1.1045,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 2.962025316455696,
420
- "grad_norm": 10.40539836883545,
421
- "learning_rate": 2.453802737602176e-05,
422
- "loss": 1.3119,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.0,
427
- "grad_norm": 4.392307281494141,
428
- "learning_rate": 2.361471312710075e-05,
429
- "loss": 0.5706,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 3.050632911392405,
434
- "grad_norm": 5.2546257972717285,
435
- "learning_rate": 2.2693291013417453e-05,
436
- "loss": 0.6983,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 3.1012658227848102,
441
- "grad_norm": 5.815437316894531,
442
- "learning_rate": 2.1775019586744923e-05,
443
- "loss": 0.9768,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 3.151898734177215,
448
- "grad_norm": 5.194660186767578,
449
- "learning_rate": 2.0861153095396748e-05,
450
- "loss": 0.6243,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 3.2025316455696204,
455
- "grad_norm": 4.012391567230225,
456
- "learning_rate": 1.995293977107475e-05,
457
- "loss": 0.469,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 3.2531645569620253,
462
- "grad_norm": 5.675468444824219,
463
- "learning_rate": 1.9051620123934537e-05,
464
- "loss": 0.6084,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 3.3037974683544302,
469
- "grad_norm": 5.8908209800720215,
470
- "learning_rate": 1.815842524819793e-05,
471
- "loss": 0.648,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 3.3544303797468356,
476
- "grad_norm": 7.725429534912109,
477
- "learning_rate": 1.7274575140626318e-05,
478
- "loss": 0.6949,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 3.4050632911392404,
483
- "grad_norm": 6.168173313140869,
484
- "learning_rate": 1.6401277034151798e-05,
485
- "loss": 0.9213,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 3.4556962025316453,
490
- "grad_norm": 6.947693347930908,
491
- "learning_rate": 1.5539723748942245e-05,
492
- "loss": 0.7397,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 3.5063291139240507,
497
- "grad_norm": 5.9794206619262695,
498
- "learning_rate": 1.4691092063152417e-05,
499
- "loss": 0.5009,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 3.5569620253164556,
504
- "grad_norm": 5.66774320602417,
505
- "learning_rate": 1.3856541105586545e-05,
506
- "loss": 0.5204,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 3.607594936708861,
511
- "grad_norm": 8.234807014465332,
512
- "learning_rate": 1.303721077246784e-05,
513
- "loss": 0.8793,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 3.6582278481012658,
518
- "grad_norm": 8.785400390625,
519
- "learning_rate": 1.223422017047733e-05,
520
- "loss": 0.6229,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 3.708860759493671,
525
- "grad_norm": 6.376526832580566,
526
- "learning_rate": 1.1448666088188764e-05,
527
- "loss": 0.6154,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 3.759493670886076,
532
- "grad_norm": 7.004448413848877,
533
- "learning_rate": 1.068162149798737e-05,
534
- "loss": 0.7203,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 3.810126582278481,
539
- "grad_norm": 5.858279705047607,
540
- "learning_rate": 9.934134090518593e-06,
541
- "loss": 0.5153,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 3.8607594936708862,
546
- "grad_norm": 7.578220844268799,
547
- "learning_rate": 9.207224843668732e-06,
548
- "loss": 0.7153,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 3.911392405063291,
553
- "grad_norm": 7.869601249694824,
554
- "learning_rate": 8.50188662803194e-06,
555
- "loss": 0.6988,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 3.962025316455696,
560
- "grad_norm": 6.777385234832764,
561
- "learning_rate": 7.819082850768434e-06,
562
- "loss": 0.6007,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 4.0,
567
- "grad_norm": 6.161752223968506,
568
- "learning_rate": 7.159746139706194e-06,
569
- "loss": 0.4779,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 4.050632911392405,
574
- "grad_norm": 5.206139087677002,
575
- "learning_rate": 6.524777069483526e-06,
576
- "loss": 0.4173,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 4.10126582278481,
581
- "grad_norm": 4.832441329956055,
582
- "learning_rate": 5.915042931472425e-06,
583
- "loss": 0.4491,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 4.151898734177215,
588
- "grad_norm": 4.783233165740967,
589
- "learning_rate": 5.33137654916292e-06,
590
- "loss": 0.3311,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 4.2025316455696204,
595
- "grad_norm": 3.099482536315918,
596
- "learning_rate": 4.7745751406263165e-06,
597
- "loss": 0.2116,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 4.253164556962025,
602
- "grad_norm": 5.326932907104492,
603
- "learning_rate": 4.245399229611238e-06,
604
- "loss": 0.3897,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 4.30379746835443,
609
- "grad_norm": 4.431222915649414,
610
- "learning_rate": 3.7445716067596503e-06,
611
- "loss": 0.4973,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 4.3544303797468356,
616
- "grad_norm": 4.217422008514404,
617
- "learning_rate": 3.2727763423617913e-06,
618
- "loss": 0.182,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 4.405063291139241,
623
- "grad_norm": 5.346303462982178,
624
- "learning_rate": 2.8306578519984527e-06,
625
- "loss": 0.5239,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 4.455696202531645,
630
- "grad_norm": 8.100042343139648,
631
- "learning_rate": 2.418820016346779e-06,
632
- "loss": 0.2284,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 4.506329113924051,
637
- "grad_norm": 4.507992267608643,
638
- "learning_rate": 2.0378253563519247e-06,
639
- "loss": 0.3344,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 4.556962025316456,
644
- "grad_norm": 4.841477394104004,
645
- "learning_rate": 1.6881942648911076e-06,
646
- "loss": 0.2872,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 4.6075949367088604,
651
- "grad_norm": 4.839809417724609,
652
- "learning_rate": 1.3704042959795132e-06,
653
- "loss": 0.5436,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 4.658227848101266,
658
- "grad_norm": 3.7410666942596436,
659
- "learning_rate": 1.0848895124889818e-06,
660
- "loss": 0.3488,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 4.708860759493671,
665
- "grad_norm": 5.837460041046143,
666
- "learning_rate": 8.320398932703144e-07,
667
- "loss": 0.458,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 4.759493670886076,
672
- "grad_norm": 5.102079391479492,
673
- "learning_rate": 6.122008004890851e-07,
674
- "loss": 0.2965,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 4.810126582278481,
679
- "grad_norm": 4.543964385986328,
680
- "learning_rate": 4.256725079024554e-07,
681
- "loss": 0.2352,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 4.860759493670886,
686
- "grad_norm": 4.671619415283203,
687
- "learning_rate": 2.7270979072135104e-07,
688
- "loss": 0.3958,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 4.911392405063291,
693
- "grad_norm": 5.004724979400635,
694
- "learning_rate": 1.5352157761815977e-07,
695
- "loss": 0.3075,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 4.962025316455696,
700
- "grad_norm": 4.00545597076416,
701
- "learning_rate": 6.827066535529946e-08,
702
- "loss": 0.3152,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 5.0,
707
- "grad_norm": 5.139050483703613,
708
- "learning_rate": 1.7073496424427348e-08,
709
- "loss": 0.1744,
710
  "step": 100
711
  }
712
  ],
@@ -727,7 +727,7 @@
727
  "attributes": {}
728
  }
729
  },
730
- "total_flos": 3.0794717131554816e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.05063291139240506,
14
+ "grad_norm": 22.15937042236328,
15
  "learning_rate": 0.0,
16
  "loss": 3.0474,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.10126582278481013,
21
+ "grad_norm": 36.183231353759766,
22
+ "learning_rate": 3.3333333333333335e-05,
23
  "loss": 3.2925,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1518987341772152,
28
+ "grad_norm": 5.320329189300537,
29
+ "learning_rate": 6.666666666666667e-05,
30
+ "loss": 3.0096,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.20253164556962025,
35
+ "grad_norm": 4.796537399291992,
36
+ "learning_rate": 0.0001,
37
+ "loss": 2.6045,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.25316455696202533,
42
+ "grad_norm": 2.9281771183013916,
43
+ "learning_rate": 0.00013333333333333334,
44
+ "loss": 2.1625,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3037974683544304,
49
+ "grad_norm": 7.840775012969971,
50
+ "learning_rate": 0.00016666666666666666,
51
+ "loss": 2.7606,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.35443037974683544,
56
+ "grad_norm": 3.6150004863739014,
57
+ "learning_rate": 0.0002,
58
+ "loss": 2.5675,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4050632911392405,
63
+ "grad_norm": 3.3033154010772705,
64
+ "learning_rate": 0.00023333333333333333,
65
+ "loss": 2.3897,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.45569620253164556,
70
+ "grad_norm": 4.086965560913086,
71
+ "learning_rate": 0.0002666666666666667,
72
+ "loss": 2.755,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.5063291139240507,
77
+ "grad_norm": 3.734769105911255,
78
+ "learning_rate": 0.0003,
79
+ "loss": 2.5837,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.5569620253164557,
84
+ "grad_norm": 3.2234697341918945,
85
+ "learning_rate": 0.0003333333333333333,
86
+ "loss": 2.6353,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6075949367088608,
91
+ "grad_norm": 2.629314422607422,
92
+ "learning_rate": 0.00036666666666666667,
93
+ "loss": 2.732,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.6582278481012658,
98
+ "grad_norm": 3.710653066635132,
99
+ "learning_rate": 0.0004,
100
+ "loss": 2.4126,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7088607594936709,
105
+ "grad_norm": 3.1855616569519043,
106
+ "learning_rate": 0.00043333333333333337,
107
+ "loss": 2.0725,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.759493670886076,
112
+ "grad_norm": 3.34596848487854,
113
+ "learning_rate": 0.00046666666666666666,
114
+ "loss": 2.3351,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.810126582278481,
119
+ "grad_norm": 3.2423255443573,
120
+ "learning_rate": 0.0005,
121
+ "loss": 2.7354,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.8607594936708861,
126
+ "grad_norm": 3.6997056007385254,
127
+ "learning_rate": 0.0004998292650357557,
128
+ "loss": 2.6154,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9113924050632911,
133
+ "grad_norm": 3.3852779865264893,
134
+ "learning_rate": 0.0004993172933464471,
135
+ "loss": 2.9623,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.9620253164556962,
140
+ "grad_norm": 5.298639297485352,
141
+ "learning_rate": 0.0004984647842238185,
142
+ "loss": 2.7019,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0,
147
+ "grad_norm": 4.402466773986816,
148
+ "learning_rate": 0.0004972729020927865,
149
+ "loss": 2.4495,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.0506329113924051,
154
+ "grad_norm": 3.915959358215332,
155
+ "learning_rate": 0.0004957432749209755,
156
+ "loss": 1.9027,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1012658227848102,
161
+ "grad_norm": 2.6463258266448975,
162
+ "learning_rate": 0.0004938779919951092,
163
+ "loss": 1.6182,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.1518987341772151,
168
+ "grad_norm": 3.179638385772705,
169
+ "learning_rate": 0.0004916796010672969,
170
+ "loss": 1.6585,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.2025316455696202,
175
+ "grad_norm": 5.375019550323486,
176
+ "learning_rate": 0.0004891511048751102,
177
+ "loss": 1.9379,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.2531645569620253,
182
+ "grad_norm": 3.4863626956939697,
183
+ "learning_rate": 0.00048629595704020493,
184
+ "loss": 1.5123,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3037974683544304,
189
+ "grad_norm": 2.945317506790161,
190
+ "learning_rate": 0.00048311805735108893,
191
+ "loss": 1.8359,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.3544303797468356,
196
+ "grad_norm": 4.178781986236572,
197
+ "learning_rate": 0.0004796217464364808,
198
+ "loss": 1.8808,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4050632911392404,
203
+ "grad_norm": 3.691697597503662,
204
+ "learning_rate": 0.0004758117998365322,
205
+ "loss": 1.7988,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.4556962025316456,
210
+ "grad_norm": 3.262970447540283,
211
+ "learning_rate": 0.00047169342148001547,
212
+ "loss": 1.8361,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.5063291139240507,
217
+ "grad_norm": 3.550689220428467,
218
+ "learning_rate": 0.0004672722365763821,
219
+ "loss": 1.609,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.5569620253164556,
224
+ "grad_norm": 3.615360736846924,
225
+ "learning_rate": 0.0004625542839324036,
226
+ "loss": 1.5563,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.6075949367088609,
231
+ "grad_norm": 2.8636770248413086,
232
+ "learning_rate": 0.00045754600770388763,
233
+ "loss": 1.5551,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.6582278481012658,
238
+ "grad_norm": 4.557415962219238,
239
+ "learning_rate": 0.0004522542485937369,
240
+ "loss": 1.7539,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.7088607594936709,
245
+ "grad_norm": 3.1868419647216797,
246
+ "learning_rate": 0.0004466862345083708,
247
+ "loss": 1.7852,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.759493670886076,
252
+ "grad_norm": 3.380448818206787,
253
+ "learning_rate": 0.0004408495706852758,
254
+ "loss": 1.7807,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.810126582278481,
259
+ "grad_norm": 4.706411838531494,
260
+ "learning_rate": 0.00043475222930516476,
261
+ "loss": 1.9616,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.8607594936708862,
266
+ "grad_norm": 2.568176031112671,
267
+ "learning_rate": 0.0004284025386029381,
268
+ "loss": 1.5545,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 1.9113924050632911,
273
+ "grad_norm": 3.263295888900757,
274
+ "learning_rate": 0.00042180917149231567,
275
+ "loss": 1.7313,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 1.9620253164556962,
280
+ "grad_norm": 5.326169967651367,
281
+ "learning_rate": 0.0004149811337196807,
282
+ "loss": 1.6959,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.0,
287
+ "grad_norm": 4.056178569793701,
288
+ "learning_rate": 0.00040792775156331276,
289
+ "loss": 1.9885,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.050632911392405,
294
+ "grad_norm": 2.816775321960449,
295
+ "learning_rate": 0.0004006586590948141,
296
+ "loss": 1.079,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.1012658227848102,
301
+ "grad_norm": 3.266395330429077,
302
+ "learning_rate": 0.0003931837850201263,
303
+ "loss": 1.021,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.151898734177215,
308
+ "grad_norm": 4.753467559814453,
309
+ "learning_rate": 0.00038551333911811237,
310
+ "loss": 1.0574,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.2025316455696204,
315
+ "grad_norm": 4.696927547454834,
316
+ "learning_rate": 0.00037765779829522674,
317
+ "loss": 1.1594,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.2531645569620253,
322
+ "grad_norm": 3.3051199913024902,
323
+ "learning_rate": 0.00036962789227532164,
324
+ "loss": 1.0123,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.3037974683544302,
329
+ "grad_norm": 3.196387767791748,
330
+ "learning_rate": 0.0003614345889441346,
331
+ "loss": 1.2621,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.3544303797468356,
336
+ "grad_norm": 3.5468530654907227,
337
+ "learning_rate": 0.0003530890793684759,
338
+ "loss": 1.0165,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.4050632911392404,
343
+ "grad_norm": 2.5015392303466797,
344
+ "learning_rate": 0.0003446027625105776,
345
+ "loss": 1.216,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.4556962025316453,
350
+ "grad_norm": 2.7021989822387695,
351
+ "learning_rate": 0.00033598722965848206,
352
+ "loss": 1.2118,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.5063291139240507,
357
+ "grad_norm": 3.0691046714782715,
358
+ "learning_rate": 0.00032725424859373687,
359
+ "loss": 0.9781,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.5569620253164556,
364
+ "grad_norm": 2.657027244567871,
365
+ "learning_rate": 0.0003184157475180208,
366
+ "loss": 0.9974,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.607594936708861,
371
+ "grad_norm": 3.9573814868927,
372
+ "learning_rate": 0.00030948379876065467,
373
+ "loss": 0.8675,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.6582278481012658,
378
+ "grad_norm": 2.825610876083374,
379
+ "learning_rate": 0.00030047060228925254,
380
+ "loss": 1.0503,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.708860759493671,
385
+ "grad_norm": 4.903167724609375,
386
+ "learning_rate": 0.0002913884690460325,
387
+ "loss": 1.4037,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.759493670886076,
392
+ "grad_norm": 2.883190393447876,
393
+ "learning_rate": 0.00028224980413255084,
394
+ "loss": 0.9242,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.810126582278481,
399
+ "grad_norm": 2.112745523452759,
400
+ "learning_rate": 0.0002730670898658255,
401
+ "loss": 0.8504,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 2.8607594936708862,
406
+ "grad_norm": 2.8920631408691406,
407
+ "learning_rate": 0.0002638528687289925,
408
+ "loss": 0.8863,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 2.911392405063291,
413
+ "grad_norm": 2.817871570587158,
414
+ "learning_rate": 0.0002546197262397825,
415
+ "loss": 1.0101,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 2.962025316455696,
420
+ "grad_norm": 3.5625603199005127,
421
+ "learning_rate": 0.0002453802737602176,
422
+ "loss": 1.138,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.0,
427
+ "grad_norm": 2.3167858123779297,
428
+ "learning_rate": 0.00023614713127100752,
429
+ "loss": 0.5741,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 3.050632911392405,
434
+ "grad_norm": 2.5570881366729736,
435
+ "learning_rate": 0.00022693291013417452,
436
+ "loss": 0.6103,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 3.1012658227848102,
441
+ "grad_norm": 1.9336360692977905,
442
+ "learning_rate": 0.00021775019586744925,
443
+ "loss": 0.4277,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 3.151898734177215,
448
+ "grad_norm": 2.514486789703369,
449
+ "learning_rate": 0.0002086115309539675,
450
+ "loss": 0.6049,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 3.2025316455696204,
455
+ "grad_norm": 1.7973552942276,
456
+ "learning_rate": 0.0001995293977107475,
457
+ "loss": 0.3149,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 3.2531645569620253,
462
+ "grad_norm": 1.9369994401931763,
463
+ "learning_rate": 0.00019051620123934537,
464
+ "loss": 0.4559,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 3.3037974683544302,
469
+ "grad_norm": 2.178471803665161,
470
+ "learning_rate": 0.0001815842524819793,
471
+ "loss": 0.5129,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 3.3544303797468356,
476
+ "grad_norm": 2.5989177227020264,
477
+ "learning_rate": 0.00017274575140626317,
478
+ "loss": 0.5667,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 3.4050632911392404,
483
+ "grad_norm": 2.144813299179077,
484
+ "learning_rate": 0.00016401277034151795,
485
+ "loss": 0.575,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 3.4556962025316453,
490
+ "grad_norm": 2.2235898971557617,
491
+ "learning_rate": 0.00015539723748942243,
492
+ "loss": 0.5374,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 3.5063291139240507,
497
+ "grad_norm": 1.9977900981903076,
498
+ "learning_rate": 0.00014691092063152418,
499
+ "loss": 0.3938,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 3.5569620253164556,
504
+ "grad_norm": 1.750430941581726,
505
+ "learning_rate": 0.00013856541105586545,
506
+ "loss": 0.3894,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 3.607594936708861,
511
+ "grad_norm": 2.250420570373535,
512
+ "learning_rate": 0.0001303721077246784,
513
+ "loss": 0.4814,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 3.6582278481012658,
518
+ "grad_norm": 2.961397409439087,
519
+ "learning_rate": 0.0001223422017047733,
520
+ "loss": 0.4686,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 3.708860759493671,
525
+ "grad_norm": 2.334899425506592,
526
+ "learning_rate": 0.00011448666088188764,
527
+ "loss": 0.5233,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 3.759493670886076,
532
+ "grad_norm": 2.6683523654937744,
533
+ "learning_rate": 0.00010681621497987371,
534
+ "loss": 0.4644,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 3.810126582278481,
539
+ "grad_norm": 2.351604461669922,
540
+ "learning_rate": 9.934134090518593e-05,
541
+ "loss": 0.4128,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 3.8607594936708862,
546
+ "grad_norm": 2.5051443576812744,
547
+ "learning_rate": 9.207224843668733e-05,
548
+ "loss": 0.3683,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 3.911392405063291,
553
+ "grad_norm": 2.8908872604370117,
554
+ "learning_rate": 8.50188662803194e-05,
555
+ "loss": 0.3682,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 3.962025316455696,
560
+ "grad_norm": 2.4416487216949463,
561
+ "learning_rate": 7.819082850768433e-05,
562
+ "loss": 0.5503,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 4.0,
567
+ "grad_norm": 2.0517449378967285,
568
+ "learning_rate": 7.159746139706194e-05,
569
+ "loss": 0.3605,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 4.050632911392405,
574
+ "grad_norm": 1.0866498947143555,
575
+ "learning_rate": 6.524777069483526e-05,
576
+ "loss": 0.1385,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 4.10126582278481,
581
+ "grad_norm": 1.568613052368164,
582
+ "learning_rate": 5.9150429314724254e-05,
583
+ "loss": 0.2764,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 4.151898734177215,
588
+ "grad_norm": 1.359748363494873,
589
+ "learning_rate": 5.3313765491629194e-05,
590
+ "loss": 0.2143,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 4.2025316455696204,
595
+ "grad_norm": 1.2184932231903076,
596
+ "learning_rate": 4.7745751406263163e-05,
597
+ "loss": 0.1412,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 4.253164556962025,
602
+ "grad_norm": 1.2645926475524902,
603
+ "learning_rate": 4.245399229611238e-05,
604
+ "loss": 0.1776,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 4.30379746835443,
609
+ "grad_norm": 1.2445096969604492,
610
+ "learning_rate": 3.7445716067596506e-05,
611
+ "loss": 0.2054,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 4.3544303797468356,
616
+ "grad_norm": 0.9805382490158081,
617
+ "learning_rate": 3.2727763423617915e-05,
618
+ "loss": 0.0969,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 4.405063291139241,
623
+ "grad_norm": 1.2975034713745117,
624
+ "learning_rate": 2.8306578519984528e-05,
625
+ "loss": 0.1607,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 4.455696202531645,
630
+ "grad_norm": 1.2653566598892212,
631
+ "learning_rate": 2.4188200163467787e-05,
632
+ "loss": 0.1053,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 4.506329113924051,
637
+ "grad_norm": 1.334803581237793,
638
+ "learning_rate": 2.0378253563519245e-05,
639
+ "loss": 0.1769,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 4.556962025316456,
644
+ "grad_norm": 1.574942946434021,
645
+ "learning_rate": 1.6881942648911074e-05,
646
+ "loss": 0.1549,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 4.6075949367088604,
651
+ "grad_norm": 1.321705937385559,
652
+ "learning_rate": 1.3704042959795133e-05,
653
+ "loss": 0.2005,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 4.658227848101266,
658
+ "grad_norm": 1.0175703763961792,
659
+ "learning_rate": 1.0848895124889818e-05,
660
+ "loss": 0.1525,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 4.708860759493671,
665
+ "grad_norm": 1.5820319652557373,
666
+ "learning_rate": 8.320398932703144e-06,
667
+ "loss": 0.1657,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 4.759493670886076,
672
+ "grad_norm": 1.1965199708938599,
673
+ "learning_rate": 6.12200800489085e-06,
674
+ "loss": 0.1327,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 4.810126582278481,
679
+ "grad_norm": 1.2993100881576538,
680
+ "learning_rate": 4.256725079024554e-06,
681
+ "loss": 0.1132,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 4.860759493670886,
686
+ "grad_norm": 1.2790327072143555,
687
+ "learning_rate": 2.7270979072135106e-06,
688
+ "loss": 0.1908,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 4.911392405063291,
693
+ "grad_norm": 1.3205540180206299,
694
+ "learning_rate": 1.5352157761815977e-06,
695
+ "loss": 0.1363,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 4.962025316455696,
700
+ "grad_norm": 1.3380309343338013,
701
+ "learning_rate": 6.827066535529947e-07,
702
+ "loss": 0.1689,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 5.0,
707
+ "grad_norm": 1.3308898210525513,
708
+ "learning_rate": 1.7073496424427348e-07,
709
+ "loss": 0.0823,
710
  "step": 100
711
  }
712
  ],
 
727
  "attributes": {}
728
  }
729
  },
730
+ "total_flos": 2.9265650012307456e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36aa9ca963d01db4b15535adad54feeec8ce1d204d5d897f4466eefee776e7af
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba1f2914799e90728253ee6efdcfb4b949f06f16db0ce9229e365f91a8f35af
3
  size 6033