robertou2 commited on
Commit
3dc8274
·
verified ·
1 Parent(s): 37fdf3c

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,24 +12,24 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 128,
16
  "lora_bias": false,
17
- "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 64,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
- "down_proj",
30
  "q_proj",
31
  "up_proj",
32
- "o_proj"
 
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 256,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 128,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
 
26
  "q_proj",
27
  "up_proj",
28
+ "o_proj",
29
+ "v_proj",
30
+ "k_proj",
31
+ "gate_proj",
32
+ "down_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:310a72286497d50d5e6975572939b89cc2c30822edb5bf7327d465df0b83b07a
3
- size 479005064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa6c7de40a533ba2c4007a745595ab0f2ae9ee258e41c56770c1fd446b5c5664
3
+ size 957942768
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b787d2ecf06ff6725d757284b065b1537c93c563f801a5cedf95dbd9a4aec45d
3
- size 958300235
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23717abd38a71cf0035162ad78af09ed8877c50c8910ff1c5069995befc406e7
3
+ size 1916174411
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5b517d1b8e2b0f837c8b00170b154961d4d989feba4326ac25583df7a55c57a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3d312928d0bb60518eb9856d5ab0ae1674bcb745294bf27f615cb6d07b0463e
3
  size 14645
trainer_state.json CHANGED
@@ -11,702 +11,702 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.05063291139240506,
14
- "grad_norm": 29.55121612548828,
15
  "learning_rate": 0.0,
16
  "loss": 3.0474,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.10126582278481013,
21
- "grad_norm": 47.9920539855957,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 3.2925,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1518987341772152,
28
- "grad_norm": 21.540756225585938,
29
  "learning_rate": 6.666666666666667e-06,
30
- "loss": 3.201,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.20253164556962025,
35
- "grad_norm": 8.517359733581543,
36
  "learning_rate": 1e-05,
37
- "loss": 2.6792,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.25316455696202533,
42
- "grad_norm": 4.019173622131348,
43
  "learning_rate": 1.3333333333333333e-05,
44
- "loss": 2.2043,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3037974683544304,
49
- "grad_norm": 11.462793350219727,
50
  "learning_rate": 1.6666666666666667e-05,
51
- "loss": 2.8127,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.35443037974683544,
56
- "grad_norm": 4.689273357391357,
57
  "learning_rate": 2e-05,
58
- "loss": 2.5712,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4050632911392405,
63
- "grad_norm": 4.675689697265625,
64
  "learning_rate": 2.3333333333333336e-05,
65
- "loss": 2.4222,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.45569620253164556,
70
- "grad_norm": 4.813397407531738,
71
  "learning_rate": 2.6666666666666667e-05,
72
- "loss": 2.7645,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.5063291139240507,
77
- "grad_norm": 4.817192077636719,
78
  "learning_rate": 3e-05,
79
- "loss": 2.5684,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.5569620253164557,
84
- "grad_norm": 5.056623458862305,
85
  "learning_rate": 3.3333333333333335e-05,
86
- "loss": 2.7122,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6075949367088608,
91
- "grad_norm": 4.037632465362549,
92
  "learning_rate": 3.6666666666666666e-05,
93
- "loss": 2.7339,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.6582278481012658,
98
- "grad_norm": 4.737704277038574,
99
  "learning_rate": 4e-05,
100
- "loss": 2.5438,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7088607594936709,
105
- "grad_norm": 4.448659896850586,
106
  "learning_rate": 4.3333333333333334e-05,
107
- "loss": 2.1418,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.759493670886076,
112
- "grad_norm": 4.827819347381592,
113
  "learning_rate": 4.666666666666667e-05,
114
- "loss": 2.2931,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.810126582278481,
119
- "grad_norm": 4.652594566345215,
120
  "learning_rate": 5e-05,
121
- "loss": 2.6527,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.8607594936708861,
126
- "grad_norm": 3.7353029251098633,
127
  "learning_rate": 4.998292650357558e-05,
128
- "loss": 2.427,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9113924050632911,
133
- "grad_norm": 4.235123634338379,
134
  "learning_rate": 4.993172933464471e-05,
135
- "loss": 3.0115,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.9620253164556962,
140
- "grad_norm": 4.2815046310424805,
141
  "learning_rate": 4.984647842238185e-05,
142
- "loss": 2.4574,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0,
147
- "grad_norm": 5.382309436798096,
148
  "learning_rate": 4.972729020927865e-05,
149
- "loss": 2.2266,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.0506329113924051,
154
- "grad_norm": 3.046997308731079,
155
  "learning_rate": 4.957432749209755e-05,
156
- "loss": 2.1602,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1012658227848102,
161
- "grad_norm": 3.3037655353546143,
162
  "learning_rate": 4.938779919951092e-05,
163
- "loss": 2.0352,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.1518987341772151,
168
- "grad_norm": 3.3994479179382324,
169
  "learning_rate": 4.916796010672969e-05,
170
- "loss": 2.0341,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.2025316455696202,
175
- "grad_norm": 4.110241889953613,
176
  "learning_rate": 4.891511048751102e-05,
177
- "loss": 2.1729,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.2531645569620253,
182
- "grad_norm": 3.1976218223571777,
183
  "learning_rate": 4.862959570402049e-05,
184
- "loss": 1.7632,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3037974683544304,
189
- "grad_norm": 4.051258563995361,
190
  "learning_rate": 4.8311805735108894e-05,
191
- "loss": 2.25,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.3544303797468356,
196
- "grad_norm": 6.929507255554199,
197
  "learning_rate": 4.796217464364808e-05,
198
- "loss": 2.3207,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4050632911392404,
203
- "grad_norm": 4.213507652282715,
204
  "learning_rate": 4.758117998365322e-05,
205
- "loss": 2.1319,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.4556962025316456,
210
- "grad_norm": 4.12835168838501,
211
  "learning_rate": 4.716934214800155e-05,
212
- "loss": 2.1621,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.5063291139240507,
217
- "grad_norm": 4.359489917755127,
218
  "learning_rate": 4.672722365763821e-05,
219
- "loss": 1.7892,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.5569620253164556,
224
- "grad_norm": 4.402414798736572,
225
  "learning_rate": 4.625542839324036e-05,
226
- "loss": 2.0661,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.6075949367088609,
231
- "grad_norm": 4.111989498138428,
232
  "learning_rate": 4.575460077038877e-05,
233
- "loss": 2.3087,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.6582278481012658,
238
- "grad_norm": 3.7056782245635986,
239
  "learning_rate": 4.522542485937369e-05,
240
- "loss": 1.7851,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.7088607594936709,
245
- "grad_norm": 3.535465717315674,
246
  "learning_rate": 4.4668623450837085e-05,
247
- "loss": 1.9468,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.759493670886076,
252
- "grad_norm": 4.073657989501953,
253
  "learning_rate": 4.408495706852758e-05,
254
- "loss": 2.2267,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.810126582278481,
259
- "grad_norm": 6.199657917022705,
260
  "learning_rate": 4.347522293051648e-05,
261
- "loss": 2.0838,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.8607594936708862,
266
- "grad_norm": 3.3513576984405518,
267
  "learning_rate": 4.284025386029381e-05,
268
- "loss": 1.7043,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 1.9113924050632911,
273
- "grad_norm": 4.669745922088623,
274
  "learning_rate": 4.218091714923157e-05,
275
- "loss": 1.9935,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 1.9620253164556962,
280
- "grad_norm": 4.580756187438965,
281
  "learning_rate": 4.149811337196807e-05,
282
- "loss": 1.825,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.0,
287
- "grad_norm": 4.372970104217529,
288
  "learning_rate": 4.079277515633127e-05,
289
- "loss": 2.303,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.050632911392405,
294
- "grad_norm": 3.1565821170806885,
295
  "learning_rate": 4.0065865909481417e-05,
296
- "loss": 1.489,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.1012658227848102,
301
- "grad_norm": 4.271334171295166,
302
  "learning_rate": 3.931837850201263e-05,
303
- "loss": 1.8017,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.151898734177215,
308
- "grad_norm": 3.5568835735321045,
309
  "learning_rate": 3.855133391181124e-05,
310
- "loss": 1.4802,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.2025316455696204,
315
- "grad_norm": 4.271124362945557,
316
  "learning_rate": 3.7765779829522675e-05,
317
- "loss": 1.5782,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.2531645569620253,
322
- "grad_norm": 6.451624870300293,
323
  "learning_rate": 3.696278922753216e-05,
324
- "loss": 1.7873,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.3037974683544302,
329
- "grad_norm": 3.1512181758880615,
330
  "learning_rate": 3.6143458894413465e-05,
331
- "loss": 1.4847,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.3544303797468356,
336
- "grad_norm": 5.191463470458984,
337
  "learning_rate": 3.5308907936847594e-05,
338
- "loss": 1.4307,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.4050632911392404,
343
- "grad_norm": 3.847968816757202,
344
  "learning_rate": 3.446027625105776e-05,
345
- "loss": 1.558,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.4556962025316453,
350
- "grad_norm": 4.588655948638916,
351
  "learning_rate": 3.3598722965848204e-05,
352
- "loss": 1.55,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.5063291139240507,
357
- "grad_norm": 4.686515808105469,
358
  "learning_rate": 3.272542485937369e-05,
359
- "loss": 1.3516,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.5569620253164556,
364
- "grad_norm": 3.790144205093384,
365
  "learning_rate": 3.1841574751802076e-05,
366
- "loss": 1.3935,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.607594936708861,
371
- "grad_norm": 5.291225910186768,
372
  "learning_rate": 3.094837987606547e-05,
373
- "loss": 1.3247,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.6582278481012658,
378
- "grad_norm": 3.8952436447143555,
379
  "learning_rate": 3.0047060228925256e-05,
380
- "loss": 1.7195,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.708860759493671,
385
- "grad_norm": 5.9185261726379395,
386
  "learning_rate": 2.913884690460325e-05,
387
- "loss": 1.9431,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.759493670886076,
392
- "grad_norm": 3.7539308071136475,
393
  "learning_rate": 2.8224980413255086e-05,
394
- "loss": 1.4235,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.810126582278481,
399
- "grad_norm": 3.4502947330474854,
400
  "learning_rate": 2.7306708986582553e-05,
401
- "loss": 1.2878,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 2.8607594936708862,
406
- "grad_norm": 5.332718849182129,
407
  "learning_rate": 2.638528687289925e-05,
408
- "loss": 1.5786,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 2.911392405063291,
413
- "grad_norm": 4.221064567565918,
414
  "learning_rate": 2.5461972623978247e-05,
415
- "loss": 1.3699,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 2.962025316455696,
420
- "grad_norm": 5.887556076049805,
421
  "learning_rate": 2.453802737602176e-05,
422
- "loss": 1.8087,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.0,
427
- "grad_norm": 3.3788065910339355,
428
  "learning_rate": 2.361471312710075e-05,
429
- "loss": 0.8745,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 3.050632911392405,
434
- "grad_norm": 3.735031843185425,
435
  "learning_rate": 2.2693291013417453e-05,
436
- "loss": 1.2198,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 3.1012658227848102,
441
- "grad_norm": 4.278702259063721,
442
  "learning_rate": 2.1775019586744923e-05,
443
- "loss": 1.1114,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 3.151898734177215,
448
- "grad_norm": 3.5521931648254395,
449
  "learning_rate": 2.0861153095396748e-05,
450
- "loss": 1.036,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 3.2025316455696204,
455
- "grad_norm": 2.7933812141418457,
456
  "learning_rate": 1.995293977107475e-05,
457
- "loss": 0.7252,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 3.2531645569620253,
462
- "grad_norm": 6.411159038543701,
463
  "learning_rate": 1.9051620123934537e-05,
464
- "loss": 0.9923,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 3.3037974683544302,
469
- "grad_norm": 4.2825846672058105,
470
  "learning_rate": 1.815842524819793e-05,
471
- "loss": 1.1409,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 3.3544303797468356,
476
- "grad_norm": 5.785189151763916,
477
  "learning_rate": 1.7274575140626318e-05,
478
- "loss": 1.1165,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 3.4050632911392404,
483
- "grad_norm": 4.309160232543945,
484
  "learning_rate": 1.6401277034151798e-05,
485
- "loss": 1.2768,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 3.4556962025316453,
490
- "grad_norm": 5.033167362213135,
491
  "learning_rate": 1.5539723748942245e-05,
492
- "loss": 1.2941,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 3.5063291139240507,
497
- "grad_norm": 4.315425395965576,
498
  "learning_rate": 1.4691092063152417e-05,
499
- "loss": 0.9229,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 3.5569620253164556,
504
- "grad_norm": 4.174651622772217,
505
  "learning_rate": 1.3856541105586545e-05,
506
- "loss": 1.0337,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 3.607594936708861,
511
- "grad_norm": 4.93477201461792,
512
  "learning_rate": 1.303721077246784e-05,
513
- "loss": 0.9844,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 3.6582278481012658,
518
- "grad_norm": 5.448512554168701,
519
  "learning_rate": 1.223422017047733e-05,
520
- "loss": 1.0804,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 3.708860759493671,
525
- "grad_norm": 4.748834609985352,
526
  "learning_rate": 1.1448666088188764e-05,
527
- "loss": 1.0972,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 3.759493670886076,
532
- "grad_norm": 5.283952713012695,
533
  "learning_rate": 1.068162149798737e-05,
534
- "loss": 1.0916,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 3.810126582278481,
539
- "grad_norm": 4.5018630027771,
540
  "learning_rate": 9.934134090518593e-06,
541
- "loss": 0.9075,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 3.8607594936708862,
546
- "grad_norm": 5.909902572631836,
547
  "learning_rate": 9.207224843668732e-06,
548
- "loss": 1.314,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 3.911392405063291,
553
- "grad_norm": 5.771544933319092,
554
  "learning_rate": 8.50188662803194e-06,
555
- "loss": 1.2385,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 3.962025316455696,
560
- "grad_norm": 5.263690948486328,
561
  "learning_rate": 7.819082850768434e-06,
562
- "loss": 1.05,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 4.0,
567
- "grad_norm": 4.816619873046875,
568
  "learning_rate": 7.159746139706194e-06,
569
- "loss": 1.0004,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 4.050632911392405,
574
- "grad_norm": 4.460178375244141,
575
  "learning_rate": 6.524777069483526e-06,
576
- "loss": 0.8953,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 4.10126582278481,
581
- "grad_norm": 4.612957954406738,
582
  "learning_rate": 5.915042931472425e-06,
583
- "loss": 1.0056,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 4.151898734177215,
588
- "grad_norm": 5.433095932006836,
589
  "learning_rate": 5.33137654916292e-06,
590
- "loss": 0.7673,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 4.2025316455696204,
595
- "grad_norm": 3.291714906692505,
596
  "learning_rate": 4.7745751406263165e-06,
597
- "loss": 0.6697,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 4.253164556962025,
602
- "grad_norm": 3.8275110721588135,
603
  "learning_rate": 4.245399229611238e-06,
604
- "loss": 0.942,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 4.30379746835443,
609
- "grad_norm": 3.4578733444213867,
610
  "learning_rate": 3.7445716067596503e-06,
611
- "loss": 0.9534,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 4.3544303797468356,
616
- "grad_norm": 4.556038856506348,
617
  "learning_rate": 3.2727763423617913e-06,
618
- "loss": 0.6355,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 4.405063291139241,
623
- "grad_norm": 3.973484754562378,
624
  "learning_rate": 2.8306578519984527e-06,
625
- "loss": 0.8523,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 4.455696202531645,
630
- "grad_norm": 5.197175979614258,
631
  "learning_rate": 2.418820016346779e-06,
632
- "loss": 0.7797,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 4.506329113924051,
637
- "grad_norm": 4.299957752227783,
638
  "learning_rate": 2.0378253563519247e-06,
639
- "loss": 0.9653,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 4.556962025316456,
644
- "grad_norm": 4.691103458404541,
645
  "learning_rate": 1.6881942648911076e-06,
646
- "loss": 0.7476,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 4.6075949367088604,
651
- "grad_norm": 3.614025354385376,
652
  "learning_rate": 1.3704042959795132e-06,
653
- "loss": 0.9732,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 4.658227848101266,
658
- "grad_norm": 3.1150994300842285,
659
  "learning_rate": 1.0848895124889818e-06,
660
- "loss": 0.8307,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 4.708860759493671,
665
- "grad_norm": 4.92384147644043,
666
  "learning_rate": 8.320398932703144e-07,
667
- "loss": 1.0243,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 4.759493670886076,
672
- "grad_norm": 4.248635292053223,
673
  "learning_rate": 6.122008004890851e-07,
674
- "loss": 0.835,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 4.810126582278481,
679
- "grad_norm": 4.580704212188721,
680
  "learning_rate": 4.256725079024554e-07,
681
- "loss": 0.6964,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 4.860759493670886,
686
- "grad_norm": 4.322700023651123,
687
  "learning_rate": 2.7270979072135104e-07,
688
- "loss": 0.9184,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 4.911392405063291,
693
- "grad_norm": 5.485646724700928,
694
  "learning_rate": 1.5352157761815977e-07,
695
- "loss": 0.9517,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 4.962025316455696,
700
- "grad_norm": 3.603494167327881,
701
  "learning_rate": 6.827066535529946e-08,
702
- "loss": 0.7875,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 5.0,
707
- "grad_norm": 5.922054290771484,
708
  "learning_rate": 1.7073496424427348e-08,
709
- "loss": 0.5772,
710
  "step": 100
711
  }
712
  ],
@@ -714,7 +714,7 @@
714
  "max_steps": 100,
715
  "num_input_tokens_seen": 0,
716
  "num_train_epochs": 5,
717
- "save_steps": 10,
718
  "stateful_callbacks": {
719
  "TrainerControl": {
720
  "args": {
@@ -727,7 +727,7 @@
727
  "attributes": {}
728
  }
729
  },
730
- "total_flos": 2.957146343615693e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.05063291139240506,
14
+ "grad_norm": 59.90552520751953,
15
  "learning_rate": 0.0,
16
  "loss": 3.0474,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.10126582278481013,
21
+ "grad_norm": 97.50157928466797,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 3.2925,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1518987341772152,
28
+ "grad_norm": 19.947616577148438,
29
  "learning_rate": 6.666666666666667e-06,
30
+ "loss": 3.0627,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.20253164556962025,
35
+ "grad_norm": 12.422538757324219,
36
  "learning_rate": 1e-05,
37
+ "loss": 2.6282,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.25316455696202533,
42
+ "grad_norm": 7.736602783203125,
43
  "learning_rate": 1.3333333333333333e-05,
44
+ "loss": 2.1838,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3037974683544304,
49
+ "grad_norm": 11.91862964630127,
50
  "learning_rate": 1.6666666666666667e-05,
51
+ "loss": 2.7703,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.35443037974683544,
56
+ "grad_norm": 9.939104080200195,
57
  "learning_rate": 2e-05,
58
+ "loss": 2.5457,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4050632911392405,
63
+ "grad_norm": 8.861398696899414,
64
  "learning_rate": 2.3333333333333336e-05,
65
+ "loss": 2.3865,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.45569620253164556,
70
+ "grad_norm": 9.401021957397461,
71
  "learning_rate": 2.6666666666666667e-05,
72
+ "loss": 2.7507,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.5063291139240507,
77
+ "grad_norm": 9.068034172058105,
78
  "learning_rate": 3e-05,
79
+ "loss": 2.5836,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.5569620253164557,
84
+ "grad_norm": 9.445144653320312,
85
  "learning_rate": 3.3333333333333335e-05,
86
+ "loss": 2.6978,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6075949367088608,
91
+ "grad_norm": 67.43431854248047,
92
  "learning_rate": 3.6666666666666666e-05,
93
+ "loss": 2.7532,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.6582278481012658,
98
+ "grad_norm": 10.308363914489746,
99
  "learning_rate": 4e-05,
100
+ "loss": 2.5309,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7088607594936709,
105
+ "grad_norm": 9.538652420043945,
106
  "learning_rate": 4.3333333333333334e-05,
107
+ "loss": 2.1622,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.759493670886076,
112
+ "grad_norm": 10.145242691040039,
113
  "learning_rate": 4.666666666666667e-05,
114
+ "loss": 2.3157,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.810126582278481,
119
+ "grad_norm": 12.203636169433594,
120
  "learning_rate": 5e-05,
121
+ "loss": 2.7209,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.8607594936708861,
126
+ "grad_norm": 8.771341323852539,
127
  "learning_rate": 4.998292650357558e-05,
128
+ "loss": 2.4761,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9113924050632911,
133
+ "grad_norm": 9.946197509765625,
134
  "learning_rate": 4.993172933464471e-05,
135
+ "loss": 2.9751,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.9620253164556962,
140
+ "grad_norm": 7.850845813751221,
141
  "learning_rate": 4.984647842238185e-05,
142
+ "loss": 2.4653,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0,
147
+ "grad_norm": 10.167204856872559,
148
  "learning_rate": 4.972729020927865e-05,
149
+ "loss": 2.3039,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.0506329113924051,
154
+ "grad_norm": 7.328862190246582,
155
  "learning_rate": 4.957432749209755e-05,
156
+ "loss": 1.9503,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1012658227848102,
161
+ "grad_norm": 7.199270725250244,
162
  "learning_rate": 4.938779919951092e-05,
163
+ "loss": 1.8009,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.1518987341772151,
168
+ "grad_norm": 10.441041946411133,
169
  "learning_rate": 4.916796010672969e-05,
170
+ "loss": 1.9009,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.2025316455696202,
175
+ "grad_norm": 9.012300491333008,
176
  "learning_rate": 4.891511048751102e-05,
177
+ "loss": 1.9546,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.2531645569620253,
182
+ "grad_norm": 8.55013656616211,
183
  "learning_rate": 4.862959570402049e-05,
184
+ "loss": 1.5774,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3037974683544304,
189
+ "grad_norm": 10.145879745483398,
190
  "learning_rate": 4.8311805735108894e-05,
191
+ "loss": 2.2121,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.3544303797468356,
196
+ "grad_norm": 13.984664916992188,
197
  "learning_rate": 4.796217464364808e-05,
198
+ "loss": 2.0364,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4050632911392404,
203
+ "grad_norm": 8.673234939575195,
204
  "learning_rate": 4.758117998365322e-05,
205
+ "loss": 1.8328,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.4556962025316456,
210
+ "grad_norm": 7.145374298095703,
211
  "learning_rate": 4.716934214800155e-05,
212
+ "loss": 1.8605,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.5063291139240507,
217
+ "grad_norm": 7.1981611251831055,
218
  "learning_rate": 4.672722365763821e-05,
219
+ "loss": 1.539,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.5569620253164556,
224
+ "grad_norm": 7.796040058135986,
225
  "learning_rate": 4.625542839324036e-05,
226
+ "loss": 1.6676,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.6075949367088609,
231
+ "grad_norm": 7.746204376220703,
232
  "learning_rate": 4.575460077038877e-05,
233
+ "loss": 1.8508,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.6582278481012658,
238
+ "grad_norm": 7.917386531829834,
239
  "learning_rate": 4.522542485937369e-05,
240
+ "loss": 1.6083,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.7088607594936709,
245
+ "grad_norm": 7.896660804748535,
246
  "learning_rate": 4.4668623450837085e-05,
247
+ "loss": 1.8305,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.759493670886076,
252
+ "grad_norm": 10.955219268798828,
253
  "learning_rate": 4.408495706852758e-05,
254
+ "loss": 2.0006,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.810126582278481,
259
+ "grad_norm": 12.703990936279297,
260
  "learning_rate": 4.347522293051648e-05,
261
+ "loss": 1.9769,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.8607594936708862,
266
+ "grad_norm": 6.554587364196777,
267
  "learning_rate": 4.284025386029381e-05,
268
+ "loss": 1.5205,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 1.9113924050632911,
273
+ "grad_norm": 9.378654479980469,
274
  "learning_rate": 4.218091714923157e-05,
275
+ "loss": 2.112,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 1.9620253164556962,
280
+ "grad_norm": 8.477209091186523,
281
  "learning_rate": 4.149811337196807e-05,
282
+ "loss": 1.6478,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.0,
287
+ "grad_norm": 10.616765975952148,
288
  "learning_rate": 4.079277515633127e-05,
289
+ "loss": 2.3503,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.050632911392405,
294
+ "grad_norm": 5.762049198150635,
295
  "learning_rate": 4.0065865909481417e-05,
296
+ "loss": 1.2976,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.1012658227848102,
301
+ "grad_norm": 7.1912736892700195,
302
  "learning_rate": 3.931837850201263e-05,
303
+ "loss": 1.3291,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.151898734177215,
308
+ "grad_norm": 7.300074577331543,
309
  "learning_rate": 3.855133391181124e-05,
310
+ "loss": 1.0771,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.2025316455696204,
315
+ "grad_norm": 8.115863800048828,
316
  "learning_rate": 3.7765779829522675e-05,
317
+ "loss": 1.1092,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.2531645569620253,
322
+ "grad_norm": 14.167332649230957,
323
  "learning_rate": 3.696278922753216e-05,
324
+ "loss": 1.3089,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.3037974683544302,
329
+ "grad_norm": 5.89024543762207,
330
  "learning_rate": 3.6143458894413465e-05,
331
+ "loss": 1.2359,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.3544303797468356,
336
+ "grad_norm": 8.202404975891113,
337
  "learning_rate": 3.5308907936847594e-05,
338
+ "loss": 1.0419,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.4050632911392404,
343
+ "grad_norm": 6.15331506729126,
344
  "learning_rate": 3.446027625105776e-05,
345
+ "loss": 1.2716,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.4556962025316453,
350
+ "grad_norm": 7.2445173263549805,
351
  "learning_rate": 3.3598722965848204e-05,
352
+ "loss": 1.2061,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.5063291139240507,
357
+ "grad_norm": 8.201730728149414,
358
  "learning_rate": 3.272542485937369e-05,
359
+ "loss": 0.9834,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.5569620253164556,
364
+ "grad_norm": 6.665807723999023,
365
  "learning_rate": 3.1841574751802076e-05,
366
+ "loss": 1.0482,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.607594936708861,
371
+ "grad_norm": 9.461847305297852,
372
  "learning_rate": 3.094837987606547e-05,
373
+ "loss": 0.9212,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.6582278481012658,
378
+ "grad_norm": 6.989381790161133,
379
  "learning_rate": 3.0047060228925256e-05,
380
+ "loss": 1.158,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.708860759493671,
385
+ "grad_norm": 10.334746360778809,
386
  "learning_rate": 2.913884690460325e-05,
387
+ "loss": 1.5235,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.759493670886076,
392
+ "grad_norm": 6.639229774475098,
393
  "learning_rate": 2.8224980413255086e-05,
394
+ "loss": 1.0108,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.810126582278481,
399
+ "grad_norm": 6.2803263664245605,
400
  "learning_rate": 2.7306708986582553e-05,
401
+ "loss": 1.0466,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 2.8607594936708862,
406
+ "grad_norm": 8.546771049499512,
407
  "learning_rate": 2.638528687289925e-05,
408
+ "loss": 1.0012,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 2.911392405063291,
413
+ "grad_norm": 7.191826820373535,
414
  "learning_rate": 2.5461972623978247e-05,
415
+ "loss": 1.0746,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 2.962025316455696,
420
+ "grad_norm": 11.312370300292969,
421
  "learning_rate": 2.453802737602176e-05,
422
+ "loss": 1.3323,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.0,
427
+ "grad_norm": 4.749184608459473,
428
  "learning_rate": 2.361471312710075e-05,
429
+ "loss": 0.5565,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 3.050632911392405,
434
+ "grad_norm": 5.846968650817871,
435
  "learning_rate": 2.2693291013417453e-05,
436
+ "loss": 0.7122,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 3.1012658227848102,
441
+ "grad_norm": 5.533771514892578,
442
  "learning_rate": 2.1775019586744923e-05,
443
+ "loss": 0.728,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 3.151898734177215,
448
+ "grad_norm": 5.222585678100586,
449
  "learning_rate": 2.0861153095396748e-05,
450
+ "loss": 0.6109,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 3.2025316455696204,
455
+ "grad_norm": 3.953166961669922,
456
  "learning_rate": 1.995293977107475e-05,
457
+ "loss": 0.4257,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 3.2531645569620253,
462
+ "grad_norm": 5.500568866729736,
463
  "learning_rate": 1.9051620123934537e-05,
464
+ "loss": 0.5261,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 3.3037974683544302,
469
+ "grad_norm": 5.6277174949646,
470
  "learning_rate": 1.815842524819793e-05,
471
+ "loss": 0.6001,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 3.3544303797468356,
476
+ "grad_norm": 8.257991790771484,
477
  "learning_rate": 1.7274575140626318e-05,
478
+ "loss": 0.6239,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 3.4050632911392404,
483
+ "grad_norm": 6.649881839752197,
484
  "learning_rate": 1.6401277034151798e-05,
485
+ "loss": 0.8054,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 3.4556962025316453,
490
+ "grad_norm": 7.573472499847412,
491
  "learning_rate": 1.5539723748942245e-05,
492
+ "loss": 0.6508,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 3.5063291139240507,
497
+ "grad_norm": 6.748483180999756,
498
  "learning_rate": 1.4691092063152417e-05,
499
+ "loss": 0.4997,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 3.5569620253164556,
504
+ "grad_norm": 6.085212707519531,
505
  "learning_rate": 1.3856541105586545e-05,
506
+ "loss": 0.5069,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 3.607594936708861,
511
+ "grad_norm": 8.33370304107666,
512
  "learning_rate": 1.303721077246784e-05,
513
+ "loss": 0.675,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 3.6582278481012658,
518
+ "grad_norm": 9.128050804138184,
519
  "learning_rate": 1.223422017047733e-05,
520
+ "loss": 0.6149,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 3.708860759493671,
525
+ "grad_norm": 6.868535041809082,
526
  "learning_rate": 1.1448666088188764e-05,
527
+ "loss": 0.5729,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 3.759493670886076,
532
+ "grad_norm": 10.637543678283691,
533
  "learning_rate": 1.068162149798737e-05,
534
+ "loss": 0.6948,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 3.810126582278481,
539
+ "grad_norm": 5.958944320678711,
540
  "learning_rate": 9.934134090518593e-06,
541
+ "loss": 0.4837,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 3.8607594936708862,
546
+ "grad_norm": 8.521463394165039,
547
  "learning_rate": 9.207224843668732e-06,
548
+ "loss": 0.7003,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 3.911392405063291,
553
+ "grad_norm": 7.978651523590088,
554
  "learning_rate": 8.50188662803194e-06,
555
+ "loss": 0.5588,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 3.962025316455696,
560
+ "grad_norm": 6.872608184814453,
561
  "learning_rate": 7.819082850768434e-06,
562
+ "loss": 0.5483,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 4.0,
567
+ "grad_norm": 6.058796405792236,
568
  "learning_rate": 7.159746139706194e-06,
569
+ "loss": 0.4585,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 4.050632911392405,
574
+ "grad_norm": 5.086803913116455,
575
  "learning_rate": 6.524777069483526e-06,
576
+ "loss": 0.3461,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 4.10126582278481,
581
+ "grad_norm": 4.965209484100342,
582
  "learning_rate": 5.915042931472425e-06,
583
+ "loss": 0.4123,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 4.151898734177215,
588
+ "grad_norm": 4.652761459350586,
589
  "learning_rate": 5.33137654916292e-06,
590
+ "loss": 0.3055,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 4.2025316455696204,
595
+ "grad_norm": 3.384598731994629,
596
  "learning_rate": 4.7745751406263165e-06,
597
+ "loss": 0.2181,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 4.253164556962025,
602
+ "grad_norm": 4.2130818367004395,
603
  "learning_rate": 4.245399229611238e-06,
604
+ "loss": 0.3542,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 4.30379746835443,
609
+ "grad_norm": 4.708261489868164,
610
  "learning_rate": 3.7445716067596503e-06,
611
+ "loss": 0.4113,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 4.3544303797468356,
616
+ "grad_norm": 3.8586061000823975,
617
  "learning_rate": 3.2727763423617913e-06,
618
+ "loss": 0.1785,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 4.405063291139241,
623
+ "grad_norm": 4.425046920776367,
624
  "learning_rate": 2.8306578519984527e-06,
625
+ "loss": 0.3638,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 4.455696202531645,
630
+ "grad_norm": 4.765505790710449,
631
  "learning_rate": 2.418820016346779e-06,
632
+ "loss": 0.2206,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 4.506329113924051,
637
+ "grad_norm": 4.627501964569092,
638
  "learning_rate": 2.0378253563519247e-06,
639
+ "loss": 0.3092,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 4.556962025316456,
644
+ "grad_norm": 5.241892337799072,
645
  "learning_rate": 1.6881942648911076e-06,
646
+ "loss": 0.3072,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 4.6075949367088604,
651
+ "grad_norm": 4.68987512588501,
652
  "learning_rate": 1.3704042959795132e-06,
653
+ "loss": 0.4245,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 4.658227848101266,
658
+ "grad_norm": 3.880495071411133,
659
  "learning_rate": 1.0848895124889818e-06,
660
+ "loss": 0.3624,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 4.708860759493671,
665
+ "grad_norm": 5.35604190826416,
666
  "learning_rate": 8.320398932703144e-07,
667
+ "loss": 0.3626,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 4.759493670886076,
672
+ "grad_norm": 4.486222267150879,
673
  "learning_rate": 6.122008004890851e-07,
674
+ "loss": 0.2896,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 4.810126582278481,
679
+ "grad_norm": 4.400012016296387,
680
  "learning_rate": 4.256725079024554e-07,
681
+ "loss": 0.2279,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 4.860759493670886,
686
+ "grad_norm": 5.147950649261475,
687
  "learning_rate": 2.7270979072135104e-07,
688
+ "loss": 0.3981,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 4.911392405063291,
693
+ "grad_norm": 5.160162448883057,
694
  "learning_rate": 1.5352157761815977e-07,
695
+ "loss": 0.2999,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 4.962025316455696,
700
+ "grad_norm": 3.9566633701324463,
701
  "learning_rate": 6.827066535529946e-08,
702
+ "loss": 0.2816,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 5.0,
707
+ "grad_norm": 6.496578693389893,
708
  "learning_rate": 1.7073496424427348e-08,
709
+ "loss": 0.1943,
710
  "step": 100
711
  }
712
  ],
 
714
  "max_steps": 100,
715
  "num_input_tokens_seen": 0,
716
  "num_train_epochs": 5,
717
+ "save_steps": 20,
718
  "stateful_callbacks": {
719
  "TrainerControl": {
720
  "args": {
 
727
  "attributes": {}
728
  }
729
  },
730
+ "total_flos": 3.0794717131554816e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d2bf5cbde8ca6f3d82294680caf16a11f0007e85940b9ae50a642637f810a67
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec728dbde53c01fb89ffdc20ac760b3b4b343536639aa83df1e8f345df47df8
3
  size 6033