robertou2 commited on
Commit
6776136
·
verified ·
1 Parent(s): 176febd

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,21 +12,21 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 32,
16
  "lora_bias": false,
17
  "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 16,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
  "gate_up_proj",
27
- "qkv_proj",
28
  "o_proj",
29
- "down_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 48,
16
  "lora_bias": false,
17
  "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 96,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
  "gate_up_proj",
 
27
  "o_proj",
28
+ "down_proj",
29
+ "qkv_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91deaa0df19cb3a4603aed93b3ea53b071dfc7c0b2e4fdaaec06eb2240d2d81c
3
- size 92309112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354061e10299330b91e889824da13bec1303cd549f1396c485c21e63e79b6f6d
3
+ size 553683024
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18d99d58786c67973fe2041dabca67551378777424f0b36426db8c429e7ae955
3
- size 184765003
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3369b904cd237f36a8d77bf651116368a1a19b5f3db96249b7e6a9f9bb45e3d8
3
+ size 1107512523
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dbc0a5b32ad7d5de753e64fe048720f783b76e89c603c1c55e1c06734520c91
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0241384ebd3d15b1a8164991a445a993df2bd29e7024ac9c77da5909807a7c57
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:904ce4dc1d5cd57472f50779861a9053d20471c1dc9e146ec99c9316f40e0da7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da2a29d769a2c7c6ee654d830f0801577c6076f6346125ad041b3edd166bbc2
3
  size 1465
trainer_state.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
- "best_global_step": 99,
3
- "best_metric": 0.43201857805252075,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-90",
5
- "epoch": 5.2631578947368425,
6
  "eval_steps": 1,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05263157894736842,
14
- "grad_norm": 1.1712253093719482,
15
  "learning_rate": 0.0,
16
  "loss": 3.2235,
17
  "step": 1
@@ -19,1502 +19,827 @@
19
  {
20
  "epoch": 0.05263157894736842,
21
  "eval_loss": 3.15524959564209,
22
- "eval_runtime": 3.3832,
23
- "eval_samples_per_second": 8.867,
24
- "eval_steps_per_second": 1.182,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.10526315789473684,
29
- "grad_norm": 1.2426623106002808,
30
  "learning_rate": 3.3333333333333335e-05,
31
  "loss": 3.165,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.10526315789473684,
36
- "eval_loss": 3.1208913326263428,
37
- "eval_runtime": 3.3506,
38
- "eval_samples_per_second": 8.954,
39
- "eval_steps_per_second": 1.194,
40
  "step": 2
41
  },
42
  {
43
  "epoch": 0.15789473684210525,
44
- "grad_norm": 1.0169581174850464,
45
  "learning_rate": 6.666666666666667e-05,
46
- "loss": 2.8762,
47
  "step": 3
48
  },
49
  {
50
  "epoch": 0.15789473684210525,
51
- "eval_loss": 3.033876895904541,
52
- "eval_runtime": 3.3775,
53
- "eval_samples_per_second": 8.882,
54
- "eval_steps_per_second": 1.184,
55
  "step": 3
56
  },
57
  {
58
  "epoch": 0.21052631578947367,
59
- "grad_norm": 1.1397525072097778,
60
  "learning_rate": 0.0001,
61
- "loss": 3.0285,
62
  "step": 4
63
  },
64
  {
65
  "epoch": 0.21052631578947367,
66
- "eval_loss": 2.882239818572998,
67
- "eval_runtime": 3.3981,
68
- "eval_samples_per_second": 8.829,
69
- "eval_steps_per_second": 1.177,
70
  "step": 4
71
  },
72
  {
73
  "epoch": 0.2631578947368421,
74
- "grad_norm": 1.1276919841766357,
75
  "learning_rate": 0.00013333333333333334,
76
- "loss": 2.8059,
77
  "step": 5
78
  },
79
  {
80
  "epoch": 0.2631578947368421,
81
- "eval_loss": 2.671700954437256,
82
- "eval_runtime": 3.4282,
83
- "eval_samples_per_second": 8.751,
84
- "eval_steps_per_second": 1.167,
85
  "step": 5
86
  },
87
  {
88
  "epoch": 0.3157894736842105,
89
- "grad_norm": 1.1082642078399658,
90
  "learning_rate": 0.00016666666666666666,
91
- "loss": 2.5492,
92
  "step": 6
93
  },
94
  {
95
  "epoch": 0.3157894736842105,
96
- "eval_loss": 2.4450764656066895,
97
- "eval_runtime": 3.4466,
98
- "eval_samples_per_second": 8.704,
99
- "eval_steps_per_second": 1.161,
100
  "step": 6
101
  },
102
  {
103
  "epoch": 0.3684210526315789,
104
- "grad_norm": 1.0461392402648926,
105
  "learning_rate": 0.0002,
106
- "loss": 2.4397,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 0.3684210526315789,
111
- "eval_loss": 2.230668544769287,
112
- "eval_runtime": 3.4595,
113
- "eval_samples_per_second": 8.672,
114
- "eval_steps_per_second": 1.156,
115
  "step": 7
116
  },
117
  {
118
  "epoch": 0.42105263157894735,
119
- "grad_norm": 1.2636622190475464,
120
  "learning_rate": 0.00023333333333333333,
121
- "loss": 2.2026,
122
  "step": 8
123
  },
124
  {
125
  "epoch": 0.42105263157894735,
126
- "eval_loss": 2.028376340866089,
127
- "eval_runtime": 3.4413,
128
- "eval_samples_per_second": 8.718,
129
- "eval_steps_per_second": 1.162,
130
  "step": 8
131
  },
132
  {
133
  "epoch": 0.47368421052631576,
134
- "grad_norm": 1.4393274784088135,
135
  "learning_rate": 0.0002666666666666667,
136
- "loss": 2.0811,
137
  "step": 9
138
  },
139
  {
140
  "epoch": 0.47368421052631576,
141
- "eval_loss": 1.8259222507476807,
142
- "eval_runtime": 3.4225,
143
- "eval_samples_per_second": 8.765,
144
- "eval_steps_per_second": 1.169,
145
  "step": 9
146
  },
147
  {
148
  "epoch": 0.5263157894736842,
149
- "grad_norm": 1.4638570547103882,
150
  "learning_rate": 0.0003,
151
- "loss": 1.8383,
152
  "step": 10
153
  },
154
  {
155
  "epoch": 0.5263157894736842,
156
- "eval_loss": 1.6318742036819458,
157
- "eval_runtime": 3.4158,
158
- "eval_samples_per_second": 8.783,
159
- "eval_steps_per_second": 1.171,
160
  "step": 10
161
  },
162
  {
163
  "epoch": 0.5789473684210527,
164
- "grad_norm": 1.4234288930892944,
165
  "learning_rate": 0.0003333333333333333,
166
- "loss": 1.6943,
167
  "step": 11
168
  },
169
  {
170
  "epoch": 0.5789473684210527,
171
- "eval_loss": 1.4664249420166016,
172
- "eval_runtime": 3.385,
173
- "eval_samples_per_second": 8.863,
174
- "eval_steps_per_second": 1.182,
175
  "step": 11
176
  },
177
  {
178
  "epoch": 0.631578947368421,
179
- "grad_norm": 1.2770508527755737,
180
  "learning_rate": 0.00036666666666666667,
181
- "loss": 1.4634,
182
  "step": 12
183
  },
184
  {
185
  "epoch": 0.631578947368421,
186
- "eval_loss": 1.37418794631958,
187
- "eval_runtime": 3.3866,
188
- "eval_samples_per_second": 8.858,
189
- "eval_steps_per_second": 1.181,
190
  "step": 12
191
  },
192
  {
193
  "epoch": 0.6842105263157895,
194
- "grad_norm": 1.5616014003753662,
195
  "learning_rate": 0.0004,
196
- "loss": 1.4361,
197
  "step": 13
198
  },
199
  {
200
  "epoch": 0.6842105263157895,
201
- "eval_loss": 1.3023313283920288,
202
- "eval_runtime": 3.3859,
203
- "eval_samples_per_second": 8.86,
204
- "eval_steps_per_second": 1.181,
205
  "step": 13
206
  },
207
  {
208
  "epoch": 0.7368421052631579,
209
- "grad_norm": 1.475995421409607,
210
  "learning_rate": 0.00043333333333333337,
211
- "loss": 1.3218,
212
  "step": 14
213
  },
214
  {
215
  "epoch": 0.7368421052631579,
216
- "eval_loss": 1.237278699874878,
217
- "eval_runtime": 3.3787,
218
- "eval_samples_per_second": 8.879,
219
- "eval_steps_per_second": 1.184,
220
  "step": 14
221
  },
222
  {
223
  "epoch": 0.7894736842105263,
224
- "grad_norm": 1.1547696590423584,
225
  "learning_rate": 0.00046666666666666666,
226
- "loss": 1.3192,
227
  "step": 15
228
  },
229
  {
230
  "epoch": 0.7894736842105263,
231
- "eval_loss": 1.1772326231002808,
232
- "eval_runtime": 3.3856,
233
- "eval_samples_per_second": 8.861,
234
- "eval_steps_per_second": 1.181,
235
  "step": 15
236
  },
237
  {
238
  "epoch": 0.8421052631578947,
239
- "grad_norm": 0.8922737240791321,
240
  "learning_rate": 0.0005,
241
- "loss": 1.2386,
242
  "step": 16
243
  },
244
  {
245
  "epoch": 0.8421052631578947,
246
- "eval_loss": 1.1432918310165405,
247
- "eval_runtime": 3.3926,
248
- "eval_samples_per_second": 8.843,
249
- "eval_steps_per_second": 1.179,
250
  "step": 16
251
  },
252
  {
253
  "epoch": 0.8947368421052632,
254
- "grad_norm": 0.8703598380088806,
255
- "learning_rate": 0.0004999776608025946,
256
- "loss": 1.2852,
257
  "step": 17
258
  },
259
  {
260
  "epoch": 0.8947368421052632,
261
- "eval_loss": 1.1231766939163208,
262
- "eval_runtime": 3.401,
263
- "eval_samples_per_second": 8.821,
264
- "eval_steps_per_second": 1.176,
265
  "step": 17
266
  },
267
  {
268
  "epoch": 0.9473684210526315,
269
- "grad_norm": 0.8985245823860168,
270
- "learning_rate": 0.000499910647202696,
271
- "loss": 1.1268,
272
  "step": 18
273
  },
274
  {
275
  "epoch": 0.9473684210526315,
276
- "eval_loss": 1.0892575979232788,
277
- "eval_runtime": 3.4075,
278
- "eval_samples_per_second": 8.804,
279
- "eval_steps_per_second": 1.174,
280
  "step": 18
281
  },
282
  {
283
  "epoch": 1.0,
284
- "grad_norm": 0.705194890499115,
285
- "learning_rate": 0.0004997989711765446,
286
- "loss": 1.1925,
287
  "step": 19
288
  },
289
  {
290
  "epoch": 1.0,
291
- "eval_loss": 1.0620007514953613,
292
- "eval_runtime": 3.4023,
293
- "eval_samples_per_second": 8.817,
294
- "eval_steps_per_second": 1.176,
295
  "step": 19
296
  },
297
  {
298
  "epoch": 1.0526315789473684,
299
- "grad_norm": 0.6432715654373169,
300
- "learning_rate": 0.0004996426526821629,
301
- "loss": 1.0677,
302
  "step": 20
303
  },
304
  {
305
  "epoch": 1.0526315789473684,
306
- "eval_loss": 1.0364060401916504,
307
- "eval_runtime": 3.4021,
308
  "eval_samples_per_second": 8.818,
309
  "eval_steps_per_second": 1.176,
310
  "step": 20
311
  },
312
  {
313
  "epoch": 1.1052631578947367,
314
- "grad_norm": 0.5805476903915405,
315
- "learning_rate": 0.0004994417196557883,
316
- "loss": 1.0514,
317
  "step": 21
318
  },
319
  {
320
  "epoch": 1.1052631578947367,
321
- "eval_loss": 1.0189239978790283,
322
- "eval_runtime": 3.3998,
323
- "eval_samples_per_second": 8.824,
324
- "eval_steps_per_second": 1.177,
325
  "step": 21
326
  },
327
  {
328
  "epoch": 1.1578947368421053,
329
- "grad_norm": 0.5795720219612122,
330
- "learning_rate": 0.0004991962080068813,
331
- "loss": 1.0788,
332
  "step": 22
333
  },
334
  {
335
  "epoch": 1.1578947368421053,
336
- "eval_loss": 1.0024681091308594,
337
- "eval_runtime": 3.3973,
338
- "eval_samples_per_second": 8.831,
339
- "eval_steps_per_second": 1.177,
340
  "step": 22
341
  },
342
  {
343
  "epoch": 1.2105263157894737,
344
- "grad_norm": 0.7284250855445862,
345
- "learning_rate": 0.0004989061616117073,
346
- "loss": 0.9834,
347
  "step": 23
348
  },
349
  {
350
  "epoch": 1.2105263157894737,
351
- "eval_loss": 0.9821510910987854,
352
- "eval_runtime": 3.3979,
353
- "eval_samples_per_second": 8.829,
354
- "eval_steps_per_second": 1.177,
355
  "step": 23
356
  },
357
  {
358
  "epoch": 1.263157894736842,
359
- "grad_norm": 0.7955266833305359,
360
- "learning_rate": 0.0004985716323054959,
361
- "loss": 1.0999,
362
  "step": 24
363
  },
364
  {
365
  "epoch": 1.263157894736842,
366
- "eval_loss": 0.973588228225708,
367
- "eval_runtime": 3.3958,
368
- "eval_samples_per_second": 8.834,
369
- "eval_steps_per_second": 1.178,
370
  "step": 24
371
  },
372
  {
373
  "epoch": 1.3157894736842106,
374
- "grad_norm": 0.6546872854232788,
375
- "learning_rate": 0.0004981926798731766,
376
- "loss": 0.9389,
377
  "step": 25
378
  },
379
  {
380
  "epoch": 1.3157894736842106,
381
- "eval_loss": 0.9707676768302917,
382
- "eval_runtime": 3.4001,
383
- "eval_samples_per_second": 8.823,
384
- "eval_steps_per_second": 1.176,
385
  "step": 25
386
  },
387
  {
388
  "epoch": 1.368421052631579,
389
- "grad_norm": 0.6482366323471069,
390
- "learning_rate": 0.000497769372038695,
391
- "loss": 1.0285,
392
  "step": 26
393
  },
394
  {
395
  "epoch": 1.368421052631579,
396
- "eval_loss": 0.9686868190765381,
397
- "eval_runtime": 3.4003,
398
- "eval_samples_per_second": 8.823,
399
- "eval_steps_per_second": 1.176,
400
  "step": 26
401
  },
402
  {
403
  "epoch": 1.4210526315789473,
404
- "grad_norm": 0.5976347327232361,
405
- "learning_rate": 0.0004973017844529094,
406
- "loss": 0.9571,
407
  "step": 27
408
  },
409
  {
410
  "epoch": 1.4210526315789473,
411
- "eval_loss": 0.9679729342460632,
412
- "eval_runtime": 3.3978,
413
- "eval_samples_per_second": 8.829,
414
- "eval_steps_per_second": 1.177,
415
  "step": 27
416
  },
417
  {
418
  "epoch": 1.4736842105263157,
419
- "grad_norm": 0.6117852926254272,
420
- "learning_rate": 0.0004967900006800708,
421
- "loss": 0.8988,
422
  "step": 28
423
  },
424
  {
425
  "epoch": 1.4736842105263157,
426
- "eval_loss": 0.955328643321991,
427
- "eval_runtime": 3.3986,
428
- "eval_samples_per_second": 8.827,
429
- "eval_steps_per_second": 1.177,
430
  "step": 28
431
  },
432
  {
433
  "epoch": 1.526315789473684,
434
- "grad_norm": 0.8034415245056152,
435
- "learning_rate": 0.000496234112182889,
436
- "loss": 1.0419,
437
  "step": 29
438
  },
439
  {
440
  "epoch": 1.526315789473684,
441
- "eval_loss": 0.9429832696914673,
442
- "eval_runtime": 3.3993,
443
- "eval_samples_per_second": 8.825,
444
- "eval_steps_per_second": 1.177,
445
  "step": 29
446
  },
447
  {
448
  "epoch": 1.5789473684210527,
449
- "grad_norm": 0.6744455099105835,
450
- "learning_rate": 0.000495634218306187,
451
- "loss": 1.0113,
452
  "step": 30
453
  },
454
  {
455
  "epoch": 1.5789473684210527,
456
- "eval_loss": 0.9402546286582947,
457
- "eval_runtime": 3.4024,
458
- "eval_samples_per_second": 8.817,
459
- "eval_steps_per_second": 1.176,
460
  "step": 30
461
  },
462
  {
463
  "epoch": 1.631578947368421,
464
- "grad_norm": 0.8540083169937134,
465
- "learning_rate": 0.0004949904262591467,
466
- "loss": 0.9779,
467
  "step": 31
468
  },
469
  {
470
  "epoch": 1.631578947368421,
471
- "eval_loss": 0.9174972176551819,
472
- "eval_runtime": 3.3971,
473
- "eval_samples_per_second": 8.831,
474
- "eval_steps_per_second": 1.177,
475
  "step": 31
476
  },
477
  {
478
  "epoch": 1.6842105263157894,
479
- "grad_norm": 0.5661184787750244,
480
- "learning_rate": 0.0004943028510961491,
481
- "loss": 0.967,
482
  "step": 32
483
  },
484
  {
485
  "epoch": 1.6842105263157894,
486
- "eval_loss": 0.8996461629867554,
487
- "eval_runtime": 3.401,
488
- "eval_samples_per_second": 8.821,
489
- "eval_steps_per_second": 1.176,
490
  "step": 32
491
  },
492
  {
493
  "epoch": 1.736842105263158,
494
- "grad_norm": 0.6420716643333435,
495
- "learning_rate": 0.0004935716156962127,
496
- "loss": 1.0637,
497
  "step": 33
498
  },
499
  {
500
  "epoch": 1.736842105263158,
501
- "eval_loss": 0.8879114389419556,
502
- "eval_runtime": 3.3843,
503
- "eval_samples_per_second": 8.865,
504
- "eval_steps_per_second": 1.182,
505
  "step": 33
506
  },
507
  {
508
  "epoch": 1.7894736842105263,
509
- "grad_norm": 0.5820953249931335,
510
- "learning_rate": 0.000492796850741033,
511
- "loss": 0.9406,
512
  "step": 34
513
  },
514
  {
515
  "epoch": 1.7894736842105263,
516
- "eval_loss": 0.8790176510810852,
517
- "eval_runtime": 3.3978,
518
- "eval_samples_per_second": 8.829,
519
- "eval_steps_per_second": 1.177,
520
  "step": 34
521
  },
522
  {
523
  "epoch": 1.8421052631578947,
524
- "grad_norm": 0.5555437207221985,
525
- "learning_rate": 0.0004919786946916281,
526
- "loss": 0.9973,
527
  "step": 35
528
  },
529
  {
530
  "epoch": 1.8421052631578947,
531
- "eval_loss": 0.8706895112991333,
532
- "eval_runtime": 3.4025,
533
- "eval_samples_per_second": 8.817,
534
- "eval_steps_per_second": 1.176,
535
  "step": 35
536
  },
537
  {
538
  "epoch": 1.8947368421052633,
539
- "grad_norm": 0.6944723129272461,
540
- "learning_rate": 0.0004911172937635942,
541
- "loss": 0.9624,
542
  "step": 36
543
  },
544
  {
545
  "epoch": 1.8947368421052633,
546
- "eval_loss": 0.8582616448402405,
547
- "eval_runtime": 3.3987,
548
- "eval_samples_per_second": 8.827,
549
- "eval_steps_per_second": 1.177,
550
  "step": 36
551
  },
552
  {
553
  "epoch": 1.9473684210526314,
554
- "grad_norm": 0.516936182975769,
555
- "learning_rate": 0.0004902128019009741,
556
- "loss": 1.0242,
557
  "step": 37
558
  },
559
  {
560
  "epoch": 1.9473684210526314,
561
- "eval_loss": 0.8482629060745239,
562
- "eval_runtime": 3.3973,
563
- "eval_samples_per_second": 8.83,
564
- "eval_steps_per_second": 1.177,
565
  "step": 37
566
  },
567
  {
568
  "epoch": 2.0,
569
- "grad_norm": 0.6250211596488953,
570
- "learning_rate": 0.000489265380748746,
571
- "loss": 1.0646,
572
  "step": 38
573
  },
574
  {
575
  "epoch": 2.0,
576
- "eval_loss": 0.8396931290626526,
577
- "eval_runtime": 3.3968,
578
- "eval_samples_per_second": 8.832,
579
- "eval_steps_per_second": 1.178,
580
  "step": 38
581
  },
582
  {
583
  "epoch": 2.0526315789473686,
584
- "grad_norm": 0.6457982659339905,
585
- "learning_rate": 0.0004882751996239352,
586
- "loss": 0.9107,
587
  "step": 39
588
  },
589
  {
590
  "epoch": 2.0526315789473686,
591
- "eval_loss": 0.8291558027267456,
592
- "eval_runtime": 3.3988,
593
- "eval_samples_per_second": 8.827,
594
- "eval_steps_per_second": 1.177,
595
  "step": 39
596
  },
597
  {
598
  "epoch": 2.1052631578947367,
599
- "grad_norm": 0.49637654423713684,
600
- "learning_rate": 0.0004872424354853545,
601
- "loss": 0.8729,
602
  "step": 40
603
  },
604
  {
605
  "epoch": 2.1052631578947367,
606
- "eval_loss": 0.8166154026985168,
607
- "eval_runtime": 3.3997,
608
- "eval_samples_per_second": 8.824,
609
- "eval_steps_per_second": 1.177,
610
  "step": 40
611
  },
612
  {
613
  "epoch": 2.1578947368421053,
614
- "grad_norm": 0.6060866713523865,
615
- "learning_rate": 0.0004861672729019797,
616
- "loss": 0.8154,
617
  "step": 41
618
  },
619
  {
620
  "epoch": 2.1578947368421053,
621
- "eval_loss": 0.8058971762657166,
622
- "eval_runtime": 3.3964,
623
- "eval_samples_per_second": 8.833,
624
- "eval_steps_per_second": 1.178,
625
  "step": 41
626
  },
627
  {
628
  "epoch": 2.2105263157894735,
629
- "grad_norm": 0.5285487771034241,
630
- "learning_rate": 0.0004850499040199643,
631
- "loss": 0.7798,
632
  "step": 42
633
  },
634
  {
635
  "epoch": 2.2105263157894735,
636
- "eval_loss": 0.7971588969230652,
637
- "eval_runtime": 3.4012,
638
- "eval_samples_per_second": 8.82,
639
- "eval_steps_per_second": 1.176,
640
  "step": 42
641
  },
642
  {
643
  "epoch": 2.263157894736842,
644
- "grad_norm": 0.7103962898254395,
645
- "learning_rate": 0.0004838905285283005,
646
- "loss": 0.9025,
647
  "step": 43
648
  },
649
  {
650
  "epoch": 2.263157894736842,
651
- "eval_loss": 0.7828482985496521,
652
- "eval_runtime": 3.4,
653
- "eval_samples_per_second": 8.824,
654
- "eval_steps_per_second": 1.176,
655
  "step": 43
656
  },
657
  {
658
  "epoch": 2.3157894736842106,
659
- "grad_norm": 0.6385390758514404,
660
- "learning_rate": 0.00048268935362313215,
661
- "loss": 0.8484,
662
  "step": 44
663
  },
664
  {
665
  "epoch": 2.3157894736842106,
666
- "eval_loss": 0.7740622758865356,
667
- "eval_runtime": 3.4013,
668
- "eval_samples_per_second": 8.82,
669
  "eval_steps_per_second": 1.176,
670
  "step": 44
671
  },
672
  {
673
  "epoch": 2.3684210526315788,
674
- "grad_norm": 0.6478577852249146,
675
- "learning_rate": 0.00048144659397072586,
676
- "loss": 0.794,
677
  "step": 45
678
  },
679
  {
680
  "epoch": 2.3684210526315788,
681
- "eval_loss": 0.7711488604545593,
682
- "eval_runtime": 3.4029,
683
- "eval_samples_per_second": 8.816,
684
- "eval_steps_per_second": 1.175,
685
  "step": 45
686
  },
687
  {
688
  "epoch": 2.4210526315789473,
689
- "grad_norm": 0.6230824589729309,
690
- "learning_rate": 0.0004801624716691072,
691
- "loss": 0.8394,
692
  "step": 46
693
  },
694
  {
695
  "epoch": 2.4210526315789473,
696
- "eval_loss": 0.7640188932418823,
697
- "eval_runtime": 3.3993,
698
- "eval_samples_per_second": 8.825,
699
- "eval_steps_per_second": 1.177,
700
  "step": 46
701
  },
702
  {
703
  "epoch": 2.473684210526316,
704
- "grad_norm": 0.5779664516448975,
705
- "learning_rate": 0.00047883721620836894,
706
- "loss": 0.7857,
707
  "step": 47
708
  },
709
  {
710
  "epoch": 2.473684210526316,
711
- "eval_loss": 0.758138120174408,
712
- "eval_runtime": 3.3991,
713
- "eval_samples_per_second": 8.826,
714
- "eval_steps_per_second": 1.177,
715
  "step": 47
716
  },
717
  {
718
  "epoch": 2.526315789473684,
719
- "grad_norm": 0.5758649110794067,
720
- "learning_rate": 0.0004774710644296578,
721
- "loss": 0.7685,
722
  "step": 48
723
  },
724
  {
725
  "epoch": 2.526315789473684,
726
- "eval_loss": 0.7491741180419922,
727
- "eval_runtime": 3.4037,
728
- "eval_samples_per_second": 8.814,
729
- "eval_steps_per_second": 1.175,
730
  "step": 48
731
  },
732
  {
733
  "epoch": 2.5789473684210527,
734
- "grad_norm": 0.7427331805229187,
735
- "learning_rate": 0.00047606426048284813,
736
- "loss": 0.8529,
737
  "step": 49
738
  },
739
  {
740
  "epoch": 2.5789473684210527,
741
- "eval_loss": 0.7381884455680847,
742
- "eval_runtime": 3.3985,
743
- "eval_samples_per_second": 8.827,
744
  "eval_steps_per_second": 1.177,
745
  "step": 49
746
  },
747
  {
748
  "epoch": 2.6315789473684212,
749
- "grad_norm": 0.5156267285346985,
750
- "learning_rate": 0.00047461705578290833,
751
- "loss": 0.7453,
752
  "step": 50
753
  },
754
  {
755
  "epoch": 2.6315789473684212,
756
- "eval_loss": 0.735011637210846,
757
- "eval_runtime": 3.3975,
758
- "eval_samples_per_second": 8.83,
759
- "eval_steps_per_second": 1.177,
760
  "step": 50
761
  },
762
  {
763
  "epoch": 2.6842105263157894,
764
- "grad_norm": 0.5465694665908813,
765
- "learning_rate": 0.0004731297089649703,
766
- "loss": 0.7681,
767
  "step": 51
768
  },
769
  {
770
  "epoch": 2.6842105263157894,
771
- "eval_loss": 0.7380778193473816,
772
- "eval_runtime": 3.3945,
773
- "eval_samples_per_second": 8.838,
774
- "eval_steps_per_second": 1.178,
775
  "step": 51
776
  },
777
  {
778
  "epoch": 2.736842105263158,
779
- "grad_norm": 0.5591109991073608,
780
- "learning_rate": 0.0004716024858381075,
781
- "loss": 0.7583,
782
  "step": 52
783
  },
784
  {
785
  "epoch": 2.736842105263158,
786
- "eval_loss": 0.735223650932312,
787
- "eval_runtime": 3.3923,
788
- "eval_samples_per_second": 8.844,
789
- "eval_steps_per_second": 1.179,
790
  "step": 52
791
  },
792
  {
793
  "epoch": 2.7894736842105265,
794
- "grad_norm": 0.6300053596496582,
795
- "learning_rate": 0.00047003565933783123,
796
- "loss": 0.8622,
797
  "step": 53
798
  },
799
  {
800
  "epoch": 2.7894736842105265,
801
- "eval_loss": 0.7290965914726257,
802
- "eval_runtime": 3.4013,
803
- "eval_samples_per_second": 8.82,
804
- "eval_steps_per_second": 1.176,
805
  "step": 53
806
  },
807
  {
808
  "epoch": 2.8421052631578947,
809
- "grad_norm": 0.6577848792076111,
810
- "learning_rate": 0.0004684295094773134,
811
- "loss": 0.7678,
812
  "step": 54
813
  },
814
  {
815
  "epoch": 2.8421052631578947,
816
- "eval_loss": 0.7240878343582153,
817
- "eval_runtime": 3.3989,
818
- "eval_samples_per_second": 8.826,
819
- "eval_steps_per_second": 1.177,
820
  "step": 54
821
  },
822
  {
823
  "epoch": 2.8947368421052633,
824
- "grad_norm": 0.48959189653396606,
825
- "learning_rate": 0.00046678432329734434,
826
- "loss": 0.7592,
827
  "step": 55
828
  },
829
  {
830
  "epoch": 2.8947368421052633,
831
- "eval_loss": 0.7289024591445923,
832
- "eval_runtime": 3.4003,
833
- "eval_samples_per_second": 8.823,
834
- "eval_steps_per_second": 1.176,
835
  "step": 55
836
- },
837
- {
838
- "epoch": 2.9473684210526314,
839
- "grad_norm": 0.6378675699234009,
840
- "learning_rate": 0.00046510039481503486,
841
- "loss": 0.8768,
842
- "step": 56
843
- },
844
- {
845
- "epoch": 2.9473684210526314,
846
- "eval_loss": 0.7245283722877502,
847
- "eval_runtime": 3.4071,
848
- "eval_samples_per_second": 8.805,
849
- "eval_steps_per_second": 1.174,
850
- "step": 56
851
- },
852
- {
853
- "epoch": 3.0,
854
- "grad_norm": 0.533486545085907,
855
- "learning_rate": 0.00046337802497127117,
856
- "loss": 0.8078,
857
- "step": 57
858
- },
859
- {
860
- "epoch": 3.0,
861
- "eval_loss": 0.7103175520896912,
862
- "eval_runtime": 3.4012,
863
- "eval_samples_per_second": 8.821,
864
- "eval_steps_per_second": 1.176,
865
- "step": 57
866
- },
867
- {
868
- "epoch": 3.0526315789473686,
869
- "grad_norm": 0.5410111546516418,
870
- "learning_rate": 0.00046161752157693284,
871
- "loss": 0.7147,
872
- "step": 58
873
- },
874
- {
875
- "epoch": 3.0526315789473686,
876
- "eval_loss": 0.6982213258743286,
877
- "eval_runtime": 3.3871,
878
- "eval_samples_per_second": 8.857,
879
- "eval_steps_per_second": 1.181,
880
- "step": 58
881
- },
882
- {
883
- "epoch": 3.1052631578947367,
884
- "grad_norm": 0.5490122437477112,
885
- "learning_rate": 0.0004598191992578828,
886
- "loss": 0.7584,
887
- "step": 59
888
- },
889
- {
890
- "epoch": 3.1052631578947367,
891
- "eval_loss": 0.6866177320480347,
892
- "eval_runtime": 3.3873,
893
- "eval_samples_per_second": 8.857,
894
- "eval_steps_per_second": 1.181,
895
- "step": 59
896
- },
897
- {
898
- "epoch": 3.1578947368421053,
899
- "grad_norm": 0.49469825625419617,
900
- "learning_rate": 0.00045798337939873923,
901
- "loss": 0.7261,
902
- "step": 60
903
- },
904
- {
905
- "epoch": 3.1578947368421053,
906
- "eval_loss": 0.6730698943138123,
907
- "eval_runtime": 3.3973,
908
- "eval_samples_per_second": 8.83,
909
- "eval_steps_per_second": 1.177,
910
- "step": 60
911
- },
912
- {
913
- "epoch": 3.2105263157894735,
914
- "grad_norm": 0.8399549126625061,
915
- "learning_rate": 0.0004561103900854401,
916
- "loss": 0.6503,
917
- "step": 61
918
- },
919
- {
920
- "epoch": 3.2105263157894735,
921
- "eval_loss": 0.6618488430976868,
922
- "eval_runtime": 3.3947,
923
- "eval_samples_per_second": 8.837,
924
- "eval_steps_per_second": 1.178,
925
- "step": 61
926
- },
927
- {
928
- "epoch": 3.263157894736842,
929
- "grad_norm": 0.5458311438560486,
930
- "learning_rate": 0.0004542005660466094,
931
- "loss": 0.7217,
932
- "step": 62
933
- },
934
- {
935
- "epoch": 3.263157894736842,
936
- "eval_loss": 0.6508110761642456,
937
- "eval_runtime": 3.4003,
938
- "eval_samples_per_second": 8.823,
939
- "eval_steps_per_second": 1.176,
940
- "step": 62
941
- },
942
- {
943
- "epoch": 3.3157894736842106,
944
- "grad_norm": 0.9009385704994202,
945
- "learning_rate": 0.0004522542485937369,
946
- "loss": 0.6747,
947
- "step": 63
948
- },
949
- {
950
- "epoch": 3.3157894736842106,
951
- "eval_loss": 0.6464059948921204,
952
- "eval_runtime": 3.4046,
953
- "eval_samples_per_second": 8.812,
954
- "eval_steps_per_second": 1.175,
955
- "step": 63
956
- },
957
- {
958
- "epoch": 3.3684210526315788,
959
- "grad_norm": 0.5399370193481445,
960
- "learning_rate": 0.0004502717855601809,
961
- "loss": 0.6838,
962
- "step": 64
963
- },
964
- {
965
- "epoch": 3.3684210526315788,
966
- "eval_loss": 0.6449176669120789,
967
- "eval_runtime": 3.3903,
968
- "eval_samples_per_second": 8.849,
969
- "eval_steps_per_second": 1.18,
970
- "step": 64
971
- },
972
- {
973
- "epoch": 3.4210526315789473,
974
- "grad_norm": 0.664746880531311,
975
- "learning_rate": 0.0004482535312390058,
976
- "loss": 0.6601,
977
- "step": 65
978
- },
979
- {
980
- "epoch": 3.4210526315789473,
981
- "eval_loss": 0.6410928964614868,
982
- "eval_runtime": 3.3948,
983
- "eval_samples_per_second": 8.837,
984
- "eval_steps_per_second": 1.178,
985
- "step": 65
986
- },
987
- {
988
- "epoch": 3.473684210526316,
989
- "grad_norm": 0.7200000882148743,
990
- "learning_rate": 0.00044619984631966527,
991
- "loss": 0.5722,
992
- "step": 66
993
- },
994
- {
995
- "epoch": 3.473684210526316,
996
- "eval_loss": 0.6338309645652771,
997
- "eval_runtime": 3.3867,
998
- "eval_samples_per_second": 8.858,
999
- "eval_steps_per_second": 1.181,
1000
- "step": 66
1001
- },
1002
- {
1003
- "epoch": 3.526315789473684,
1004
- "grad_norm": 0.8224210739135742,
1005
- "learning_rate": 0.0004441110978235418,
1006
- "loss": 0.6984,
1007
- "step": 67
1008
- },
1009
- {
1010
- "epoch": 3.526315789473684,
1011
- "eval_loss": 0.6232346892356873,
1012
- "eval_runtime": 3.3872,
1013
- "eval_samples_per_second": 8.857,
1014
- "eval_steps_per_second": 1.181,
1015
- "step": 67
1016
- },
1017
- {
1018
- "epoch": 3.5789473684210527,
1019
- "grad_norm": 0.6948024034500122,
1020
- "learning_rate": 0.0004419876590383554,
1021
- "loss": 0.6921,
1022
- "step": 68
1023
- },
1024
- {
1025
- "epoch": 3.5789473684210527,
1026
- "eval_loss": 0.6190816164016724,
1027
- "eval_runtime": 3.4096,
1028
- "eval_samples_per_second": 8.799,
1029
- "eval_steps_per_second": 1.173,
1030
- "step": 68
1031
- },
1032
- {
1033
- "epoch": 3.6315789473684212,
1034
- "grad_norm": 0.5954806804656982,
1035
- "learning_rate": 0.00043982990945145146,
1036
- "loss": 0.6452,
1037
- "step": 69
1038
- },
1039
- {
1040
- "epoch": 3.6315789473684212,
1041
- "eval_loss": 0.6215729117393494,
1042
- "eval_runtime": 3.4023,
1043
- "eval_samples_per_second": 8.818,
1044
- "eval_steps_per_second": 1.176,
1045
- "step": 69
1046
- },
1047
- {
1048
- "epoch": 3.6842105263157894,
1049
- "grad_norm": 0.6146106719970703,
1050
- "learning_rate": 0.0004376382346819819,
1051
- "loss": 0.6753,
1052
- "step": 70
1053
- },
1054
- {
1055
- "epoch": 3.6842105263157894,
1056
- "eval_loss": 0.616372287273407,
1057
- "eval_runtime": 3.4004,
1058
- "eval_samples_per_second": 8.822,
1059
- "eval_steps_per_second": 1.176,
1060
- "step": 70
1061
- },
1062
- {
1063
- "epoch": 3.736842105263158,
1064
- "grad_norm": 0.6286161541938782,
1065
- "learning_rate": 0.00043541302641198946,
1066
- "loss": 0.7126,
1067
- "step": 71
1068
- },
1069
- {
1070
- "epoch": 3.736842105263158,
1071
- "eval_loss": 0.6052109599113464,
1072
- "eval_runtime": 3.3873,
1073
- "eval_samples_per_second": 8.857,
1074
- "eval_steps_per_second": 1.181,
1075
- "step": 71
1076
- },
1077
- {
1078
- "epoch": 3.7894736842105265,
1079
- "grad_norm": 0.5700982213020325,
1080
- "learning_rate": 0.00043315468231640834,
1081
- "loss": 0.6126,
1082
- "step": 72
1083
- },
1084
- {
1085
- "epoch": 3.7894736842105265,
1086
- "eval_loss": 0.6031004786491394,
1087
- "eval_runtime": 3.3922,
1088
- "eval_samples_per_second": 8.844,
1089
- "eval_steps_per_second": 1.179,
1090
- "step": 72
1091
- },
1092
- {
1093
- "epoch": 3.8421052631578947,
1094
- "grad_norm": 0.8683762550354004,
1095
- "learning_rate": 0.00043086360599199516,
1096
- "loss": 0.7278,
1097
- "step": 73
1098
- },
1099
- {
1100
- "epoch": 3.8421052631578947,
1101
- "eval_loss": 0.5932725667953491,
1102
- "eval_runtime": 3.3962,
1103
- "eval_samples_per_second": 8.833,
1104
- "eval_steps_per_second": 1.178,
1105
- "step": 73
1106
- },
1107
- {
1108
- "epoch": 3.8947368421052633,
1109
- "grad_norm": 0.8634172081947327,
1110
- "learning_rate": 0.0004285402068852002,
1111
- "loss": 0.6826,
1112
- "step": 74
1113
- },
1114
- {
1115
- "epoch": 3.8947368421052633,
1116
- "eval_loss": 0.5909937620162964,
1117
- "eval_runtime": 3.3983,
1118
- "eval_samples_per_second": 8.828,
1119
- "eval_steps_per_second": 1.177,
1120
- "step": 74
1121
- },
1122
- {
1123
- "epoch": 3.9473684210526314,
1124
- "grad_norm": 0.556474506855011,
1125
- "learning_rate": 0.00042618490021899383,
1126
- "loss": 0.65,
1127
- "step": 75
1128
- },
1129
- {
1130
- "epoch": 3.9473684210526314,
1131
- "eval_loss": 0.5868418216705322,
1132
- "eval_runtime": 3.399,
1133
- "eval_samples_per_second": 8.826,
1134
- "eval_steps_per_second": 1.177,
1135
- "step": 75
1136
- },
1137
- {
1138
- "epoch": 4.0,
1139
- "grad_norm": 0.5346130728721619,
1140
- "learning_rate": 0.00042379810691866064,
1141
- "loss": 0.6475,
1142
- "step": 76
1143
- },
1144
- {
1145
- "epoch": 4.0,
1146
- "eval_loss": 0.588336706161499,
1147
- "eval_runtime": 3.3932,
1148
- "eval_samples_per_second": 8.841,
1149
- "eval_steps_per_second": 1.179,
1150
- "step": 76
1151
- },
1152
- {
1153
- "epoch": 4.052631578947368,
1154
- "grad_norm": 0.4865156412124634,
1155
- "learning_rate": 0.00042138025353657407,
1156
- "loss": 0.5485,
1157
- "step": 77
1158
- },
1159
- {
1160
- "epoch": 4.052631578947368,
1161
- "eval_loss": 0.5785155892372131,
1162
- "eval_runtime": 3.3941,
1163
- "eval_samples_per_second": 8.839,
1164
- "eval_steps_per_second": 1.179,
1165
- "step": 77
1166
- },
1167
- {
1168
- "epoch": 4.105263157894737,
1169
- "grad_norm": 0.5607722997665405,
1170
- "learning_rate": 0.00041893177217596633,
1171
- "loss": 0.5699,
1172
- "step": 78
1173
- },
1174
- {
1175
- "epoch": 4.105263157894737,
1176
- "eval_loss": 0.5646374821662903,
1177
- "eval_runtime": 3.3965,
1178
- "eval_samples_per_second": 8.833,
1179
- "eval_steps_per_second": 1.178,
1180
- "step": 78
1181
- },
1182
- {
1183
- "epoch": 4.157894736842105,
1184
- "grad_norm": 0.5337282419204712,
1185
- "learning_rate": 0.0004164531004137049,
1186
- "loss": 0.5308,
1187
- "step": 79
1188
- },
1189
- {
1190
- "epoch": 4.157894736842105,
1191
- "eval_loss": 0.5542218685150146,
1192
- "eval_runtime": 3.3924,
1193
- "eval_samples_per_second": 8.843,
1194
- "eval_steps_per_second": 1.179,
1195
- "step": 79
1196
- },
1197
- {
1198
- "epoch": 4.2105263157894735,
1199
- "grad_norm": 1.7681509256362915,
1200
- "learning_rate": 0.0004139446812220924,
1201
- "loss": 0.5458,
1202
- "step": 80
1203
- },
1204
- {
1205
- "epoch": 4.2105263157894735,
1206
- "eval_loss": 0.5494810938835144,
1207
- "eval_runtime": 3.3951,
1208
- "eval_samples_per_second": 8.836,
1209
- "eval_steps_per_second": 1.178,
1210
- "step": 80
1211
- },
1212
- {
1213
- "epoch": 4.2631578947368425,
1214
- "grad_norm": 0.8153849244117737,
1215
- "learning_rate": 0.0004114069628897006,
1216
- "loss": 0.592,
1217
- "step": 81
1218
- },
1219
- {
1220
- "epoch": 4.2631578947368425,
1221
- "eval_loss": 0.5404940843582153,
1222
- "eval_runtime": 3.3937,
1223
- "eval_samples_per_second": 8.84,
1224
- "eval_steps_per_second": 1.179,
1225
- "step": 81
1226
- },
1227
- {
1228
- "epoch": 4.315789473684211,
1229
- "grad_norm": 0.7037251591682434,
1230
- "learning_rate": 0.0004088403989412559,
1231
- "loss": 0.579,
1232
- "step": 82
1233
- },
1234
- {
1235
- "epoch": 4.315789473684211,
1236
- "eval_loss": 0.530238926410675,
1237
- "eval_runtime": 3.3957,
1238
- "eval_samples_per_second": 8.835,
1239
- "eval_steps_per_second": 1.178,
1240
- "step": 82
1241
- },
1242
- {
1243
- "epoch": 4.368421052631579,
1244
- "grad_norm": 0.6703127026557922,
1245
- "learning_rate": 0.00040624544805658794,
1246
- "loss": 0.5513,
1247
- "step": 83
1248
- },
1249
- {
1250
- "epoch": 4.368421052631579,
1251
- "eval_loss": 0.5282605290412903,
1252
- "eval_runtime": 3.4012,
1253
- "eval_samples_per_second": 8.82,
1254
- "eval_steps_per_second": 1.176,
1255
- "step": 83
1256
- },
1257
- {
1258
- "epoch": 4.421052631578947,
1259
- "grad_norm": 0.7324157357215881,
1260
- "learning_rate": 0.00040362257398865713,
1261
- "loss": 0.6175,
1262
- "step": 84
1263
- },
1264
- {
1265
- "epoch": 4.421052631578947,
1266
- "eval_loss": 0.5271756052970886,
1267
- "eval_runtime": 3.3941,
1268
- "eval_samples_per_second": 8.839,
1269
- "eval_steps_per_second": 1.179,
1270
- "step": 84
1271
- },
1272
- {
1273
- "epoch": 4.473684210526316,
1274
- "grad_norm": 0.7354516386985779,
1275
- "learning_rate": 0.00040097224548067613,
1276
- "loss": 0.5497,
1277
- "step": 85
1278
- },
1279
- {
1280
- "epoch": 4.473684210526316,
1281
- "eval_loss": 0.5268288850784302,
1282
- "eval_runtime": 3.397,
1283
- "eval_samples_per_second": 8.831,
1284
- "eval_steps_per_second": 1.177,
1285
- "step": 85
1286
- },
1287
- {
1288
- "epoch": 4.526315789473684,
1289
- "grad_norm": 0.6430884599685669,
1290
- "learning_rate": 0.0003982949361823388,
1291
- "loss": 0.5323,
1292
- "step": 86
1293
- },
1294
- {
1295
- "epoch": 4.526315789473684,
1296
- "eval_loss": 0.5271150469779968,
1297
- "eval_runtime": 3.4081,
1298
- "eval_samples_per_second": 8.803,
1299
- "eval_steps_per_second": 1.174,
1300
- "step": 86
1301
- },
1302
- {
1303
- "epoch": 4.578947368421053,
1304
- "grad_norm": 0.6861183643341064,
1305
- "learning_rate": 0.0003955911245651726,
1306
- "loss": 0.555,
1307
- "step": 87
1308
- },
1309
- {
1310
- "epoch": 4.578947368421053,
1311
- "eval_loss": 0.5218092799186707,
1312
- "eval_runtime": 3.3947,
1313
- "eval_samples_per_second": 8.837,
1314
- "eval_steps_per_second": 1.178,
1315
- "step": 87
1316
- },
1317
- {
1318
- "epoch": 4.631578947368421,
1319
- "grad_norm": 0.6339515447616577,
1320
- "learning_rate": 0.0003928612938370292,
1321
- "loss": 0.5396,
1322
- "step": 88
1323
- },
1324
- {
1325
- "epoch": 4.631578947368421,
1326
- "eval_loss": 0.5187237858772278,
1327
- "eval_runtime": 3.3968,
1328
- "eval_samples_per_second": 8.832,
1329
- "eval_steps_per_second": 1.178,
1330
- "step": 88
1331
- },
1332
- {
1333
- "epoch": 4.684210526315789,
1334
- "grad_norm": 0.5840083360671997,
1335
- "learning_rate": 0.00039010593185572867,
1336
- "loss": 0.5043,
1337
- "step": 89
1338
- },
1339
- {
1340
- "epoch": 4.684210526315789,
1341
- "eval_loss": 0.5117171406745911,
1342
- "eval_runtime": 3.3945,
1343
- "eval_samples_per_second": 8.838,
1344
- "eval_steps_per_second": 1.178,
1345
- "step": 89
1346
- },
1347
- {
1348
- "epoch": 4.7368421052631575,
1349
- "grad_norm": 0.6243887543678284,
1350
- "learning_rate": 0.00038732553104187296,
1351
- "loss": 0.4985,
1352
- "step": 90
1353
- },
1354
- {
1355
- "epoch": 4.7368421052631575,
1356
- "eval_loss": 0.5013009905815125,
1357
- "eval_runtime": 3.3983,
1358
- "eval_samples_per_second": 8.828,
1359
- "eval_steps_per_second": 1.177,
1360
- "step": 90
1361
- },
1362
- {
1363
- "epoch": 4.7894736842105265,
1364
- "grad_norm": 0.7383096814155579,
1365
- "learning_rate": 0.0003845205882908432,
1366
- "loss": 0.5446,
1367
- "step": 91
1368
- },
1369
- {
1370
- "epoch": 4.7894736842105265,
1371
- "eval_loss": 0.48944994807243347,
1372
- "eval_runtime": 3.3912,
1373
- "eval_samples_per_second": 8.846,
1374
- "eval_steps_per_second": 1.18,
1375
- "step": 91
1376
- },
1377
- {
1378
- "epoch": 4.842105263157895,
1379
- "grad_norm": 0.7017186880111694,
1380
- "learning_rate": 0.0003816916048839979,
1381
- "loss": 0.4855,
1382
- "step": 92
1383
- },
1384
- {
1385
- "epoch": 4.842105263157895,
1386
- "eval_loss": 0.490288108587265,
1387
- "eval_runtime": 3.392,
1388
- "eval_samples_per_second": 8.844,
1389
- "eval_steps_per_second": 1.179,
1390
- "step": 92
1391
- },
1392
- {
1393
- "epoch": 4.894736842105263,
1394
- "grad_norm": 0.803577184677124,
1395
- "learning_rate": 0.0003788390863990875,
1396
- "loss": 0.599,
1397
- "step": 93
1398
- },
1399
- {
1400
- "epoch": 4.894736842105263,
1401
- "eval_loss": 0.48545849323272705,
1402
- "eval_runtime": 3.3984,
1403
- "eval_samples_per_second": 8.828,
1404
- "eval_steps_per_second": 1.177,
1405
- "step": 93
1406
- },
1407
- {
1408
- "epoch": 4.947368421052632,
1409
- "grad_norm": 0.719249963760376,
1410
- "learning_rate": 0.00037596354261990007,
1411
- "loss": 0.5539,
1412
- "step": 94
1413
- },
1414
- {
1415
- "epoch": 4.947368421052632,
1416
- "eval_loss": 0.4850545823574066,
1417
- "eval_runtime": 3.4015,
1418
- "eval_samples_per_second": 8.82,
1419
- "eval_steps_per_second": 1.176,
1420
- "step": 94
1421
- },
1422
- {
1423
- "epoch": 5.0,
1424
- "grad_norm": 0.7983654141426086,
1425
- "learning_rate": 0.0003730654874451569,
1426
- "loss": 0.5899,
1427
- "step": 95
1428
- },
1429
- {
1430
- "epoch": 5.0,
1431
- "eval_loss": 0.47937095165252686,
1432
- "eval_runtime": 3.4007,
1433
- "eval_samples_per_second": 8.822,
1434
- "eval_steps_per_second": 1.176,
1435
- "step": 95
1436
- },
1437
- {
1438
- "epoch": 5.052631578947368,
1439
- "grad_norm": 0.6120598316192627,
1440
- "learning_rate": 0.00037014543879667093,
1441
- "loss": 0.4219,
1442
- "step": 96
1443
- },
1444
- {
1445
- "epoch": 5.052631578947368,
1446
- "eval_loss": 0.46941977739334106,
1447
- "eval_runtime": 3.3985,
1448
- "eval_samples_per_second": 8.827,
1449
- "eval_steps_per_second": 1.177,
1450
- "step": 96
1451
- },
1452
- {
1453
- "epoch": 5.105263157894737,
1454
- "grad_norm": 0.7291161417961121,
1455
- "learning_rate": 0.0003672039185267878,
1456
- "loss": 0.5002,
1457
- "step": 97
1458
- },
1459
- {
1460
- "epoch": 5.105263157894737,
1461
- "eval_loss": 0.45138782262802124,
1462
- "eval_runtime": 3.4042,
1463
- "eval_samples_per_second": 8.813,
1464
- "eval_steps_per_second": 1.175,
1465
- "step": 97
1466
- },
1467
- {
1468
- "epoch": 5.157894736842105,
1469
- "grad_norm": 0.5574305057525635,
1470
- "learning_rate": 0.00036424145232512333,
1471
- "loss": 0.4445,
1472
- "step": 98
1473
- },
1474
- {
1475
- "epoch": 5.157894736842105,
1476
- "eval_loss": 0.43881431221961975,
1477
- "eval_runtime": 3.4021,
1478
- "eval_samples_per_second": 8.818,
1479
- "eval_steps_per_second": 1.176,
1480
- "step": 98
1481
- },
1482
- {
1483
- "epoch": 5.2105263157894735,
1484
- "grad_norm": 0.7164113521575928,
1485
- "learning_rate": 0.0003612585696246158,
1486
- "loss": 0.4292,
1487
- "step": 99
1488
- },
1489
- {
1490
- "epoch": 5.2105263157894735,
1491
- "eval_loss": 0.43201857805252075,
1492
- "eval_runtime": 3.404,
1493
- "eval_samples_per_second": 8.813,
1494
- "eval_steps_per_second": 1.175,
1495
- "step": 99
1496
- },
1497
- {
1498
- "epoch": 5.2631578947368425,
1499
- "grad_norm": 0.7618677020072937,
1500
- "learning_rate": 0.0003582558035069091,
1501
- "loss": 0.4598,
1502
- "step": 100
1503
- },
1504
- {
1505
- "epoch": 5.2631578947368425,
1506
- "eval_loss": 0.434807151556015,
1507
- "eval_runtime": 3.3997,
1508
- "eval_samples_per_second": 8.824,
1509
- "eval_steps_per_second": 1.177,
1510
- "step": 100
1511
  }
1512
  ],
1513
  "logging_steps": 1,
1514
- "max_steps": 250,
1515
  "num_input_tokens_seen": 0,
1516
- "num_train_epochs": 14,
1517
- "save_steps": 10,
1518
  "stateful_callbacks": {
1519
  "TrainerControl": {
1520
  "args": {
@@ -1527,7 +852,7 @@
1527
  "attributes": {}
1528
  }
1529
  },
1530
- "total_flos": 4061879153080320.0,
1531
  "train_batch_size": 1,
1532
  "trial_name": null,
1533
  "trial_params": null
 
1
  {
2
+ "best_global_step": 55,
3
+ "best_metric": 0.7241045236587524,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-55",
5
+ "epoch": 2.8947368421052633,
6
  "eval_steps": 1,
7
+ "global_step": 55,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05263157894736842,
14
+ "grad_norm": 0.7188231945037842,
15
  "learning_rate": 0.0,
16
  "loss": 3.2235,
17
  "step": 1
 
19
  {
20
  "epoch": 0.05263157894736842,
21
  "eval_loss": 3.15524959564209,
22
+ "eval_runtime": 3.3312,
23
+ "eval_samples_per_second": 9.006,
24
+ "eval_steps_per_second": 1.201,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.10526315789473684,
29
+ "grad_norm": 0.766629159450531,
30
  "learning_rate": 3.3333333333333335e-05,
31
  "loss": 3.165,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.10526315789473684,
36
+ "eval_loss": 3.1018595695495605,
37
+ "eval_runtime": 3.28,
38
+ "eval_samples_per_second": 9.146,
39
+ "eval_steps_per_second": 1.219,
40
  "step": 2
41
  },
42
  {
43
  "epoch": 0.15789473684210525,
44
+ "grad_norm": 0.6206756234169006,
45
  "learning_rate": 6.666666666666667e-05,
46
+ "loss": 2.8628,
47
  "step": 3
48
  },
49
  {
50
  "epoch": 0.15789473684210525,
51
+ "eval_loss": 2.97302508354187,
52
+ "eval_runtime": 3.2899,
53
+ "eval_samples_per_second": 9.119,
54
+ "eval_steps_per_second": 1.216,
55
  "step": 3
56
  },
57
  {
58
  "epoch": 0.21052631578947367,
59
+ "grad_norm": 0.6644885540008545,
60
  "learning_rate": 0.0001,
61
+ "loss": 2.9711,
62
  "step": 4
63
  },
64
  {
65
  "epoch": 0.21052631578947367,
66
+ "eval_loss": 2.762944221496582,
67
+ "eval_runtime": 3.2987,
68
+ "eval_samples_per_second": 9.095,
69
+ "eval_steps_per_second": 1.213,
70
  "step": 4
71
  },
72
  {
73
  "epoch": 0.2631578947368421,
74
+ "grad_norm": 0.6135285496711731,
75
  "learning_rate": 0.00013333333333333334,
76
+ "loss": 2.7061,
77
  "step": 5
78
  },
79
  {
80
  "epoch": 0.2631578947368421,
81
+ "eval_loss": 2.5087203979492188,
82
+ "eval_runtime": 3.3091,
83
+ "eval_samples_per_second": 9.066,
84
+ "eval_steps_per_second": 1.209,
85
  "step": 5
86
  },
87
  {
88
  "epoch": 0.3157894736842105,
89
+ "grad_norm": 0.5422775745391846,
90
  "learning_rate": 0.00016666666666666666,
91
+ "loss": 2.4032,
92
  "step": 6
93
  },
94
  {
95
  "epoch": 0.3157894736842105,
96
+ "eval_loss": 2.270092725753784,
97
+ "eval_runtime": 3.3142,
98
+ "eval_samples_per_second": 9.052,
99
+ "eval_steps_per_second": 1.207,
100
  "step": 6
101
  },
102
  {
103
  "epoch": 0.3684210526315789,
104
+ "grad_norm": 0.5579596161842346,
105
  "learning_rate": 0.0002,
106
+ "loss": 2.272,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 0.3684210526315789,
111
+ "eval_loss": 2.0614399909973145,
112
+ "eval_runtime": 3.3233,
113
+ "eval_samples_per_second": 9.027,
114
+ "eval_steps_per_second": 1.204,
115
  "step": 7
116
  },
117
  {
118
  "epoch": 0.42105263157894735,
119
+ "grad_norm": 0.7365043759346008,
120
  "learning_rate": 0.00023333333333333333,
121
+ "loss": 2.0297,
122
  "step": 8
123
  },
124
  {
125
  "epoch": 0.42105263157894735,
126
+ "eval_loss": 1.8437634706497192,
127
+ "eval_runtime": 3.3264,
128
+ "eval_samples_per_second": 9.019,
129
+ "eval_steps_per_second": 1.202,
130
  "step": 8
131
  },
132
  {
133
  "epoch": 0.47368421052631576,
134
+ "grad_norm": 0.7677823901176453,
135
  "learning_rate": 0.0002666666666666667,
136
+ "loss": 1.8911,
137
  "step": 9
138
  },
139
  {
140
  "epoch": 0.47368421052631576,
141
+ "eval_loss": 1.615093469619751,
142
+ "eval_runtime": 3.3357,
143
+ "eval_samples_per_second": 8.994,
144
+ "eval_steps_per_second": 1.199,
145
  "step": 9
146
  },
147
  {
148
  "epoch": 0.5263157894736842,
149
+ "grad_norm": 0.7033586502075195,
150
  "learning_rate": 0.0003,
151
+ "loss": 1.654,
152
  "step": 10
153
  },
154
  {
155
  "epoch": 0.5263157894736842,
156
+ "eval_loss": 1.4461504220962524,
157
+ "eval_runtime": 3.3549,
158
+ "eval_samples_per_second": 8.942,
159
+ "eval_steps_per_second": 1.192,
160
  "step": 10
161
  },
162
  {
163
  "epoch": 0.5789473684210527,
164
+ "grad_norm": 0.721517026424408,
165
  "learning_rate": 0.0003333333333333333,
166
+ "loss": 1.5364,
167
  "step": 11
168
  },
169
  {
170
  "epoch": 0.5789473684210527,
171
+ "eval_loss": 1.3645799160003662,
172
+ "eval_runtime": 3.361,
173
+ "eval_samples_per_second": 8.926,
174
+ "eval_steps_per_second": 1.19,
175
  "step": 11
176
  },
177
  {
178
  "epoch": 0.631578947368421,
179
+ "grad_norm": 0.7304323315620422,
180
  "learning_rate": 0.00036666666666666667,
181
+ "loss": 1.3689,
182
  "step": 12
183
  },
184
  {
185
  "epoch": 0.631578947368421,
186
+ "eval_loss": 1.272360920906067,
187
+ "eval_runtime": 3.3759,
188
+ "eval_samples_per_second": 8.887,
189
+ "eval_steps_per_second": 1.185,
190
  "step": 12
191
  },
192
  {
193
  "epoch": 0.6842105263157895,
194
+ "grad_norm": 0.6370911002159119,
195
  "learning_rate": 0.0004,
196
+ "loss": 1.329,
197
  "step": 13
198
  },
199
  {
200
  "epoch": 0.6842105263157895,
201
+ "eval_loss": 1.19339120388031,
202
+ "eval_runtime": 3.3835,
203
+ "eval_samples_per_second": 8.867,
204
+ "eval_steps_per_second": 1.182,
205
  "step": 13
206
  },
207
  {
208
  "epoch": 0.7368421052631579,
209
+ "grad_norm": 0.5493318438529968,
210
  "learning_rate": 0.00043333333333333337,
211
+ "loss": 1.1991,
212
  "step": 14
213
  },
214
  {
215
  "epoch": 0.7368421052631579,
216
+ "eval_loss": 1.154818058013916,
217
+ "eval_runtime": 3.3971,
218
+ "eval_samples_per_second": 8.831,
219
+ "eval_steps_per_second": 1.177,
220
  "step": 14
221
  },
222
  {
223
  "epoch": 0.7894736842105263,
224
+ "grad_norm": 0.4599643051624298,
225
  "learning_rate": 0.00046666666666666666,
226
+ "loss": 1.2358,
227
  "step": 15
228
  },
229
  {
230
  "epoch": 0.7894736842105263,
231
+ "eval_loss": 1.1299824714660645,
232
+ "eval_runtime": 3.4098,
233
+ "eval_samples_per_second": 8.798,
234
+ "eval_steps_per_second": 1.173,
235
  "step": 15
236
  },
237
  {
238
  "epoch": 0.8421052631578947,
239
+ "grad_norm": 0.5700777173042297,
240
  "learning_rate": 0.0005,
241
+ "loss": 1.206,
242
  "step": 16
243
  },
244
  {
245
  "epoch": 0.8421052631578947,
246
+ "eval_loss": 1.1079914569854736,
247
+ "eval_runtime": 3.4063,
248
+ "eval_samples_per_second": 8.807,
249
+ "eval_steps_per_second": 1.174,
250
  "step": 16
251
  },
252
  {
253
  "epoch": 0.8947368421052632,
254
+ "grad_norm": 0.44451233744621277,
255
+ "learning_rate": 0.0004993910125649561,
256
+ "loss": 1.2374,
257
  "step": 17
258
  },
259
  {
260
  "epoch": 0.8947368421052632,
261
+ "eval_loss": 1.076997995376587,
262
+ "eval_runtime": 3.4099,
263
+ "eval_samples_per_second": 8.798,
264
+ "eval_steps_per_second": 1.173,
265
  "step": 17
266
  },
267
  {
268
  "epoch": 0.9473684210526315,
269
+ "grad_norm": 0.382600337266922,
270
+ "learning_rate": 0.0004975670171853926,
271
+ "loss": 1.0959,
272
  "step": 18
273
  },
274
  {
275
  "epoch": 0.9473684210526315,
276
+ "eval_loss": 1.0459389686584473,
277
+ "eval_runtime": 3.4174,
278
+ "eval_samples_per_second": 8.779,
279
+ "eval_steps_per_second": 1.17,
280
  "step": 18
281
  },
282
  {
283
  "epoch": 1.0,
284
+ "grad_norm": 0.3735465109348297,
285
+ "learning_rate": 0.0004945369001834514,
286
+ "loss": 1.1433,
287
  "step": 19
288
  },
289
  {
290
  "epoch": 1.0,
291
+ "eval_loss": 1.0354558229446411,
292
+ "eval_runtime": 3.41,
293
+ "eval_samples_per_second": 8.798,
294
+ "eval_steps_per_second": 1.173,
295
  "step": 19
296
  },
297
  {
298
  "epoch": 1.0526315789473684,
299
+ "grad_norm": 0.36878153681755066,
300
+ "learning_rate": 0.0004903154239845797,
301
+ "loss": 1.0467,
302
  "step": 20
303
  },
304
  {
305
  "epoch": 1.0526315789473684,
306
+ "eval_loss": 1.0118752717971802,
307
+ "eval_runtime": 3.4023,
308
  "eval_samples_per_second": 8.818,
309
  "eval_steps_per_second": 1.176,
310
  "step": 20
311
  },
312
  {
313
  "epoch": 1.1052631578947367,
314
+ "grad_norm": 0.3709339499473572,
315
+ "learning_rate": 0.0004849231551964771,
316
+ "loss": 1.0453,
317
  "step": 21
318
  },
319
  {
320
  "epoch": 1.1052631578947367,
321
+ "eval_loss": 0.9837953448295593,
322
+ "eval_runtime": 3.3826,
323
+ "eval_samples_per_second": 8.869,
324
+ "eval_steps_per_second": 1.183,
325
  "step": 21
326
  },
327
  {
328
  "epoch": 1.1578947368421053,
329
+ "grad_norm": 0.32317909598350525,
330
+ "learning_rate": 0.0004783863644106502,
331
+ "loss": 1.0573,
332
  "step": 22
333
  },
334
  {
335
  "epoch": 1.1578947368421053,
336
+ "eval_loss": 0.9650039076805115,
337
+ "eval_runtime": 3.3888,
338
+ "eval_samples_per_second": 8.853,
339
+ "eval_steps_per_second": 1.18,
340
  "step": 22
341
  },
342
  {
343
  "epoch": 1.2105263157894737,
344
+ "grad_norm": 0.3465510606765747,
345
+ "learning_rate": 0.00047073689821473173,
346
+ "loss": 0.9613,
347
  "step": 23
348
  },
349
  {
350
  "epoch": 1.2105263157894737,
351
+ "eval_loss": 0.9524248838424683,
352
+ "eval_runtime": 3.389,
353
+ "eval_samples_per_second": 8.852,
354
+ "eval_steps_per_second": 1.18,
355
  "step": 23
356
  },
357
  {
358
  "epoch": 1.263157894736842,
359
+ "grad_norm": 0.341265469789505,
360
+ "learning_rate": 0.00046201202403910646,
361
+ "loss": 1.0765,
362
  "step": 24
363
  },
364
  {
365
  "epoch": 1.263157894736842,
366
+ "eval_loss": 0.9478815197944641,
367
+ "eval_runtime": 3.3934,
368
+ "eval_samples_per_second": 8.841,
369
+ "eval_steps_per_second": 1.179,
370
  "step": 24
371
  },
372
  {
373
  "epoch": 1.3157894736842106,
374
+ "grad_norm": 0.32804617285728455,
375
+ "learning_rate": 0.0004522542485937369,
376
+ "loss": 0.9063,
377
  "step": 25
378
  },
379
  {
380
  "epoch": 1.3157894736842106,
381
+ "eval_loss": 0.9379161596298218,
382
+ "eval_runtime": 3.394,
383
+ "eval_samples_per_second": 8.839,
384
+ "eval_steps_per_second": 1.179,
385
  "step": 25
386
  },
387
  {
388
  "epoch": 1.368421052631579,
389
+ "grad_norm": 0.31782791018486023,
390
+ "learning_rate": 0.0004415111107797445,
391
+ "loss": 0.9969,
392
  "step": 26
393
  },
394
  {
395
  "epoch": 1.368421052631579,
396
+ "eval_loss": 0.9347817897796631,
397
+ "eval_runtime": 3.3909,
398
+ "eval_samples_per_second": 8.847,
399
+ "eval_steps_per_second": 1.18,
400
  "step": 26
401
  },
402
  {
403
  "epoch": 1.4210526315789473,
404
+ "grad_norm": 0.3140616714954376,
405
+ "learning_rate": 0.0004298349500846628,
406
+ "loss": 0.9423,
407
  "step": 27
408
  },
409
  {
410
  "epoch": 1.4210526315789473,
411
+ "eval_loss": 0.9298030138015747,
412
+ "eval_runtime": 3.4047,
413
+ "eval_samples_per_second": 8.811,
414
+ "eval_steps_per_second": 1.175,
415
  "step": 27
416
  },
417
  {
418
  "epoch": 1.4736842105263157,
419
+ "grad_norm": 0.3035232126712799,
420
+ "learning_rate": 0.0004172826515897146,
421
+ "loss": 0.8544,
422
  "step": 28
423
  },
424
  {
425
  "epoch": 1.4736842105263157,
426
+ "eval_loss": 0.920465350151062,
427
+ "eval_runtime": 3.4152,
428
+ "eval_samples_per_second": 8.784,
429
+ "eval_steps_per_second": 1.171,
430
  "step": 28
431
  },
432
  {
433
  "epoch": 1.526315789473684,
434
+ "grad_norm": 0.36378970742225647,
435
+ "learning_rate": 0.00040391536883141455,
436
+ "loss": 1.0175,
437
  "step": 29
438
  },
439
  {
440
  "epoch": 1.526315789473684,
441
+ "eval_loss": 0.9069837331771851,
442
+ "eval_runtime": 3.4214,
443
+ "eval_samples_per_second": 8.768,
444
+ "eval_steps_per_second": 1.169,
445
  "step": 29
446
  },
447
  {
448
  "epoch": 1.5789473684210527,
449
+ "grad_norm": 0.3729051947593689,
450
+ "learning_rate": 0.0003897982258676867,
451
+ "loss": 0.9851,
452
  "step": 30
453
  },
454
  {
455
  "epoch": 1.5789473684210527,
456
+ "eval_loss": 0.8988735675811768,
457
+ "eval_runtime": 3.4109,
458
+ "eval_samples_per_second": 8.795,
459
+ "eval_steps_per_second": 1.173,
460
  "step": 30
461
  },
462
  {
463
  "epoch": 1.631578947368421,
464
+ "grad_norm": 0.3581544756889343,
465
+ "learning_rate": 0.000375,
466
+ "loss": 0.9229,
467
  "step": 31
468
  },
469
  {
470
  "epoch": 1.631578947368421,
471
+ "eval_loss": 0.8822915554046631,
472
+ "eval_runtime": 3.3783,
473
+ "eval_samples_per_second": 8.88,
474
+ "eval_steps_per_second": 1.184,
475
  "step": 31
476
  },
477
  {
478
  "epoch": 1.6842105263157894,
479
+ "grad_norm": 0.28150516748428345,
480
+ "learning_rate": 0.00035959278669726934,
481
+ "loss": 0.94,
482
  "step": 32
483
  },
484
  {
485
  "epoch": 1.6842105263157894,
486
+ "eval_loss": 0.8713746666908264,
487
+ "eval_runtime": 3.4041,
488
+ "eval_samples_per_second": 8.813,
489
+ "eval_steps_per_second": 1.175,
490
  "step": 32
491
  },
492
  {
493
  "epoch": 1.736842105263158,
494
+ "grad_norm": 0.30831000208854675,
495
+ "learning_rate": 0.00034365164835397803,
496
+ "loss": 1.0407,
497
  "step": 33
498
  },
499
  {
500
  "epoch": 1.736842105263158,
501
+ "eval_loss": 0.8603693842887878,
502
+ "eval_runtime": 3.417,
503
+ "eval_samples_per_second": 8.78,
504
+ "eval_steps_per_second": 1.171,
505
  "step": 33
506
  },
507
  {
508
  "epoch": 1.7894736842105263,
509
+ "grad_norm": 0.31896907091140747,
510
+ "learning_rate": 0.00032725424859373687,
511
+ "loss": 0.9185,
512
  "step": 34
513
  },
514
  {
515
  "epoch": 1.7894736842105263,
516
+ "eval_loss": 0.849823534488678,
517
+ "eval_runtime": 3.4154,
518
+ "eval_samples_per_second": 8.784,
519
+ "eval_steps_per_second": 1.171,
520
  "step": 34
521
  },
522
  {
523
  "epoch": 1.8421052631578947,
524
+ "grad_norm": 0.29725414514541626,
525
+ "learning_rate": 0.0003104804738999169,
526
+ "loss": 0.978,
527
  "step": 35
528
  },
529
  {
530
  "epoch": 1.8421052631578947,
531
+ "eval_loss": 0.8390634655952454,
532
+ "eval_runtime": 3.4119,
533
+ "eval_samples_per_second": 8.793,
534
+ "eval_steps_per_second": 1.172,
535
  "step": 35
536
  },
537
  {
538
  "epoch": 1.8947368421052633,
539
+ "grad_norm": 0.3137111961841583,
540
+ "learning_rate": 0.00029341204441673266,
541
+ "loss": 0.9221,
542
  "step": 36
543
  },
544
  {
545
  "epoch": 1.8947368421052633,
546
+ "eval_loss": 0.8293085098266602,
547
+ "eval_runtime": 3.3951,
548
+ "eval_samples_per_second": 8.836,
549
+ "eval_steps_per_second": 1.178,
550
  "step": 36
551
  },
552
  {
553
  "epoch": 1.9473684210526314,
554
+ "grad_norm": 0.267716646194458,
555
+ "learning_rate": 0.0002761321158169134,
556
+ "loss": 1.0078,
557
  "step": 37
558
  },
559
  {
560
  "epoch": 1.9473684210526314,
561
+ "eval_loss": 0.8227899670600891,
562
+ "eval_runtime": 3.3926,
563
+ "eval_samples_per_second": 8.843,
564
+ "eval_steps_per_second": 1.179,
565
  "step": 37
566
  },
567
  {
568
  "epoch": 2.0,
569
+ "grad_norm": 0.3097141683101654,
570
+ "learning_rate": 0.0002587248741756253,
571
+ "loss": 1.0386,
572
  "step": 38
573
  },
574
  {
575
  "epoch": 2.0,
576
+ "eval_loss": 0.8196889758110046,
577
+ "eval_runtime": 3.3913,
578
+ "eval_samples_per_second": 8.846,
579
+ "eval_steps_per_second": 1.179,
580
  "step": 38
581
  },
582
  {
583
  "epoch": 2.0526315789473686,
584
+ "grad_norm": 0.29532116651535034,
585
+ "learning_rate": 0.00024127512582437484,
586
+ "loss": 0.9046,
587
  "step": 39
588
  },
589
  {
590
  "epoch": 2.0526315789473686,
591
+ "eval_loss": 0.8109915852546692,
592
+ "eval_runtime": 3.3856,
593
+ "eval_samples_per_second": 8.861,
594
+ "eval_steps_per_second": 1.181,
595
  "step": 39
596
  },
597
  {
598
  "epoch": 2.1052631578947367,
599
+ "grad_norm": 0.3160407245159149,
600
+ "learning_rate": 0.00022386788418308668,
601
+ "loss": 0.8684,
602
  "step": 40
603
  },
604
  {
605
  "epoch": 2.1052631578947367,
606
+ "eval_loss": 0.799045979976654,
607
+ "eval_runtime": 3.3859,
608
+ "eval_samples_per_second": 8.86,
609
+ "eval_steps_per_second": 1.181,
610
  "step": 40
611
  },
612
  {
613
  "epoch": 2.1578947368421053,
614
+ "grad_norm": 0.2594124674797058,
615
+ "learning_rate": 0.00020658795558326743,
616
+ "loss": 0.8051,
617
  "step": 41
618
  },
619
  {
620
  "epoch": 2.1578947368421053,
621
+ "eval_loss": 0.7873298525810242,
622
+ "eval_runtime": 3.3873,
623
+ "eval_samples_per_second": 8.857,
624
+ "eval_steps_per_second": 1.181,
625
  "step": 41
626
  },
627
  {
628
  "epoch": 2.2105263157894735,
629
+ "grad_norm": 0.2573184370994568,
630
+ "learning_rate": 0.0001895195261000831,
631
+ "loss": 0.7542,
632
  "step": 42
633
  },
634
  {
635
  "epoch": 2.2105263157894735,
636
+ "eval_loss": 0.7783879637718201,
637
+ "eval_runtime": 3.3897,
638
+ "eval_samples_per_second": 8.85,
639
+ "eval_steps_per_second": 1.18,
640
  "step": 42
641
  },
642
  {
643
  "epoch": 2.263157894736842,
644
+ "grad_norm": 0.3050247132778168,
645
+ "learning_rate": 0.00017274575140626317,
646
+ "loss": 0.8833,
647
  "step": 43
648
  },
649
  {
650
  "epoch": 2.263157894736842,
651
+ "eval_loss": 0.7714616060256958,
652
+ "eval_runtime": 3.4031,
653
+ "eval_samples_per_second": 8.815,
654
+ "eval_steps_per_second": 1.175,
655
  "step": 43
656
  },
657
  {
658
  "epoch": 2.3157894736842106,
659
+ "grad_norm": 0.27206432819366455,
660
+ "learning_rate": 0.00015634835164602198,
661
+ "loss": 0.8176,
662
  "step": 44
663
  },
664
  {
665
  "epoch": 2.3157894736842106,
666
+ "eval_loss": 0.7637041807174683,
667
+ "eval_runtime": 3.4006,
668
+ "eval_samples_per_second": 8.822,
669
  "eval_steps_per_second": 1.176,
670
  "step": 44
671
  },
672
  {
673
  "epoch": 2.3684210526315788,
674
+ "grad_norm": 0.24384012818336487,
675
+ "learning_rate": 0.00014040721330273062,
676
+ "loss": 0.7616,
677
  "step": 45
678
  },
679
  {
680
  "epoch": 2.3684210526315788,
681
+ "eval_loss": 0.7560217380523682,
682
+ "eval_runtime": 3.4005,
683
+ "eval_samples_per_second": 8.822,
684
+ "eval_steps_per_second": 1.176,
685
  "step": 45
686
  },
687
  {
688
  "epoch": 2.4210526315789473,
689
+ "grad_norm": 0.25645551085472107,
690
+ "learning_rate": 0.00012500000000000006,
691
+ "loss": 0.7888,
692
  "step": 46
693
  },
694
  {
695
  "epoch": 2.4210526315789473,
696
+ "eval_loss": 0.7505295872688293,
697
+ "eval_runtime": 3.3925,
698
+ "eval_samples_per_second": 8.843,
699
+ "eval_steps_per_second": 1.179,
700
  "step": 46
701
  },
702
  {
703
  "epoch": 2.473684210526316,
704
+ "grad_norm": 0.27820125222206116,
705
+ "learning_rate": 0.00011020177413231333,
706
+ "loss": 0.7584,
707
  "step": 47
708
  },
709
  {
710
  "epoch": 2.473684210526316,
711
+ "eval_loss": 0.7445800304412842,
712
+ "eval_runtime": 3.3928,
713
+ "eval_samples_per_second": 8.842,
714
+ "eval_steps_per_second": 1.179,
715
  "step": 47
716
  },
717
  {
718
  "epoch": 2.526315789473684,
719
+ "grad_norm": 0.23925091326236725,
720
+ "learning_rate": 9.608463116858542e-05,
721
+ "loss": 0.7504,
722
  "step": 48
723
  },
724
  {
725
  "epoch": 2.526315789473684,
726
+ "eval_loss": 0.7403488755226135,
727
+ "eval_runtime": 3.4026,
728
+ "eval_samples_per_second": 8.817,
729
+ "eval_steps_per_second": 1.176,
730
  "step": 48
731
  },
732
  {
733
  "epoch": 2.5789473684210527,
734
+ "grad_norm": 0.32143712043762207,
735
+ "learning_rate": 8.271734841028553e-05,
736
+ "loss": 0.8269,
737
  "step": 49
738
  },
739
  {
740
  "epoch": 2.5789473684210527,
741
+ "eval_loss": 0.7371814250946045,
742
+ "eval_runtime": 3.3997,
743
+ "eval_samples_per_second": 8.824,
744
  "eval_steps_per_second": 1.177,
745
  "step": 49
746
  },
747
  {
748
  "epoch": 2.6315789473684212,
749
+ "grad_norm": 0.2628876864910126,
750
+ "learning_rate": 7.016504991533726e-05,
751
+ "loss": 0.7076,
752
  "step": 50
753
  },
754
  {
755
  "epoch": 2.6315789473684212,
756
+ "eval_loss": 0.7335822582244873,
757
+ "eval_runtime": 3.4029,
758
+ "eval_samples_per_second": 8.816,
759
+ "eval_steps_per_second": 1.175,
760
  "step": 50
761
  },
762
  {
763
  "epoch": 2.6842105263157894,
764
+ "grad_norm": 0.30318617820739746,
765
+ "learning_rate": 5.848888922025553e-05,
766
+ "loss": 0.7792,
767
  "step": 51
768
  },
769
  {
770
  "epoch": 2.6842105263157894,
771
+ "eval_loss": 0.7297669053077698,
772
+ "eval_runtime": 3.3726,
773
+ "eval_samples_per_second": 8.895,
774
+ "eval_steps_per_second": 1.186,
775
  "step": 51
776
  },
777
  {
778
  "epoch": 2.736842105263158,
779
+ "grad_norm": 0.3162338435649872,
780
+ "learning_rate": 4.7745751406263163e-05,
781
+ "loss": 0.7217,
782
  "step": 52
783
  },
784
  {
785
  "epoch": 2.736842105263158,
786
+ "eval_loss": 0.728228747844696,
787
+ "eval_runtime": 3.3989,
788
+ "eval_samples_per_second": 8.827,
789
+ "eval_steps_per_second": 1.177,
790
  "step": 52
791
  },
792
  {
793
  "epoch": 2.7894736842105265,
794
+ "grad_norm": 0.2733875513076782,
795
+ "learning_rate": 3.798797596089351e-05,
796
+ "loss": 0.8098,
797
  "step": 53
798
  },
799
  {
800
  "epoch": 2.7894736842105265,
801
+ "eval_loss": 0.7270908355712891,
802
+ "eval_runtime": 3.4122,
803
+ "eval_samples_per_second": 8.792,
804
+ "eval_steps_per_second": 1.172,
805
  "step": 53
806
  },
807
  {
808
  "epoch": 2.8421052631578947,
809
+ "grad_norm": 0.26100900769233704,
810
+ "learning_rate": 2.9263101785268254e-05,
811
+ "loss": 0.7631,
812
  "step": 54
813
  },
814
  {
815
  "epoch": 2.8421052631578947,
816
+ "eval_loss": 0.7254647016525269,
817
+ "eval_runtime": 3.4244,
818
+ "eval_samples_per_second": 8.761,
819
+ "eval_steps_per_second": 1.168,
820
  "step": 54
821
  },
822
  {
823
  "epoch": 2.8947368421052633,
824
+ "grad_norm": 0.2827248275279999,
825
+ "learning_rate": 2.1613635589349755e-05,
826
+ "loss": 0.7716,
827
  "step": 55
828
  },
829
  {
830
  "epoch": 2.8947368421052633,
831
+ "eval_loss": 0.7241045236587524,
832
+ "eval_runtime": 3.4133,
833
+ "eval_samples_per_second": 8.789,
834
+ "eval_steps_per_second": 1.172,
835
  "step": 55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  }
837
  ],
838
  "logging_steps": 1,
839
+ "max_steps": 60,
840
  "num_input_tokens_seen": 0,
841
+ "num_train_epochs": 4,
842
+ "save_steps": 5,
843
  "stateful_callbacks": {
844
  "TrainerControl": {
845
  "args": {
 
852
  "attributes": {}
853
  }
854
  },
855
+ "total_flos": 2315465393725440.0,
856
  "train_batch_size": 1,
857
  "trial_name": null,
858
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a12232fc8be9bbc30f617bdff8aae0dd2eb32982822050660854f8120e8007a
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e7f2bf25b1718a220ed1b92d07f386fbdcd9effbf62c9fe1bb8da4cac6ff2c3
3
  size 6033