robertou2 commited on
Commit
edc703e
·
verified ·
1 Parent(s): 2c9551f

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -23,9 +23,9 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "down_proj",
27
- "o_proj",
28
  "gate_up_proj",
 
 
29
  "qkv_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "gate_up_proj",
27
+ "o_proj",
28
+ "down_proj",
29
  "qkv_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf7e8b8c722ab0df8a5db587e800c65b59585720dfdf7fe5b4209a5be232841a
3
  size 369133600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d1c92e5aea6479fbcd97c9a2c7bcb7b704e5179617a38cd54b8b6d362b9a546
3
  size 369133600
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dff1b67f2df08f6a4e5fb81c633d3da1b6bf25f3b53210855b4dd6a6f44a3ff
3
+ size 738413771
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99e9eb1649a644c8be6e8e889139d6797bf8dd316223ef128559cf7c1e450b6
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:817a83ea1c2988e6dd498bb2cb1922b79fe6cdd5fb3e9dadad8593e0f7a9abed
3
+ size 1465
trainer_state.json CHANGED
@@ -1,904 +1,109 @@
1
  {
2
- "best_global_step": 236,
3
- "best_metric": 8.514503861078992e-05,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-236",
5
- "epoch": 59.0,
6
  "eval_steps": 500,
7
- "global_step": 236,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 9.229236602783203,
15
- "learning_rate": 0.0001,
16
- "loss": 2.6715,
17
  "step": 4
18
  },
19
  {
20
  "epoch": 1.0,
21
- "eval_loss": 1.3550810813903809,
22
- "eval_runtime": 3.3411,
23
- "eval_samples_per_second": 8.979,
24
- "eval_steps_per_second": 1.197,
25
  "step": 4
26
  },
27
  {
28
  "epoch": 2.0,
29
- "grad_norm": 9.349386215209961,
30
- "learning_rate": 0.00023333333333333333,
31
- "loss": 1.3025,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_loss": 0.9609652757644653,
37
- "eval_runtime": 3.3476,
38
- "eval_samples_per_second": 8.962,
39
- "eval_steps_per_second": 1.195,
40
  "step": 8
41
  },
42
  {
43
  "epoch": 3.0,
44
- "grad_norm": 4.202998161315918,
45
- "learning_rate": 0.00036666666666666667,
46
- "loss": 0.9833,
47
  "step": 12
48
  },
49
  {
50
  "epoch": 3.0,
51
- "eval_loss": 0.8532541990280151,
52
- "eval_runtime": 3.3649,
53
- "eval_samples_per_second": 8.916,
54
- "eval_steps_per_second": 1.189,
55
  "step": 12
56
  },
57
  {
58
  "epoch": 4.0,
59
- "grad_norm": 5.911304473876953,
60
- "learning_rate": 0.0005,
61
- "loss": 0.8496,
62
  "step": 16
63
  },
64
  {
65
  "epoch": 4.0,
66
- "eval_loss": 0.7015247941017151,
67
- "eval_runtime": 3.3686,
68
- "eval_samples_per_second": 8.906,
69
- "eval_steps_per_second": 1.187,
70
  "step": 16
71
  },
72
  {
73
  "epoch": 5.0,
74
- "grad_norm": 3.7761387825012207,
75
- "learning_rate": 0.0004996426526821629,
76
- "loss": 0.737,
77
  "step": 20
78
  },
79
  {
80
  "epoch": 5.0,
81
- "eval_loss": 0.5959243178367615,
82
- "eval_runtime": 3.3737,
83
- "eval_samples_per_second": 8.892,
84
- "eval_steps_per_second": 1.186,
85
  "step": 20
86
  },
87
  {
88
  "epoch": 6.0,
89
- "grad_norm": 5.99334716796875,
90
- "learning_rate": 0.0004985716323054959,
91
- "loss": 0.6024,
92
  "step": 24
93
  },
94
  {
95
  "epoch": 6.0,
96
- "eval_loss": 0.5120770931243896,
97
- "eval_runtime": 3.3367,
98
- "eval_samples_per_second": 8.991,
99
- "eval_steps_per_second": 1.199,
100
  "step": 24
101
- },
102
- {
103
- "epoch": 7.0,
104
- "grad_norm": 6.7658467292785645,
105
- "learning_rate": 0.0004967900006800708,
106
- "loss": 0.496,
107
- "step": 28
108
- },
109
- {
110
- "epoch": 7.0,
111
- "eval_loss": 0.2661070227622986,
112
- "eval_runtime": 3.3521,
113
- "eval_samples_per_second": 8.95,
114
- "eval_steps_per_second": 1.193,
115
- "step": 28
116
- },
117
- {
118
- "epoch": 8.0,
119
- "grad_norm": 6.91306209564209,
120
- "learning_rate": 0.0004943028510961491,
121
- "loss": 0.3035,
122
- "step": 32
123
- },
124
- {
125
- "epoch": 8.0,
126
- "eval_loss": 0.20593233406543732,
127
- "eval_runtime": 3.3825,
128
- "eval_samples_per_second": 8.869,
129
- "eval_steps_per_second": 1.183,
130
- "step": 32
131
- },
132
- {
133
- "epoch": 9.0,
134
- "grad_norm": 5.512369155883789,
135
- "learning_rate": 0.0004911172937635942,
136
- "loss": 0.2058,
137
- "step": 36
138
- },
139
- {
140
- "epoch": 9.0,
141
- "eval_loss": 0.13461297750473022,
142
- "eval_runtime": 3.3417,
143
- "eval_samples_per_second": 8.978,
144
- "eval_steps_per_second": 1.197,
145
- "step": 36
146
- },
147
- {
148
- "epoch": 10.0,
149
- "grad_norm": 14.840362548828125,
150
- "learning_rate": 0.0004872424354853545,
151
- "loss": 0.3198,
152
- "step": 40
153
- },
154
- {
155
- "epoch": 10.0,
156
- "eval_loss": 0.26689016819000244,
157
- "eval_runtime": 3.3554,
158
- "eval_samples_per_second": 8.941,
159
- "eval_steps_per_second": 1.192,
160
- "step": 40
161
- },
162
- {
163
- "epoch": 11.0,
164
- "grad_norm": 5.161523818969727,
165
- "learning_rate": 0.00048268935362313215,
166
- "loss": 0.225,
167
- "step": 44
168
- },
169
- {
170
- "epoch": 11.0,
171
- "eval_loss": 0.1698600798845291,
172
- "eval_runtime": 3.3826,
173
- "eval_samples_per_second": 8.869,
174
- "eval_steps_per_second": 1.183,
175
- "step": 44
176
- },
177
- {
178
- "epoch": 12.0,
179
- "grad_norm": 6.356969833374023,
180
- "learning_rate": 0.0004774710644296578,
181
- "loss": 0.1683,
182
- "step": 48
183
- },
184
- {
185
- "epoch": 12.0,
186
- "eval_loss": 0.14320150017738342,
187
- "eval_runtime": 3.3444,
188
- "eval_samples_per_second": 8.97,
189
- "eval_steps_per_second": 1.196,
190
- "step": 48
191
- },
192
- {
193
- "epoch": 13.0,
194
- "grad_norm": 7.662347316741943,
195
- "learning_rate": 0.0004716024858381075,
196
- "loss": 0.1401,
197
- "step": 52
198
- },
199
- {
200
- "epoch": 13.0,
201
- "eval_loss": 0.07819870859384537,
202
- "eval_runtime": 3.3566,
203
- "eval_samples_per_second": 8.938,
204
- "eval_steps_per_second": 1.192,
205
- "step": 52
206
- },
207
- {
208
- "epoch": 14.0,
209
- "grad_norm": 3.4681787490844727,
210
- "learning_rate": 0.00046510039481503486,
211
- "loss": 0.0957,
212
- "step": 56
213
- },
214
- {
215
- "epoch": 14.0,
216
- "eval_loss": 0.05342103913426399,
217
- "eval_runtime": 3.3897,
218
- "eval_samples_per_second": 8.85,
219
- "eval_steps_per_second": 1.18,
220
- "step": 56
221
- },
222
- {
223
- "epoch": 15.0,
224
- "grad_norm": 2.6177587509155273,
225
- "learning_rate": 0.00045798337939873923,
226
- "loss": 0.0853,
227
- "step": 60
228
- },
229
- {
230
- "epoch": 15.0,
231
- "eval_loss": 0.07320532202720642,
232
- "eval_runtime": 3.3471,
233
- "eval_samples_per_second": 8.963,
234
- "eval_steps_per_second": 1.195,
235
- "step": 60
236
- },
237
- {
238
- "epoch": 16.0,
239
- "grad_norm": 10.169129371643066,
240
- "learning_rate": 0.0004502717855601809,
241
- "loss": 0.2155,
242
- "step": 64
243
- },
244
- {
245
- "epoch": 16.0,
246
- "eval_loss": 0.14110827445983887,
247
- "eval_runtime": 3.3497,
248
- "eval_samples_per_second": 8.956,
249
- "eval_steps_per_second": 1.194,
250
- "step": 64
251
- },
252
- {
253
- "epoch": 17.0,
254
- "grad_norm": 4.305017471313477,
255
- "learning_rate": 0.0004419876590383554,
256
- "loss": 0.1458,
257
- "step": 68
258
- },
259
- {
260
- "epoch": 17.0,
261
- "eval_loss": 0.07745428383350372,
262
- "eval_runtime": 3.3838,
263
- "eval_samples_per_second": 8.866,
264
- "eval_steps_per_second": 1.182,
265
- "step": 68
266
- },
267
- {
268
- "epoch": 18.0,
269
- "grad_norm": 6.399552822113037,
270
- "learning_rate": 0.00043315468231640834,
271
- "loss": 0.1029,
272
- "step": 72
273
- },
274
- {
275
- "epoch": 18.0,
276
- "eval_loss": 0.11537892371416092,
277
- "eval_runtime": 3.3464,
278
- "eval_samples_per_second": 8.965,
279
- "eval_steps_per_second": 1.195,
280
- "step": 72
281
- },
282
- {
283
- "epoch": 19.0,
284
- "grad_norm": 5.510335922241211,
285
- "learning_rate": 0.00042379810691866064,
286
- "loss": 0.1198,
287
- "step": 76
288
- },
289
- {
290
- "epoch": 19.0,
291
- "eval_loss": 0.09017840772867203,
292
- "eval_runtime": 3.3477,
293
- "eval_samples_per_second": 8.961,
294
- "eval_steps_per_second": 1.195,
295
- "step": 76
296
- },
297
- {
298
- "epoch": 20.0,
299
- "grad_norm": 3.690556287765503,
300
- "learning_rate": 0.0004139446812220924,
301
- "loss": 0.1062,
302
- "step": 80
303
- },
304
- {
305
- "epoch": 20.0,
306
- "eval_loss": 0.0416039302945137,
307
- "eval_runtime": 3.3871,
308
- "eval_samples_per_second": 8.857,
309
- "eval_steps_per_second": 1.181,
310
- "step": 80
311
- },
312
- {
313
- "epoch": 21.0,
314
- "grad_norm": 3.543250799179077,
315
- "learning_rate": 0.00040362257398865713,
316
- "loss": 0.0896,
317
- "step": 84
318
- },
319
- {
320
- "epoch": 21.0,
321
- "eval_loss": 0.06732075661420822,
322
- "eval_runtime": 3.3474,
323
- "eval_samples_per_second": 8.962,
324
- "eval_steps_per_second": 1.195,
325
- "step": 84
326
- },
327
- {
328
- "epoch": 22.0,
329
- "grad_norm": 3.758932113647461,
330
- "learning_rate": 0.0003928612938370292,
331
- "loss": 0.1018,
332
- "step": 88
333
- },
334
- {
335
- "epoch": 22.0,
336
- "eval_loss": 0.05171294882893562,
337
- "eval_runtime": 3.3522,
338
- "eval_samples_per_second": 8.949,
339
- "eval_steps_per_second": 1.193,
340
- "step": 88
341
- },
342
- {
343
- "epoch": 23.0,
344
- "grad_norm": 51.296634674072266,
345
- "learning_rate": 0.0003816916048839979,
346
- "loss": 0.0869,
347
- "step": 92
348
- },
349
- {
350
- "epoch": 23.0,
351
- "eval_loss": 0.10484348982572556,
352
- "eval_runtime": 3.3734,
353
- "eval_samples_per_second": 8.893,
354
- "eval_steps_per_second": 1.186,
355
- "step": 92
356
- },
357
- {
358
- "epoch": 24.0,
359
- "grad_norm": 3.9963831901550293,
360
- "learning_rate": 0.00037014543879667093,
361
- "loss": 0.112,
362
- "step": 96
363
- },
364
- {
365
- "epoch": 24.0,
366
- "eval_loss": 0.06635650247335434,
367
- "eval_runtime": 3.3487,
368
- "eval_samples_per_second": 8.959,
369
- "eval_steps_per_second": 1.195,
370
- "step": 96
371
- },
372
- {
373
- "epoch": 25.0,
374
- "grad_norm": 4.267527103424072,
375
- "learning_rate": 0.0003582558035069091,
376
- "loss": 0.0833,
377
- "step": 100
378
- },
379
- {
380
- "epoch": 25.0,
381
- "eval_loss": 0.053719986230134964,
382
- "eval_runtime": 3.3464,
383
- "eval_samples_per_second": 8.965,
384
- "eval_steps_per_second": 1.195,
385
- "step": 100
386
- },
387
- {
388
- "epoch": 26.0,
389
- "grad_norm": 1.9985603094100952,
390
- "learning_rate": 0.0003460566888489593,
391
- "loss": 0.0577,
392
- "step": 104
393
- },
394
- {
395
- "epoch": 26.0,
396
- "eval_loss": 0.07173171639442444,
397
- "eval_runtime": 3.3782,
398
- "eval_samples_per_second": 8.88,
399
- "eval_steps_per_second": 1.184,
400
- "step": 104
401
- },
402
- {
403
- "epoch": 27.0,
404
- "grad_norm": 8.18993091583252,
405
- "learning_rate": 0.00033358296939004547,
406
- "loss": 0.1127,
407
- "step": 108
408
- },
409
- {
410
- "epoch": 27.0,
411
- "eval_loss": 0.046506691724061966,
412
- "eval_runtime": 3.3712,
413
- "eval_samples_per_second": 8.899,
414
- "eval_steps_per_second": 1.187,
415
- "step": 108
416
- },
417
- {
418
- "epoch": 28.0,
419
- "grad_norm": 1.7538604736328125,
420
- "learning_rate": 0.00032087030473170445,
421
- "loss": 0.0478,
422
- "step": 112
423
- },
424
- {
425
- "epoch": 28.0,
426
- "eval_loss": 0.03175203874707222,
427
- "eval_runtime": 3.3745,
428
- "eval_samples_per_second": 8.89,
429
- "eval_steps_per_second": 1.185,
430
- "step": 112
431
- },
432
- {
433
- "epoch": 29.0,
434
- "grad_norm": 13.397028923034668,
435
- "learning_rate": 0.0003079550375668821,
436
- "loss": 0.0585,
437
- "step": 116
438
- },
439
- {
440
- "epoch": 29.0,
441
- "eval_loss": 0.026407385244965553,
442
- "eval_runtime": 3.3744,
443
- "eval_samples_per_second": 8.89,
444
- "eval_steps_per_second": 1.185,
445
- "step": 116
446
- },
447
- {
448
- "epoch": 30.0,
449
- "grad_norm": 2.6996145248413086,
450
- "learning_rate": 0.0002948740897842223,
451
- "loss": 0.0468,
452
- "step": 120
453
- },
454
- {
455
- "epoch": 30.0,
456
- "eval_loss": 0.021538730710744858,
457
- "eval_runtime": 3.357,
458
- "eval_samples_per_second": 8.937,
459
- "eval_steps_per_second": 1.192,
460
- "step": 120
461
- },
462
- {
463
- "epoch": 31.0,
464
- "grad_norm": 3.0162580013275146,
465
- "learning_rate": 0.00028166485691656423,
466
- "loss": 0.0406,
467
- "step": 124
468
- },
469
- {
470
- "epoch": 31.0,
471
- "eval_loss": 0.04511945694684982,
472
- "eval_runtime": 3.3551,
473
- "eval_samples_per_second": 8.942,
474
- "eval_steps_per_second": 1.192,
475
- "step": 124
476
- },
477
- {
478
- "epoch": 32.0,
479
- "grad_norm": 1.962350845336914,
480
- "learning_rate": 0.0002683651012353955,
481
- "loss": 0.0505,
482
- "step": 128
483
- },
484
- {
485
- "epoch": 32.0,
486
- "eval_loss": 0.040635574609041214,
487
- "eval_runtime": 3.3893,
488
- "eval_samples_per_second": 8.851,
489
- "eval_steps_per_second": 1.18,
490
- "step": 128
491
- },
492
- {
493
- "epoch": 33.0,
494
- "grad_norm": 2.60496187210083,
495
- "learning_rate": 0.00025501284379688067,
496
- "loss": 0.0458,
497
- "step": 132
498
- },
499
- {
500
- "epoch": 33.0,
501
- "eval_loss": 0.01912449672818184,
502
- "eval_runtime": 3.3576,
503
- "eval_samples_per_second": 8.935,
504
- "eval_steps_per_second": 1.191,
505
- "step": 132
506
- },
507
- {
508
- "epoch": 34.0,
509
- "grad_norm": 165.37008666992188,
510
- "learning_rate": 0.00024164625574808144,
511
- "loss": 0.0322,
512
- "step": 136
513
- },
514
- {
515
- "epoch": 34.0,
516
- "eval_loss": 0.032830674201250076,
517
- "eval_runtime": 3.3488,
518
- "eval_samples_per_second": 8.958,
519
- "eval_steps_per_second": 1.194,
520
- "step": 136
521
- },
522
- {
523
- "epoch": 35.0,
524
- "grad_norm": 2.0985658168792725,
525
- "learning_rate": 0.00022830354920410064,
526
- "loss": 0.0392,
527
- "step": 140
528
- },
529
- {
530
- "epoch": 35.0,
531
- "eval_loss": 0.016142379492521286,
532
- "eval_runtime": 3.3792,
533
- "eval_samples_per_second": 8.878,
534
- "eval_steps_per_second": 1.184,
535
- "step": 140
536
- },
537
- {
538
- "epoch": 36.0,
539
- "grad_norm": 2.632981777191162,
540
- "learning_rate": 0.0002150228680081079,
541
- "loss": 0.0328,
542
- "step": 144
543
- },
544
- {
545
- "epoch": 36.0,
546
- "eval_loss": 0.018485000357031822,
547
- "eval_runtime": 3.3462,
548
- "eval_samples_per_second": 8.965,
549
- "eval_steps_per_second": 1.195,
550
- "step": 144
551
- },
552
- {
553
- "epoch": 37.0,
554
- "grad_norm": 2.000067949295044,
555
- "learning_rate": 0.00020184217868653867,
556
- "loss": 0.0281,
557
- "step": 148
558
- },
559
- {
560
- "epoch": 37.0,
561
- "eval_loss": 0.010733678936958313,
562
- "eval_runtime": 3.3514,
563
- "eval_samples_per_second": 8.952,
564
- "eval_steps_per_second": 1.194,
565
- "step": 148
566
- },
567
- {
568
- "epoch": 38.0,
569
- "grad_norm": 2.6984753608703613,
570
- "learning_rate": 0.00018879916191120349,
571
- "loss": 0.0306,
572
- "step": 152
573
- },
574
- {
575
- "epoch": 38.0,
576
- "eval_loss": 0.014147897250950336,
577
- "eval_runtime": 3.3807,
578
- "eval_samples_per_second": 8.874,
579
- "eval_steps_per_second": 1.183,
580
- "step": 152
581
- },
582
- {
583
- "epoch": 39.0,
584
- "grad_norm": 1.7256083488464355,
585
- "learning_rate": 0.00017593110477859153,
586
- "loss": 0.0229,
587
- "step": 156
588
- },
589
- {
590
- "epoch": 39.0,
591
- "eval_loss": 0.013782525435090065,
592
- "eval_runtime": 3.3475,
593
- "eval_samples_per_second": 8.962,
594
- "eval_steps_per_second": 1.195,
595
- "step": 156
596
- },
597
- {
598
- "epoch": 40.0,
599
- "grad_norm": 1.312639832496643,
600
- "learning_rate": 0.00016327479421431983,
601
- "loss": 0.0233,
602
- "step": 160
603
- },
604
- {
605
- "epoch": 40.0,
606
- "eval_loss": 0.009614935144782066,
607
- "eval_runtime": 3.3523,
608
- "eval_samples_per_second": 8.949,
609
- "eval_steps_per_second": 1.193,
610
- "step": 160
611
- },
612
- {
613
- "epoch": 41.0,
614
- "grad_norm": 1.2714215517044067,
615
- "learning_rate": 0.00015086641180745932,
616
- "loss": 0.014,
617
- "step": 164
618
- },
619
- {
620
- "epoch": 41.0,
621
- "eval_loss": 0.01818581484258175,
622
- "eval_runtime": 3.3843,
623
- "eval_samples_per_second": 8.864,
624
- "eval_steps_per_second": 1.182,
625
- "step": 164
626
- },
627
- {
628
- "epoch": 42.0,
629
- "grad_norm": 2.4470083713531494,
630
- "learning_rate": 0.00013874143037538418,
631
- "loss": 0.0268,
632
- "step": 168
633
- },
634
- {
635
- "epoch": 42.0,
636
- "eval_loss": 0.017209211364388466,
637
- "eval_runtime": 3.3513,
638
- "eval_samples_per_second": 8.952,
639
- "eval_steps_per_second": 1.194,
640
- "step": 168
641
- },
642
- {
643
- "epoch": 43.0,
644
- "grad_norm": 1.7730706930160522,
645
- "learning_rate": 0.00012693451255484312,
646
- "loss": 0.0449,
647
- "step": 172
648
- },
649
- {
650
- "epoch": 43.0,
651
- "eval_loss": 0.03041950613260269,
652
- "eval_runtime": 3.3534,
653
- "eval_samples_per_second": 8.946,
654
- "eval_steps_per_second": 1.193,
655
- "step": 172
656
- },
657
- {
658
- "epoch": 44.0,
659
- "grad_norm": 0.741055965423584,
660
- "learning_rate": 0.00011547941170915685,
661
- "loss": 0.0211,
662
- "step": 176
663
- },
664
- {
665
- "epoch": 44.0,
666
- "eval_loss": 0.00939116906374693,
667
- "eval_runtime": 3.3843,
668
- "eval_samples_per_second": 8.865,
669
- "eval_steps_per_second": 1.182,
670
- "step": 176
671
- },
672
- {
673
- "epoch": 45.0,
674
- "grad_norm": 0.5482388138771057,
675
- "learning_rate": 0.00010440887543482746,
676
- "loss": 0.0127,
677
- "step": 180
678
- },
679
- {
680
- "epoch": 45.0,
681
- "eval_loss": 0.002400527708232403,
682
- "eval_runtime": 3.3584,
683
- "eval_samples_per_second": 8.933,
684
- "eval_steps_per_second": 1.191,
685
- "step": 180
686
- },
687
- {
688
- "epoch": 46.0,
689
- "grad_norm": 0.9107034206390381,
690
- "learning_rate": 9.375455194341214e-05,
691
- "loss": 0.0028,
692
- "step": 184
693
- },
694
- {
695
- "epoch": 46.0,
696
- "eval_loss": 0.0015702341916039586,
697
- "eval_runtime": 3.3439,
698
- "eval_samples_per_second": 8.972,
699
- "eval_steps_per_second": 1.196,
700
- "step": 184
701
- },
702
- {
703
- "epoch": 47.0,
704
- "grad_norm": 0.6881429553031921,
705
- "learning_rate": 8.354689958629513e-05,
706
- "loss": 0.0026,
707
- "step": 188
708
- },
709
- {
710
- "epoch": 47.0,
711
- "eval_loss": 0.0003339569375384599,
712
- "eval_runtime": 3.3814,
713
- "eval_samples_per_second": 8.872,
714
- "eval_steps_per_second": 1.183,
715
- "step": 188
716
- },
717
- {
718
- "epoch": 48.0,
719
- "grad_norm": 0.018251951783895493,
720
- "learning_rate": 7.381509978100626e-05,
721
- "loss": 0.0004,
722
- "step": 192
723
- },
724
- {
725
- "epoch": 48.0,
726
- "eval_loss": 0.0006879170541651547,
727
- "eval_runtime": 3.3536,
728
- "eval_samples_per_second": 8.946,
729
- "eval_steps_per_second": 1.193,
730
- "step": 192
731
- },
732
- {
733
- "epoch": 49.0,
734
- "grad_norm": 0.14602628350257874,
735
- "learning_rate": 6.458697358801061e-05,
736
- "loss": 0.0013,
737
- "step": 196
738
- },
739
- {
740
- "epoch": 49.0,
741
- "eval_loss": 0.0002326490357518196,
742
- "eval_runtime": 3.3572,
743
- "eval_samples_per_second": 8.936,
744
- "eval_steps_per_second": 1.191,
745
- "step": 196
746
- },
747
- {
748
- "epoch": 50.0,
749
- "grad_norm": 0.011048965156078339,
750
- "learning_rate": 5.58889021764582e-05,
751
- "loss": 0.0002,
752
- "step": 200
753
- },
754
- {
755
- "epoch": 50.0,
756
- "eval_loss": 0.00015924364561215043,
757
- "eval_runtime": 3.3745,
758
- "eval_samples_per_second": 8.89,
759
- "eval_steps_per_second": 1.185,
760
- "step": 200
761
- },
762
- {
763
- "epoch": 51.0,
764
- "grad_norm": 0.008616381324827671,
765
- "learning_rate": 4.7745751406263163e-05,
766
- "loss": 0.0002,
767
- "step": 204
768
- },
769
- {
770
- "epoch": 51.0,
771
- "eval_loss": 0.00013590451271738857,
772
- "eval_runtime": 3.3562,
773
- "eval_samples_per_second": 8.939,
774
- "eval_steps_per_second": 1.192,
775
- "step": 204
776
- },
777
- {
778
- "epoch": 52.0,
779
- "grad_norm": 0.007540772669017315,
780
- "learning_rate": 4.0180800742117244e-05,
781
- "loss": 0.0001,
782
- "step": 208
783
- },
784
- {
785
- "epoch": 52.0,
786
- "eval_loss": 0.00011990289203822613,
787
- "eval_runtime": 3.3714,
788
- "eval_samples_per_second": 8.898,
789
- "eval_steps_per_second": 1.186,
790
- "step": 208
791
- },
792
- {
793
- "epoch": 53.0,
794
- "grad_norm": 0.004615222569555044,
795
- "learning_rate": 3.321567670265568e-05,
796
- "loss": 0.0001,
797
- "step": 212
798
- },
799
- {
800
- "epoch": 53.0,
801
- "eval_loss": 0.00010787827341118827,
802
- "eval_runtime": 3.3544,
803
- "eval_samples_per_second": 8.944,
804
- "eval_steps_per_second": 1.192,
805
- "step": 212
806
- },
807
- {
808
- "epoch": 54.0,
809
- "grad_norm": 0.00507324980571866,
810
- "learning_rate": 2.687029103502972e-05,
811
- "loss": 0.0001,
812
- "step": 216
813
- },
814
- {
815
- "epoch": 54.0,
816
- "eval_loss": 9.984564530896023e-05,
817
- "eval_runtime": 3.3469,
818
- "eval_samples_per_second": 8.964,
819
- "eval_steps_per_second": 1.195,
820
- "step": 216
821
- },
822
- {
823
- "epoch": 55.0,
824
- "grad_norm": 0.0049056364223361015,
825
- "learning_rate": 2.1162783791631057e-05,
826
- "loss": 0.0001,
827
- "step": 220
828
- },
829
- {
830
- "epoch": 55.0,
831
- "eval_loss": 9.402891737408936e-05,
832
- "eval_runtime": 3.3601,
833
- "eval_samples_per_second": 8.928,
834
- "eval_steps_per_second": 1.19,
835
- "step": 220
836
- },
837
- {
838
- "epoch": 56.0,
839
- "grad_norm": 0.004331118427217007,
840
- "learning_rate": 1.6109471471699556e-05,
841
- "loss": 0.0001,
842
- "step": 224
843
- },
844
- {
845
- "epoch": 56.0,
846
- "eval_loss": 9.088371007237583e-05,
847
- "eval_runtime": 3.3632,
848
- "eval_samples_per_second": 8.92,
849
- "eval_steps_per_second": 1.189,
850
- "step": 224
851
- },
852
- {
853
- "epoch": 57.0,
854
- "grad_norm": 0.004597917664796114,
855
- "learning_rate": 1.1724800376064798e-05,
856
- "loss": 0.0001,
857
- "step": 228
858
- },
859
- {
860
- "epoch": 57.0,
861
- "eval_loss": 8.777277253102511e-05,
862
- "eval_runtime": 3.3405,
863
- "eval_samples_per_second": 8.981,
864
- "eval_steps_per_second": 1.197,
865
- "step": 228
866
- },
867
- {
868
- "epoch": 58.0,
869
- "grad_norm": 0.0038540030363947153,
870
- "learning_rate": 8.02130530837189e-06,
871
- "loss": 0.0001,
872
- "step": 232
873
- },
874
- {
875
- "epoch": 58.0,
876
- "eval_loss": 8.658332808408886e-05,
877
- "eval_runtime": 3.3595,
878
- "eval_samples_per_second": 8.93,
879
- "eval_steps_per_second": 1.191,
880
- "step": 232
881
- },
882
- {
883
- "epoch": 59.0,
884
- "grad_norm": 0.0038286536000669003,
885
- "learning_rate": 5.009573740853312e-06,
886
- "loss": 0.0001,
887
- "step": 236
888
- },
889
- {
890
- "epoch": 59.0,
891
- "eval_loss": 8.514503861078992e-05,
892
- "eval_runtime": 3.3605,
893
- "eval_samples_per_second": 8.927,
894
- "eval_steps_per_second": 1.19,
895
- "step": 236
896
  }
897
  ],
898
  "logging_steps": 1,
899
- "max_steps": 250,
900
  "num_input_tokens_seen": 0,
901
- "num_train_epochs": 63,
902
  "save_steps": 500,
903
  "stateful_callbacks": {
904
  "TrainerControl": {
@@ -907,12 +112,12 @@
907
  "should_evaluate": false,
908
  "should_log": false,
909
  "should_save": true,
910
- "should_training_stop": false
911
  },
912
  "attributes": {}
913
  }
914
  },
915
- "total_flos": 8799046124943360.0,
916
  "train_batch_size": 1,
917
  "trial_name": null,
918
  "trial_params": null
 
1
  {
2
+ "best_global_step": 24,
3
+ "best_metric": 0.4786834418773651,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-24",
5
+ "epoch": 6.0,
6
  "eval_steps": 500,
7
+ "global_step": 24,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 10.959209442138672,
15
+ "learning_rate": 2e-05,
16
+ "loss": 2.9413,
17
  "step": 4
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "eval_loss": 2.1191623210906982,
22
+ "eval_runtime": 3.3934,
23
+ "eval_samples_per_second": 8.841,
24
+ "eval_steps_per_second": 1.179,
25
  "step": 4
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "grad_norm": 10.235432624816895,
30
+ "learning_rate": 4.666666666666667e-05,
31
+ "loss": 1.7232,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_loss": 1.1511512994766235,
37
+ "eval_runtime": 3.3715,
38
+ "eval_samples_per_second": 8.898,
39
+ "eval_steps_per_second": 1.186,
40
  "step": 8
41
  },
42
  {
43
  "epoch": 3.0,
44
+ "grad_norm": 5.968946933746338,
45
+ "learning_rate": 7.333333333333333e-05,
46
+ "loss": 1.087,
47
  "step": 12
48
  },
49
  {
50
  "epoch": 3.0,
51
+ "eval_loss": 0.8837258219718933,
52
+ "eval_runtime": 3.433,
53
+ "eval_samples_per_second": 8.739,
54
+ "eval_steps_per_second": 1.165,
55
  "step": 12
56
  },
57
  {
58
  "epoch": 4.0,
59
+ "grad_norm": 4.635072231292725,
60
+ "learning_rate": 0.0001,
61
+ "loss": 0.8605,
62
  "step": 16
63
  },
64
  {
65
  "epoch": 4.0,
66
+ "eval_loss": 0.6861255168914795,
67
+ "eval_runtime": 3.5269,
68
+ "eval_samples_per_second": 8.506,
69
+ "eval_steps_per_second": 1.134,
70
  "step": 16
71
  },
72
  {
73
  "epoch": 5.0,
74
+ "grad_norm": 7.834333896636963,
75
+ "learning_rate": 5.868240888334653e-05,
76
+ "loss": 0.6604,
77
  "step": 20
78
  },
79
  {
80
  "epoch": 5.0,
81
+ "eval_loss": 0.555094838142395,
82
+ "eval_runtime": 3.5535,
83
+ "eval_samples_per_second": 8.442,
84
+ "eval_steps_per_second": 1.126,
85
  "step": 20
86
  },
87
  {
88
  "epoch": 6.0,
89
+ "grad_norm": 5.8574137687683105,
90
+ "learning_rate": 3.0153689607045845e-06,
91
+ "loss": 0.5394,
92
  "step": 24
93
  },
94
  {
95
  "epoch": 6.0,
96
+ "eval_loss": 0.4786834418773651,
97
+ "eval_runtime": 3.5424,
98
+ "eval_samples_per_second": 8.469,
99
+ "eval_steps_per_second": 1.129,
100
  "step": 24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  }
102
  ],
103
  "logging_steps": 1,
104
+ "max_steps": 24,
105
  "num_input_tokens_seen": 0,
106
+ "num_train_epochs": 6,
107
  "save_steps": 500,
108
  "stateful_callbacks": {
109
  "TrainerControl": {
 
112
  "should_evaluate": false,
113
  "should_log": false,
114
  "should_save": true,
115
+ "should_training_stop": true
116
  },
117
  "attributes": {}
118
  }
119
  },
120
+ "total_flos": 894818249994240.0,
121
  "train_batch_size": 1,
122
  "trial_name": null,
123
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dcffb64394458c7f3bf585c777bb22841e1c48790a8b32dfe35f3ef285c9393
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78ca5075a4829a841c8ee9add728a7824a036af9fc9b4b2d7dd77743d63820e8
3
  size 6033