aleegis commited on
Commit
8964f7f
·
verified ·
1 Parent(s): 6214ab7

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bb2d34536f53765db925aec503ec77f3782aa3bec1228a0fb31ef711894567b
3
  size 101752088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d6aca27259c04e48ff1e6833110c9ce7f7cb359ca11513b3eb5c3401694577
3
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aba086c49bb1d64970d247f24d6d61f09233122825c51a37951b7ab52468080b
3
  size 203719079
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:110f815b2e61a6607ab28d0131cd25dcd3134a1d6d7e0a35eabcde2387a38bb9
3
  size 203719079
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99ad50444b8487f944187571f0d34b8c0833c8b1ec0194bbf9de205c3834a3ba
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc83dc2d6c811943f930285a433310949280eb049ff76a77b592b75863af96c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7022ab037152757f4e275ed980213f69e3a154b7ed94c343e397c8af670740a0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13170990a11005b433d8dae9c4d2d14d2d8b2818aeb5b8e3b1626f654dee20a1
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.04752098843655948,
6
  "eval_steps": 500,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -218,6 +218,216 @@
218
  "learning_rate": 9.966666666666667e-05,
219
  "loss": 1.3409,
220
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
222
  ],
223
  "logging_steps": 10,
@@ -237,7 +447,7 @@
237
  "attributes": {}
238
  }
239
  },
240
- "total_flos": 9.8780760244224e+16,
241
  "train_batch_size": 16,
242
  "trial_name": null,
243
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.09504197687311897,
6
  "eval_steps": 500,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
218
  "learning_rate": 9.966666666666667e-05,
219
  "loss": 1.3409,
220
  "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.049105021384444794,
224
+ "grad_norm": 0.3628294765949249,
225
+ "learning_rate": 9.999725846827562e-05,
226
+ "loss": 1.3767,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.05068905433233011,
231
+ "grad_norm": 0.4101487398147583,
232
+ "learning_rate": 9.998778195446311e-05,
233
+ "loss": 1.4544,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.05227308728021543,
238
+ "grad_norm": 0.40901538729667664,
239
+ "learning_rate": 9.997153789515461e-05,
240
+ "loss": 1.406,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.05385712022810074,
245
+ "grad_norm": 0.40885481238365173,
246
+ "learning_rate": 9.994852848953574e-05,
247
+ "loss": 1.4384,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.05544115317598606,
252
+ "grad_norm": 0.433713436126709,
253
+ "learning_rate": 9.991875685271168e-05,
254
+ "loss": 1.4379,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.05702518612387138,
259
+ "grad_norm": 0.41924989223480225,
260
+ "learning_rate": 9.988222701528547e-05,
261
+ "loss": 1.3935,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.058609219071756695,
266
+ "grad_norm": 0.48709481954574585,
267
+ "learning_rate": 9.983894392281237e-05,
268
+ "loss": 1.3913,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.060193252019642006,
273
+ "grad_norm": 0.44844549894332886,
274
+ "learning_rate": 9.978891343513023e-05,
275
+ "loss": 1.3975,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.061777284967527324,
280
+ "grad_norm": 0.510023295879364,
281
+ "learning_rate": 9.973214232556622e-05,
282
+ "loss": 1.3778,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.06336131791541263,
287
+ "grad_norm": 0.5296265482902527,
288
+ "learning_rate": 9.966863828001982e-05,
289
+ "loss": 1.4633,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.06494535086329796,
294
+ "grad_norm": 0.5563903450965881,
295
+ "learning_rate": 9.959840989592226e-05,
296
+ "loss": 1.4098,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.06652938381118327,
301
+ "grad_norm": 0.6422920227050781,
302
+ "learning_rate": 9.952146668107254e-05,
303
+ "loss": 1.3916,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.06811341675906858,
308
+ "grad_norm": 0.5075757503509521,
309
+ "learning_rate": 9.94378190523503e-05,
310
+ "loss": 1.4111,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.0696974497069539,
315
+ "grad_norm": 0.6157119274139404,
316
+ "learning_rate": 9.934747833430547e-05,
317
+ "loss": 1.4315,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.07128148265483922,
322
+ "grad_norm": 0.5845485925674438,
323
+ "learning_rate": 9.925045675762514e-05,
324
+ "loss": 1.3969,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.07286551560272454,
329
+ "grad_norm": 0.49031880497932434,
330
+ "learning_rate": 9.914676745747772e-05,
331
+ "loss": 1.3132,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.07444954855060985,
336
+ "grad_norm": 0.643332302570343,
337
+ "learning_rate": 9.903642447173465e-05,
338
+ "loss": 1.4596,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.07603358149849516,
343
+ "grad_norm": 0.604245662689209,
344
+ "learning_rate": 9.891944273906986e-05,
345
+ "loss": 1.4994,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.07761761444638049,
350
+ "grad_norm": 0.4713222086429596,
351
+ "learning_rate": 9.879583809693738e-05,
352
+ "loss": 1.372,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.0792016473942658,
357
+ "grad_norm": 0.47081106901168823,
358
+ "learning_rate": 9.866562727942714e-05,
359
+ "loss": 1.5145,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.08078568034215111,
364
+ "grad_norm": 0.6371116638183594,
365
+ "learning_rate": 9.85288279149995e-05,
366
+ "loss": 1.4835,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.08236971329003644,
371
+ "grad_norm": 0.6258746981620789,
372
+ "learning_rate": 9.838545852409857e-05,
373
+ "loss": 1.4214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.08395374623792175,
378
+ "grad_norm": 0.7464697360992432,
379
+ "learning_rate": 9.823553851664489e-05,
380
+ "loss": 1.4559,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.08553777918580706,
385
+ "grad_norm": 0.5535822510719299,
386
+ "learning_rate": 9.807908818940761e-05,
387
+ "loss": 1.4096,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.08712181213369238,
392
+ "grad_norm": 0.5659494400024414,
393
+ "learning_rate": 9.791612872325667e-05,
394
+ "loss": 1.4298,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.0887058450815777,
399
+ "grad_norm": 0.5127139091491699,
400
+ "learning_rate": 9.77466821802952e-05,
401
+ "loss": 1.339,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.09028987802946302,
406
+ "grad_norm": 0.5496402382850647,
407
+ "learning_rate": 9.75707715008727e-05,
408
+ "loss": 1.4232,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.09187391097734833,
413
+ "grad_norm": 0.6117046475410461,
414
+ "learning_rate": 9.73884205004793e-05,
415
+ "loss": 1.4693,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.09345794392523364,
420
+ "grad_norm": 0.5658081769943237,
421
+ "learning_rate": 9.719965386652141e-05,
422
+ "loss": 1.3002,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.09504197687311897,
427
+ "grad_norm": 0.7319624423980713,
428
+ "learning_rate": 9.700449715497961e-05,
429
+ "loss": 1.5359,
430
+ "step": 600
431
  }
432
  ],
433
  "logging_steps": 10,
 
447
  "attributes": {}
448
  }
449
  },
450
+ "total_flos": 1.97561520488448e+17,
451
  "train_batch_size": 16,
452
  "trial_name": null,
453
  "trial_params": null