| { | |
| "best_metric": 0.9795737122557726, | |
| "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-ibird/checkpoint-2825", | |
| "epoch": 4.995579133510168, | |
| "eval_steps": 500, | |
| "global_step": 2825, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 10.971182823181152, | |
| "learning_rate": 1.76678445229682e-06, | |
| "loss": 3.1809, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 6.421601295471191, | |
| "learning_rate": 3.53356890459364e-06, | |
| "loss": 3.2061, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 9.90295124053955, | |
| "learning_rate": 5.30035335689046e-06, | |
| "loss": 3.1752, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.587502479553223, | |
| "learning_rate": 7.06713780918728e-06, | |
| "loss": 3.1217, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.991631984710693, | |
| "learning_rate": 8.8339222614841e-06, | |
| "loss": 3.0881, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 18.684083938598633, | |
| "learning_rate": 1.060070671378092e-05, | |
| "loss": 3.0575, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 9.924698829650879, | |
| "learning_rate": 1.236749116607774e-05, | |
| "loss": 2.9739, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 8.268143653869629, | |
| "learning_rate": 1.413427561837456e-05, | |
| "loss": 2.8529, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 8.838825225830078, | |
| "learning_rate": 1.5901060070671377e-05, | |
| "loss": 2.6898, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 11.603575706481934, | |
| "learning_rate": 1.76678445229682e-05, | |
| "loss": 2.4395, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 13.830281257629395, | |
| "learning_rate": 1.9434628975265016e-05, | |
| "loss": 1.9708, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 13.014172554016113, | |
| "learning_rate": 2.120141342756184e-05, | |
| "loss": 1.5808, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 17.082054138183594, | |
| "learning_rate": 2.296819787985866e-05, | |
| "loss": 1.2151, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 17.318891525268555, | |
| "learning_rate": 2.473498233215548e-05, | |
| "loss": 0.8489, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 14.354799270629883, | |
| "learning_rate": 2.6501766784452298e-05, | |
| "loss": 0.6891, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 9.554667472839355, | |
| "learning_rate": 2.826855123674912e-05, | |
| "loss": 0.6101, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 8.692755699157715, | |
| "learning_rate": 3.003533568904594e-05, | |
| "loss": 0.4587, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 18.25060272216797, | |
| "learning_rate": 3.1802120141342755e-05, | |
| "loss": 0.436, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 10.215097427368164, | |
| "learning_rate": 3.356890459363958e-05, | |
| "loss": 0.2916, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 10.572564125061035, | |
| "learning_rate": 3.53356890459364e-05, | |
| "loss": 0.2545, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 18.01451301574707, | |
| "learning_rate": 3.710247349823322e-05, | |
| "loss": 0.3369, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 9.001435279846191, | |
| "learning_rate": 3.886925795053003e-05, | |
| "loss": 0.2184, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 11.199219703674316, | |
| "learning_rate": 4.063604240282686e-05, | |
| "loss": 0.2333, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 9.572484016418457, | |
| "learning_rate": 4.240282685512368e-05, | |
| "loss": 0.2238, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 11.255512237548828, | |
| "learning_rate": 4.416961130742049e-05, | |
| "loss": 0.2418, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 7.414256572723389, | |
| "learning_rate": 4.593639575971732e-05, | |
| "loss": 0.2201, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.147739410400391, | |
| "learning_rate": 4.7703180212014135e-05, | |
| "loss": 0.2054, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.774346828460693, | |
| "learning_rate": 4.946996466431096e-05, | |
| "loss": 0.1863, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 7.592128753662109, | |
| "learning_rate": 4.9862313139260423e-05, | |
| "loss": 0.1531, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 14.047577857971191, | |
| "learning_rate": 4.9665617623918175e-05, | |
| "loss": 0.1713, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 13.25577449798584, | |
| "learning_rate": 4.9468922108575926e-05, | |
| "loss": 0.2064, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 15.570125579833984, | |
| "learning_rate": 4.927222659323368e-05, | |
| "loss": 0.1782, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 7.168793201446533, | |
| "learning_rate": 4.907553107789143e-05, | |
| "loss": 0.1792, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 19.736141204833984, | |
| "learning_rate": 4.887883556254917e-05, | |
| "loss": 0.1774, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 11.251014709472656, | |
| "learning_rate": 4.8682140047206924e-05, | |
| "loss": 0.1933, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 14.66421890258789, | |
| "learning_rate": 4.8485444531864675e-05, | |
| "loss": 0.1554, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 7.186115741729736, | |
| "learning_rate": 4.8288749016522426e-05, | |
| "loss": 0.1577, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 11.209123611450195, | |
| "learning_rate": 4.809205350118017e-05, | |
| "loss": 0.1598, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 7.6711602210998535, | |
| "learning_rate": 4.789535798583792e-05, | |
| "loss": 0.1964, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 12.84682846069336, | |
| "learning_rate": 4.769866247049567e-05, | |
| "loss": 0.1597, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 6.741486072540283, | |
| "learning_rate": 4.7501966955153424e-05, | |
| "loss": 0.1085, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 12.747113227844238, | |
| "learning_rate": 4.7305271439811175e-05, | |
| "loss": 0.1448, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 15.717222213745117, | |
| "learning_rate": 4.7108575924468926e-05, | |
| "loss": 0.2451, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 6.885907173156738, | |
| "learning_rate": 4.691188040912668e-05, | |
| "loss": 0.1473, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 9.838655471801758, | |
| "learning_rate": 4.671518489378442e-05, | |
| "loss": 0.1295, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 5.5332932472229, | |
| "learning_rate": 4.651848937844217e-05, | |
| "loss": 0.1199, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 11.243609428405762, | |
| "learning_rate": 4.6321793863099924e-05, | |
| "loss": 0.2097, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.227767825126648, | |
| "learning_rate": 4.6125098347757675e-05, | |
| "loss": 0.142, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 8.73798942565918, | |
| "learning_rate": 4.5928402832415426e-05, | |
| "loss": 0.1749, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 4.825047969818115, | |
| "learning_rate": 4.573170731707318e-05, | |
| "loss": 0.1062, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 14.581511497497559, | |
| "learning_rate": 4.553501180173092e-05, | |
| "loss": 0.1786, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 8.596332550048828, | |
| "learning_rate": 4.533831628638867e-05, | |
| "loss": 0.1797, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 21.009435653686523, | |
| "learning_rate": 4.5141620771046424e-05, | |
| "loss": 0.141, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 11.336109161376953, | |
| "learning_rate": 4.4944925255704175e-05, | |
| "loss": 0.1304, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 8.8182954788208, | |
| "learning_rate": 4.4748229740361926e-05, | |
| "loss": 0.1208, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 14.062355041503906, | |
| "learning_rate": 4.455153422501967e-05, | |
| "loss": 0.166, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.9675843694493783, | |
| "eval_loss": 0.11483483016490936, | |
| "eval_runtime": 72.8258, | |
| "eval_samples_per_second": 30.923, | |
| "eval_steps_per_second": 3.872, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 3.4058330059051514, | |
| "learning_rate": 4.435483870967742e-05, | |
| "loss": 0.0858, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 4.461623191833496, | |
| "learning_rate": 4.415814319433517e-05, | |
| "loss": 0.1057, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 11.684330940246582, | |
| "learning_rate": 4.3961447678992924e-05, | |
| "loss": 0.0425, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 11.538973808288574, | |
| "learning_rate": 4.376475216365067e-05, | |
| "loss": 0.0799, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 20.21871566772461, | |
| "learning_rate": 4.356805664830842e-05, | |
| "loss": 0.0488, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 6.491779327392578, | |
| "learning_rate": 4.337136113296617e-05, | |
| "loss": 0.0461, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 20.6411075592041, | |
| "learning_rate": 4.317466561762392e-05, | |
| "loss": 0.1365, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 6.05819034576416, | |
| "learning_rate": 4.297797010228167e-05, | |
| "loss": 0.0707, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 5.697531223297119, | |
| "learning_rate": 4.278127458693942e-05, | |
| "loss": 0.0684, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 3.515834331512451, | |
| "learning_rate": 4.258457907159717e-05, | |
| "loss": 0.0809, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 7.478769779205322, | |
| "learning_rate": 4.238788355625492e-05, | |
| "loss": 0.0941, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.829246997833252, | |
| "learning_rate": 4.219118804091267e-05, | |
| "loss": 0.0903, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.0911270380020142, | |
| "learning_rate": 4.1994492525570416e-05, | |
| "loss": 0.0291, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.5587719678878784, | |
| "learning_rate": 4.179779701022817e-05, | |
| "loss": 0.0999, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.5585480332374573, | |
| "learning_rate": 4.160110149488592e-05, | |
| "loss": 0.0609, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 17.553932189941406, | |
| "learning_rate": 4.140440597954367e-05, | |
| "loss": 0.1024, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 5.286637306213379, | |
| "learning_rate": 4.1207710464201413e-05, | |
| "loss": 0.0903, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 3.5825843811035156, | |
| "learning_rate": 4.1011014948859165e-05, | |
| "loss": 0.086, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.051394496113061905, | |
| "learning_rate": 4.0814319433516916e-05, | |
| "loss": 0.0483, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 4.485952377319336, | |
| "learning_rate": 4.061762391817467e-05, | |
| "loss": 0.0641, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 0.9799467921257019, | |
| "learning_rate": 4.042092840283242e-05, | |
| "loss": 0.0225, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.5134735107421875, | |
| "learning_rate": 4.022423288749016e-05, | |
| "loss": 0.0875, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.2541639804840088, | |
| "learning_rate": 4.0027537372147914e-05, | |
| "loss": 0.031, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 0.1368063986301422, | |
| "learning_rate": 3.9830841856805665e-05, | |
| "loss": 0.1112, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 3.3988840579986572, | |
| "learning_rate": 3.9634146341463416e-05, | |
| "loss": 0.0806, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 23.861074447631836, | |
| "learning_rate": 3.943745082612117e-05, | |
| "loss": 0.1014, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 7.84724235534668, | |
| "learning_rate": 3.924075531077892e-05, | |
| "loss": 0.0532, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 3.9560492038726807, | |
| "learning_rate": 3.904405979543666e-05, | |
| "loss": 0.0824, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.0379438400268555, | |
| "learning_rate": 3.8847364280094414e-05, | |
| "loss": 0.1027, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.1322356462478638, | |
| "learning_rate": 3.8650668764752165e-05, | |
| "loss": 0.0466, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 14.02712631225586, | |
| "learning_rate": 3.8453973249409916e-05, | |
| "loss": 0.0871, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 21.739513397216797, | |
| "learning_rate": 3.825727773406767e-05, | |
| "loss": 0.0682, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 18.444072723388672, | |
| "learning_rate": 3.806058221872542e-05, | |
| "loss": 0.0942, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 9.762558937072754, | |
| "learning_rate": 3.786388670338317e-05, | |
| "loss": 0.071, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 10.741572380065918, | |
| "learning_rate": 3.7667191188040914e-05, | |
| "loss": 0.055, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.030358910560608, | |
| "learning_rate": 3.7470495672698665e-05, | |
| "loss": 0.0666, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.6305733919143677, | |
| "learning_rate": 3.7273800157356416e-05, | |
| "loss": 0.0523, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.391921043395996, | |
| "learning_rate": 3.707710464201417e-05, | |
| "loss": 0.0785, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 4.645728588104248, | |
| "learning_rate": 3.688040912667191e-05, | |
| "loss": 0.1018, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 13.808793067932129, | |
| "learning_rate": 3.668371361132966e-05, | |
| "loss": 0.0728, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.23640145361423492, | |
| "learning_rate": 3.6487018095987414e-05, | |
| "loss": 0.0652, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 3.5320382118225098, | |
| "learning_rate": 3.6290322580645165e-05, | |
| "loss": 0.0621, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.206480860710144, | |
| "learning_rate": 3.6093627065302916e-05, | |
| "loss": 0.0676, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 18.625965118408203, | |
| "learning_rate": 3.589693154996066e-05, | |
| "loss": 0.0715, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 3.9618349075317383, | |
| "learning_rate": 3.570023603461841e-05, | |
| "loss": 0.0814, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.8978772163391113, | |
| "learning_rate": 3.550354051927616e-05, | |
| "loss": 0.0383, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 11.628218650817871, | |
| "learning_rate": 3.5306845003933914e-05, | |
| "loss": 0.0768, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.9953187704086304, | |
| "learning_rate": 3.511014948859166e-05, | |
| "loss": 0.0649, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 15.428389549255371, | |
| "learning_rate": 3.491345397324941e-05, | |
| "loss": 0.0591, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.0453615188598633, | |
| "learning_rate": 3.471675845790716e-05, | |
| "loss": 0.0979, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 7.687002658843994, | |
| "learning_rate": 3.452006294256491e-05, | |
| "loss": 0.0681, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 2.8325603008270264, | |
| "learning_rate": 3.432336742722266e-05, | |
| "loss": 0.0322, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.8982272148132324, | |
| "learning_rate": 3.412667191188041e-05, | |
| "loss": 0.0468, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.832631826400757, | |
| "learning_rate": 3.392997639653816e-05, | |
| "loss": 0.034, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 6.040098190307617, | |
| "learning_rate": 3.373328088119591e-05, | |
| "loss": 0.0631, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.12215587496757507, | |
| "learning_rate": 3.353658536585366e-05, | |
| "loss": 0.0522, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 8.005010604858398, | |
| "learning_rate": 3.3339889850511406e-05, | |
| "loss": 0.0909, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.9755772646536413, | |
| "eval_loss": 0.0889873057603836, | |
| "eval_runtime": 72.7301, | |
| "eval_samples_per_second": 30.964, | |
| "eval_steps_per_second": 3.877, | |
| "step": 1131 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.5045525431632996, | |
| "learning_rate": 3.314319433516916e-05, | |
| "loss": 0.054, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 3.169762134552002, | |
| "learning_rate": 3.294649881982691e-05, | |
| "loss": 0.0138, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 12.394304275512695, | |
| "learning_rate": 3.274980330448466e-05, | |
| "loss": 0.025, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.7768550515174866, | |
| "learning_rate": 3.255310778914241e-05, | |
| "loss": 0.0136, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 23.059011459350586, | |
| "learning_rate": 3.2356412273800155e-05, | |
| "loss": 0.0487, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.4939569234848022, | |
| "learning_rate": 3.2159716758457906e-05, | |
| "loss": 0.0267, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.5693202018737793, | |
| "learning_rate": 3.196302124311566e-05, | |
| "loss": 0.0339, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.9168961644172668, | |
| "learning_rate": 3.176632572777341e-05, | |
| "loss": 0.0195, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 3.3607680797576904, | |
| "learning_rate": 3.156963021243116e-05, | |
| "loss": 0.0257, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 10.094879150390625, | |
| "learning_rate": 3.137293469708891e-05, | |
| "loss": 0.0143, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 9.125, | |
| "learning_rate": 3.1176239181746655e-05, | |
| "loss": 0.0295, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.2778412699699402, | |
| "learning_rate": 3.0979543666404406e-05, | |
| "loss": 0.0182, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.0662269592285156, | |
| "learning_rate": 3.078284815106216e-05, | |
| "loss": 0.0295, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.0518941730260849, | |
| "learning_rate": 3.058615263571991e-05, | |
| "loss": 0.048, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 3.4041025638580322, | |
| "learning_rate": 3.0389457120377656e-05, | |
| "loss": 0.0601, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 0.9103949666023254, | |
| "learning_rate": 3.0192761605035407e-05, | |
| "loss": 0.0191, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.8118720054626465, | |
| "learning_rate": 2.999606608969316e-05, | |
| "loss": 0.023, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.6234725117683411, | |
| "learning_rate": 2.9799370574350903e-05, | |
| "loss": 0.0248, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 18.03314781188965, | |
| "learning_rate": 2.9602675059008654e-05, | |
| "loss": 0.0694, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.3662589490413666, | |
| "learning_rate": 2.9405979543666405e-05, | |
| "loss": 0.0206, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.030597494915127754, | |
| "learning_rate": 2.9209284028324156e-05, | |
| "loss": 0.0291, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.9393526911735535, | |
| "learning_rate": 2.9012588512981904e-05, | |
| "loss": 0.0243, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.631523847579956, | |
| "learning_rate": 2.8815892997639655e-05, | |
| "loss": 0.0292, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.010743443854153156, | |
| "learning_rate": 2.8619197482297406e-05, | |
| "loss": 0.0453, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 16.26077651977539, | |
| "learning_rate": 2.8422501966955157e-05, | |
| "loss": 0.0221, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 3.7162559032440186, | |
| "learning_rate": 2.822580645161291e-05, | |
| "loss": 0.0278, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.568585991859436, | |
| "learning_rate": 2.8029110936270653e-05, | |
| "loss": 0.0234, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.43441635370254517, | |
| "learning_rate": 2.7832415420928404e-05, | |
| "loss": 0.0187, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 2.6314618587493896, | |
| "learning_rate": 2.7635719905586155e-05, | |
| "loss": 0.0235, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 0.020479127764701843, | |
| "learning_rate": 2.7439024390243906e-05, | |
| "loss": 0.0102, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.40051591396331787, | |
| "learning_rate": 2.724232887490165e-05, | |
| "loss": 0.0036, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 8.731142044067383, | |
| "learning_rate": 2.7045633359559402e-05, | |
| "loss": 0.0412, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 5.808520317077637, | |
| "learning_rate": 2.6848937844217153e-05, | |
| "loss": 0.0333, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.15826575458049774, | |
| "learning_rate": 2.6652242328874904e-05, | |
| "loss": 0.0381, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.659451484680176, | |
| "learning_rate": 2.645554681353265e-05, | |
| "loss": 0.0188, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.06260908395051956, | |
| "learning_rate": 2.62588512981904e-05, | |
| "loss": 0.0141, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 0.15780910849571228, | |
| "learning_rate": 2.606215578284815e-05, | |
| "loss": 0.0146, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 2.506399631500244, | |
| "learning_rate": 2.5865460267505902e-05, | |
| "loss": 0.0138, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.11960842460393906, | |
| "learning_rate": 2.5668764752163653e-05, | |
| "loss": 0.0028, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 22.104814529418945, | |
| "learning_rate": 2.54720692368214e-05, | |
| "loss": 0.0279, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.6916127800941467, | |
| "learning_rate": 2.5275373721479152e-05, | |
| "loss": 0.0152, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 4.406822204589844, | |
| "learning_rate": 2.5078678206136903e-05, | |
| "loss": 0.0064, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.5762219429016113, | |
| "learning_rate": 2.488198269079465e-05, | |
| "loss": 0.0257, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.5382003784179688, | |
| "learning_rate": 2.4685287175452402e-05, | |
| "loss": 0.0049, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.4372711181640625, | |
| "learning_rate": 2.448859166011015e-05, | |
| "loss": 0.0442, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 6.409371852874756, | |
| "learning_rate": 2.42918961447679e-05, | |
| "loss": 0.055, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.15078137814998627, | |
| "learning_rate": 2.4095200629425652e-05, | |
| "loss": 0.0228, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 4.177145481109619, | |
| "learning_rate": 2.38985051140834e-05, | |
| "loss": 0.0193, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.13254141807556152, | |
| "learning_rate": 2.370180959874115e-05, | |
| "loss": 0.0138, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.5547938942909241, | |
| "learning_rate": 2.35051140833989e-05, | |
| "loss": 0.003, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.550114393234253, | |
| "learning_rate": 2.330841856805665e-05, | |
| "loss": 0.0156, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.4502694606781006, | |
| "learning_rate": 2.3111723052714398e-05, | |
| "loss": 0.0284, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 0.34310609102249146, | |
| "learning_rate": 2.291502753737215e-05, | |
| "loss": 0.0318, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 0.06068241968750954, | |
| "learning_rate": 2.2718332022029897e-05, | |
| "loss": 0.0171, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.1032580137252808, | |
| "learning_rate": 2.2521636506687648e-05, | |
| "loss": 0.0062, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 16.99115562438965, | |
| "learning_rate": 2.2324940991345396e-05, | |
| "loss": 0.018, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.9755772646536413, | |
| "eval_loss": 0.0940345823764801, | |
| "eval_runtime": 73.2079, | |
| "eval_samples_per_second": 30.762, | |
| "eval_steps_per_second": 3.852, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 18.221080780029297, | |
| "learning_rate": 2.2128245476003147e-05, | |
| "loss": 0.018, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 1.597777009010315, | |
| "learning_rate": 2.1931549960660898e-05, | |
| "loss": 0.0095, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.460238516330719, | |
| "learning_rate": 2.1734854445318646e-05, | |
| "loss": 0.0135, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 0.1700868457555771, | |
| "learning_rate": 2.1538158929976397e-05, | |
| "loss": 0.0149, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 4.091975212097168, | |
| "learning_rate": 2.134146341463415e-05, | |
| "loss": 0.0163, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 0.004883296322077513, | |
| "learning_rate": 2.11447678992919e-05, | |
| "loss": 0.0022, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 0.9401123523712158, | |
| "learning_rate": 2.0948072383949647e-05, | |
| "loss": 0.0012, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 4.135828018188477, | |
| "learning_rate": 2.07513768686074e-05, | |
| "loss": 0.0201, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 0.030084745958447456, | |
| "learning_rate": 2.0554681353265146e-05, | |
| "loss": 0.0013, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 15.494827270507812, | |
| "learning_rate": 2.0357985837922897e-05, | |
| "loss": 0.0067, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 1.0570249557495117, | |
| "learning_rate": 2.0161290322580645e-05, | |
| "loss": 0.015, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.5005541443824768, | |
| "learning_rate": 1.9964594807238396e-05, | |
| "loss": 0.0025, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 3.019387722015381, | |
| "learning_rate": 1.9767899291896147e-05, | |
| "loss": 0.0112, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.19258318841457367, | |
| "learning_rate": 1.9571203776553895e-05, | |
| "loss": 0.0021, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 2.9886810779571533, | |
| "learning_rate": 1.9374508261211646e-05, | |
| "loss": 0.002, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 0.012368579395115376, | |
| "learning_rate": 1.9177812745869394e-05, | |
| "loss": 0.0012, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 0.023176291957497597, | |
| "learning_rate": 1.8981117230527145e-05, | |
| "loss": 0.0456, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 0.06562870740890503, | |
| "learning_rate": 1.8784421715184893e-05, | |
| "loss": 0.0069, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 17.193941116333008, | |
| "learning_rate": 1.8587726199842644e-05, | |
| "loss": 0.0133, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 0.08079444617033005, | |
| "learning_rate": 1.8391030684500392e-05, | |
| "loss": 0.0162, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 1.4867687225341797, | |
| "learning_rate": 1.8194335169158143e-05, | |
| "loss": 0.0028, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.10314597189426422, | |
| "learning_rate": 1.799763965381589e-05, | |
| "loss": 0.0036, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.03035634197294712, | |
| "learning_rate": 1.7800944138473642e-05, | |
| "loss": 0.0018, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 0.03287611901760101, | |
| "learning_rate": 1.7604248623131393e-05, | |
| "loss": 0.011, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 0.009028772823512554, | |
| "learning_rate": 1.7407553107789144e-05, | |
| "loss": 0.0055, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 0.11215109378099442, | |
| "learning_rate": 1.7210857592446896e-05, | |
| "loss": 0.0033, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 0.18691633641719818, | |
| "learning_rate": 1.7014162077104643e-05, | |
| "loss": 0.0157, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 10.645195007324219, | |
| "learning_rate": 1.6817466561762395e-05, | |
| "loss": 0.0056, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.3934668004512787, | |
| "learning_rate": 1.6620771046420142e-05, | |
| "loss": 0.0023, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.01680024527013302, | |
| "learning_rate": 1.6424075531077893e-05, | |
| "loss": 0.0058, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 0.6062192916870117, | |
| "learning_rate": 1.622738001573564e-05, | |
| "loss": 0.004, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 0.7066389918327332, | |
| "learning_rate": 1.6030684500393392e-05, | |
| "loss": 0.0094, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 0.20641224086284637, | |
| "learning_rate": 1.583398898505114e-05, | |
| "loss": 0.005, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 0.005134557373821735, | |
| "learning_rate": 1.563729346970889e-05, | |
| "loss": 0.0196, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 0.09725712984800339, | |
| "learning_rate": 1.5440597954366642e-05, | |
| "loss": 0.0093, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 0.01703699305653572, | |
| "learning_rate": 1.524390243902439e-05, | |
| "loss": 0.0173, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 8.595245361328125, | |
| "learning_rate": 1.5047206923682141e-05, | |
| "loss": 0.0067, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 1.1955232620239258, | |
| "learning_rate": 1.485051140833989e-05, | |
| "loss": 0.0057, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.026586467400193214, | |
| "learning_rate": 1.465381589299764e-05, | |
| "loss": 0.0033, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 0.07073145359754562, | |
| "learning_rate": 1.445712037765539e-05, | |
| "loss": 0.0098, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 13.662938117980957, | |
| "learning_rate": 1.4260424862313141e-05, | |
| "loss": 0.0099, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 7.773074626922607, | |
| "learning_rate": 1.4063729346970889e-05, | |
| "loss": 0.0147, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.08416301012039185, | |
| "learning_rate": 1.386703383162864e-05, | |
| "loss": 0.0015, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 0.09213215857744217, | |
| "learning_rate": 1.3670338316286388e-05, | |
| "loss": 0.0011, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 0.08238010108470917, | |
| "learning_rate": 1.3473642800944139e-05, | |
| "loss": 0.0067, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.2930988073348999, | |
| "learning_rate": 1.327694728560189e-05, | |
| "loss": 0.0027, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 0.6446830630302429, | |
| "learning_rate": 1.3080251770259638e-05, | |
| "loss": 0.0021, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.04108636826276779, | |
| "learning_rate": 1.2883556254917389e-05, | |
| "loss": 0.0116, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 0.3955759108066559, | |
| "learning_rate": 1.2686860739575138e-05, | |
| "loss": 0.0132, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 0.10534235090017319, | |
| "learning_rate": 1.249016522423289e-05, | |
| "loss": 0.0014, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 0.9984220266342163, | |
| "learning_rate": 1.2293469708890639e-05, | |
| "loss": 0.016, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 0.05933081731200218, | |
| "learning_rate": 1.2096774193548388e-05, | |
| "loss": 0.0024, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 0.3307284414768219, | |
| "learning_rate": 1.1900078678206138e-05, | |
| "loss": 0.0118, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 0.01056114211678505, | |
| "learning_rate": 1.1703383162863887e-05, | |
| "loss": 0.0054, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.0951002761721611, | |
| "learning_rate": 1.1506687647521637e-05, | |
| "loss": 0.0059, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 0.008732125163078308, | |
| "learning_rate": 1.1309992132179386e-05, | |
| "loss": 0.002, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 2.568053722381592, | |
| "learning_rate": 1.1113296616837136e-05, | |
| "loss": 0.0246, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9791296625222025, | |
| "eval_loss": 0.08105943351984024, | |
| "eval_runtime": 71.9238, | |
| "eval_samples_per_second": 31.311, | |
| "eval_steps_per_second": 3.921, | |
| "step": 2262 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 0.3494608700275421, | |
| "learning_rate": 1.0916601101494885e-05, | |
| "loss": 0.0029, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 0.4306102991104126, | |
| "learning_rate": 1.0719905586152636e-05, | |
| "loss": 0.0037, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 2.145603656768799, | |
| "learning_rate": 1.0523210070810386e-05, | |
| "loss": 0.0063, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 0.0036911088973283768, | |
| "learning_rate": 1.0326514555468137e-05, | |
| "loss": 0.0022, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 0.032683487981557846, | |
| "learning_rate": 1.0129819040125886e-05, | |
| "loss": 0.0062, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 0.007357526570558548, | |
| "learning_rate": 9.933123524783636e-06, | |
| "loss": 0.0059, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.052719421684741974, | |
| "learning_rate": 9.736428009441385e-06, | |
| "loss": 0.0007, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 0.0456971600651741, | |
| "learning_rate": 9.539732494099135e-06, | |
| "loss": 0.0083, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.029169630259275436, | |
| "learning_rate": 9.343036978756884e-06, | |
| "loss": 0.0075, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 0.05413562431931496, | |
| "learning_rate": 9.146341463414634e-06, | |
| "loss": 0.0025, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 8.87646198272705, | |
| "learning_rate": 8.949645948072383e-06, | |
| "loss": 0.0104, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 1.6246185302734375, | |
| "learning_rate": 8.752950432730134e-06, | |
| "loss": 0.001, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 0.21518048644065857, | |
| "learning_rate": 8.556254917387884e-06, | |
| "loss": 0.0073, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 0.6136273741722107, | |
| "learning_rate": 8.359559402045635e-06, | |
| "loss": 0.008, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 0.013673730194568634, | |
| "learning_rate": 8.162863886703385e-06, | |
| "loss": 0.0003, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.013138936832547188, | |
| "learning_rate": 7.966168371361134e-06, | |
| "loss": 0.0016, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 0.5832827687263489, | |
| "learning_rate": 7.769472856018883e-06, | |
| "loss": 0.0016, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.013815794140100479, | |
| "learning_rate": 7.572777340676633e-06, | |
| "loss": 0.0093, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 2.6205365657806396, | |
| "learning_rate": 7.376081825334382e-06, | |
| "loss": 0.0247, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 0.008757353760302067, | |
| "learning_rate": 7.179386309992133e-06, | |
| "loss": 0.0004, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 0.5439887642860413, | |
| "learning_rate": 6.982690794649882e-06, | |
| "loss": 0.0011, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 3.330894708633423, | |
| "learning_rate": 6.785995279307632e-06, | |
| "loss": 0.0027, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.24116189777851105, | |
| "learning_rate": 6.589299763965381e-06, | |
| "loss": 0.001, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 0.19894400238990784, | |
| "learning_rate": 6.392604248623131e-06, | |
| "loss": 0.0096, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 0.08030141144990921, | |
| "learning_rate": 6.195908733280882e-06, | |
| "loss": 0.0003, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 0.06319218873977661, | |
| "learning_rate": 5.999213217938631e-06, | |
| "loss": 0.0075, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 0.0329049713909626, | |
| "learning_rate": 5.802517702596381e-06, | |
| "loss": 0.0123, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 0.07301346212625504, | |
| "learning_rate": 5.605822187254131e-06, | |
| "loss": 0.0037, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 2.7243146896362305, | |
| "learning_rate": 5.40912667191188e-06, | |
| "loss": 0.009, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 0.002613728167489171, | |
| "learning_rate": 5.212431156569631e-06, | |
| "loss": 0.0211, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 0.08672753721475601, | |
| "learning_rate": 5.01573564122738e-06, | |
| "loss": 0.0003, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 0.7189329862594604, | |
| "learning_rate": 4.81904012588513e-06, | |
| "loss": 0.0017, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 0.021875105798244476, | |
| "learning_rate": 4.62234461054288e-06, | |
| "loss": 0.0036, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.06674761325120926, | |
| "learning_rate": 4.425649095200629e-06, | |
| "loss": 0.0016, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 1.5341488122940063, | |
| "learning_rate": 4.22895357985838e-06, | |
| "loss": 0.0123, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 0.6225674152374268, | |
| "learning_rate": 4.032258064516129e-06, | |
| "loss": 0.0071, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 0.04396051913499832, | |
| "learning_rate": 3.835562549173879e-06, | |
| "loss": 0.0061, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 0.024606185033917427, | |
| "learning_rate": 3.638867033831629e-06, | |
| "loss": 0.0034, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 0.7400041818618774, | |
| "learning_rate": 3.442171518489379e-06, | |
| "loss": 0.0022, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 0.105620838701725, | |
| "learning_rate": 3.2454760031471283e-06, | |
| "loss": 0.0203, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.011251443065702915, | |
| "learning_rate": 3.0487804878048782e-06, | |
| "loss": 0.0047, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 0.03630177304148674, | |
| "learning_rate": 2.852084972462628e-06, | |
| "loss": 0.004, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.03339725360274315, | |
| "learning_rate": 2.655389457120378e-06, | |
| "loss": 0.0011, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 4.932060718536377, | |
| "learning_rate": 2.4586939417781275e-06, | |
| "loss": 0.0032, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 0.20462249219417572, | |
| "learning_rate": 2.2619984264358773e-06, | |
| "loss": 0.0008, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 0.01447032019495964, | |
| "learning_rate": 2.0653029110936272e-06, | |
| "loss": 0.003, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 0.012796329334378242, | |
| "learning_rate": 1.868607395751377e-06, | |
| "loss": 0.0015, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 2.1272308826446533, | |
| "learning_rate": 1.6719118804091268e-06, | |
| "loss": 0.002, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 0.049883559346199036, | |
| "learning_rate": 1.4752163650668765e-06, | |
| "loss": 0.0008, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 1.860205054283142, | |
| "learning_rate": 1.2785208497246264e-06, | |
| "loss": 0.0039, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 0.012104855850338936, | |
| "learning_rate": 1.0818253343823763e-06, | |
| "loss": 0.0083, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.03279277682304382, | |
| "learning_rate": 8.85129819040126e-07, | |
| "loss": 0.0006, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 0.6932834386825562, | |
| "learning_rate": 6.884343036978757e-07, | |
| "loss": 0.0035, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 0.037294335663318634, | |
| "learning_rate": 4.917387883556255e-07, | |
| "loss": 0.0097, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 0.10355295240879059, | |
| "learning_rate": 2.9504327301337533e-07, | |
| "loss": 0.0006, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 3.9190878868103027, | |
| "learning_rate": 9.834775767112511e-08, | |
| "loss": 0.0175, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9795737122557726, | |
| "eval_loss": 0.08262032270431519, | |
| "eval_runtime": 72.9002, | |
| "eval_samples_per_second": 30.892, | |
| "eval_steps_per_second": 3.868, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 2825, | |
| "total_flos": 2.2478285568521503e+18, | |
| "train_loss": 0.1798970705407581, | |
| "train_runtime": 3694.2951, | |
| "train_samples_per_second": 24.484, | |
| "train_steps_per_second": 0.765 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2825, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "total_flos": 2.2478285568521503e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |