rm_merged_v2 / trainer_state.json
bingqin111's picture
Upload folder using huggingface_hub
33a0649 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9338205440519691,
"eval_steps": 100,
"global_step": 6900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013533631073216945,
"grad_norm": 33.467918395996094,
"learning_rate": 4.0595399188092017e-07,
"loss": 0.7404,
"step": 10
},
{
"epoch": 0.002706726214643389,
"grad_norm": 22.66303253173828,
"learning_rate": 8.570139828597205e-07,
"loss": 0.72,
"step": 20
},
{
"epoch": 0.004060089321965083,
"grad_norm": 28.549816131591797,
"learning_rate": 1.3080739738385204e-06,
"loss": 1.0342,
"step": 30
},
{
"epoch": 0.005413452429286778,
"grad_norm": 26.86774253845215,
"learning_rate": 1.7591339648173207e-06,
"loss": 0.851,
"step": 40
},
{
"epoch": 0.006766815536608472,
"grad_norm": 24.830371856689453,
"learning_rate": 2.210193955796121e-06,
"loss": 0.8493,
"step": 50
},
{
"epoch": 0.008120178643930167,
"grad_norm": 31.990707397460938,
"learning_rate": 2.6612539467749215e-06,
"loss": 0.8196,
"step": 60
},
{
"epoch": 0.009473541751251861,
"grad_norm": 19.266714096069336,
"learning_rate": 3.112313937753721e-06,
"loss": 0.8137,
"step": 70
},
{
"epoch": 0.010826904858573556,
"grad_norm": 27.596582412719727,
"learning_rate": 3.563373928732522e-06,
"loss": 0.9199,
"step": 80
},
{
"epoch": 0.012180267965895249,
"grad_norm": 20.1337890625,
"learning_rate": 4.014433919711322e-06,
"loss": 0.6817,
"step": 90
},
{
"epoch": 0.013533631073216944,
"grad_norm": 30.579744338989258,
"learning_rate": 4.465493910690122e-06,
"loss": 0.7792,
"step": 100
},
{
"epoch": 0.013533631073216944,
"eval_accuracy": 0.5047184170471841,
"eval_loss": 0.7254102230072021,
"eval_runtime": 586.4826,
"eval_samples_per_second": 5.601,
"eval_steps_per_second": 5.601,
"step": 100
},
{
"epoch": 0.014886994180538638,
"grad_norm": 32.78731918334961,
"learning_rate": 4.916553901668922e-06,
"loss": 0.8336,
"step": 110
},
{
"epoch": 0.016240357287860333,
"grad_norm": 29.351137161254883,
"learning_rate": 5.367613892647722e-06,
"loss": 0.8204,
"step": 120
},
{
"epoch": 0.017593720395182026,
"grad_norm": 24.733667373657227,
"learning_rate": 5.8186738836265224e-06,
"loss": 0.9134,
"step": 130
},
{
"epoch": 0.018947083502503723,
"grad_norm": 27.078041076660156,
"learning_rate": 6.269733874605323e-06,
"loss": 0.7954,
"step": 140
},
{
"epoch": 0.020300446609825416,
"grad_norm": 32.13090515136719,
"learning_rate": 6.7207938655841226e-06,
"loss": 0.9003,
"step": 150
},
{
"epoch": 0.021653809717147112,
"grad_norm": 24.68378448486328,
"learning_rate": 7.171853856562922e-06,
"loss": 0.7541,
"step": 160
},
{
"epoch": 0.023007172824468805,
"grad_norm": 24.132369995117188,
"learning_rate": 7.6229138475417236e-06,
"loss": 0.8898,
"step": 170
},
{
"epoch": 0.024360535931790498,
"grad_norm": 30.671886444091797,
"learning_rate": 8.073973838520524e-06,
"loss": 0.8849,
"step": 180
},
{
"epoch": 0.025713899039112195,
"grad_norm": 30.794673919677734,
"learning_rate": 8.525033829499324e-06,
"loss": 0.7421,
"step": 190
},
{
"epoch": 0.027067262146433888,
"grad_norm": 35.09535598754883,
"learning_rate": 8.976093820478123e-06,
"loss": 0.7815,
"step": 200
},
{
"epoch": 0.027067262146433888,
"eval_accuracy": 0.6313546423135464,
"eval_loss": 0.638473391532898,
"eval_runtime": 586.0225,
"eval_samples_per_second": 5.606,
"eval_steps_per_second": 5.606,
"step": 200
},
{
"epoch": 0.028420625253755584,
"grad_norm": 13.783787727355957,
"learning_rate": 9.427153811456923e-06,
"loss": 0.7288,
"step": 210
},
{
"epoch": 0.029773988361077277,
"grad_norm": 29.833702087402344,
"learning_rate": 9.878213802435724e-06,
"loss": 0.5495,
"step": 220
},
{
"epoch": 0.03112735146839897,
"grad_norm": 14.958450317382812,
"learning_rate": 1.0329273793414524e-05,
"loss": 0.69,
"step": 230
},
{
"epoch": 0.032480714575720666,
"grad_norm": 28.827964782714844,
"learning_rate": 1.0780333784393325e-05,
"loss": 0.6468,
"step": 240
},
{
"epoch": 0.03383407768304236,
"grad_norm": 29.778186798095703,
"learning_rate": 1.1231393775372125e-05,
"loss": 0.7499,
"step": 250
},
{
"epoch": 0.03518744079036405,
"grad_norm": 10.802916526794434,
"learning_rate": 1.1682453766350926e-05,
"loss": 0.6447,
"step": 260
},
{
"epoch": 0.03654080389768575,
"grad_norm": 24.211503982543945,
"learning_rate": 1.2133513757329726e-05,
"loss": 0.6241,
"step": 270
},
{
"epoch": 0.037894167005007445,
"grad_norm": 26.76028060913086,
"learning_rate": 1.2584573748308526e-05,
"loss": 0.4461,
"step": 280
},
{
"epoch": 0.039247530112329135,
"grad_norm": 20.0728759765625,
"learning_rate": 1.3035633739287325e-05,
"loss": 0.6034,
"step": 290
},
{
"epoch": 0.04060089321965083,
"grad_norm": 15.319375038146973,
"learning_rate": 1.3486693730266125e-05,
"loss": 0.5155,
"step": 300
},
{
"epoch": 0.04060089321965083,
"eval_accuracy": 0.7418569254185693,
"eval_loss": 0.5153380632400513,
"eval_runtime": 586.0685,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 300
},
{
"epoch": 0.04195425632697253,
"grad_norm": 22.047800064086914,
"learning_rate": 1.3937753721244926e-05,
"loss": 0.7796,
"step": 310
},
{
"epoch": 0.043307619434294224,
"grad_norm": 23.10489845275879,
"learning_rate": 1.4388813712223726e-05,
"loss": 0.5833,
"step": 320
},
{
"epoch": 0.044660982541615914,
"grad_norm": 34.646427154541016,
"learning_rate": 1.4839873703202526e-05,
"loss": 0.4678,
"step": 330
},
{
"epoch": 0.04601434564893761,
"grad_norm": 36.290771484375,
"learning_rate": 1.529093369418133e-05,
"loss": 0.5535,
"step": 340
},
{
"epoch": 0.04736770875625931,
"grad_norm": 4.73361873626709,
"learning_rate": 1.5741993685160127e-05,
"loss": 0.3168,
"step": 350
},
{
"epoch": 0.048721071863580996,
"grad_norm": 14.30739974975586,
"learning_rate": 1.6193053676138928e-05,
"loss": 0.3948,
"step": 360
},
{
"epoch": 0.05007443497090269,
"grad_norm": 15.850822448730469,
"learning_rate": 1.664411366711773e-05,
"loss": 0.6431,
"step": 370
},
{
"epoch": 0.05142779807822439,
"grad_norm": 40.54030990600586,
"learning_rate": 1.7095173658096527e-05,
"loss": 0.6361,
"step": 380
},
{
"epoch": 0.05278116118554608,
"grad_norm": 3.195178270339966,
"learning_rate": 1.754623364907533e-05,
"loss": 0.4635,
"step": 390
},
{
"epoch": 0.054134524292867775,
"grad_norm": 20.665943145751953,
"learning_rate": 1.7997293640054127e-05,
"loss": 0.5693,
"step": 400
},
{
"epoch": 0.054134524292867775,
"eval_accuracy": 0.7948249619482496,
"eval_loss": 0.4957842528820038,
"eval_runtime": 585.7637,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 400
},
{
"epoch": 0.05548788740018947,
"grad_norm": 24.18285369873047,
"learning_rate": 1.8448353631032928e-05,
"loss": 0.6247,
"step": 410
},
{
"epoch": 0.05684125050751117,
"grad_norm": 37.275447845458984,
"learning_rate": 1.8899413622011726e-05,
"loss": 0.5164,
"step": 420
},
{
"epoch": 0.05819461361483286,
"grad_norm": 22.858854293823242,
"learning_rate": 1.9350473612990527e-05,
"loss": 0.4309,
"step": 430
},
{
"epoch": 0.059547976722154554,
"grad_norm": 11.075826644897461,
"learning_rate": 1.980153360396933e-05,
"loss": 0.3716,
"step": 440
},
{
"epoch": 0.06090133982947625,
"grad_norm": 20.966968536376953,
"learning_rate": 2.025259359494813e-05,
"loss": 0.5862,
"step": 450
},
{
"epoch": 0.06225470293679794,
"grad_norm": 16.335777282714844,
"learning_rate": 2.070365358592693e-05,
"loss": 0.293,
"step": 460
},
{
"epoch": 0.06360806604411964,
"grad_norm": 42.5803108215332,
"learning_rate": 2.115471357690573e-05,
"loss": 0.336,
"step": 470
},
{
"epoch": 0.06496142915144133,
"grad_norm": 1.7485947608947754,
"learning_rate": 2.160577356788453e-05,
"loss": 0.2536,
"step": 480
},
{
"epoch": 0.06631479225876302,
"grad_norm": 28.608144760131836,
"learning_rate": 2.205683355886333e-05,
"loss": 0.4017,
"step": 490
},
{
"epoch": 0.06766815536608473,
"grad_norm": 35.06800079345703,
"learning_rate": 2.250789354984213e-05,
"loss": 0.3398,
"step": 500
},
{
"epoch": 0.06766815536608473,
"eval_accuracy": 0.8213089802130898,
"eval_loss": 0.6229318380355835,
"eval_runtime": 586.2164,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 5.604,
"step": 500
},
{
"epoch": 0.06902151847340642,
"grad_norm": 41.761592864990234,
"learning_rate": 2.2958953540820928e-05,
"loss": 0.6924,
"step": 510
},
{
"epoch": 0.0703748815807281,
"grad_norm": 26.0762882232666,
"learning_rate": 2.341001353179973e-05,
"loss": 0.4984,
"step": 520
},
{
"epoch": 0.07172824468804981,
"grad_norm": 51.21356201171875,
"learning_rate": 2.386107352277853e-05,
"loss": 1.0133,
"step": 530
},
{
"epoch": 0.0730816077953715,
"grad_norm": 9.679603576660156,
"learning_rate": 2.4312133513757332e-05,
"loss": 0.7537,
"step": 540
},
{
"epoch": 0.07443497090269319,
"grad_norm": 8.944125175476074,
"learning_rate": 2.4763193504736133e-05,
"loss": 0.1747,
"step": 550
},
{
"epoch": 0.07578833401001489,
"grad_norm": 0.24319039285182953,
"learning_rate": 2.5214253495714928e-05,
"loss": 0.2228,
"step": 560
},
{
"epoch": 0.07714169711733658,
"grad_norm": 34.457130432128906,
"learning_rate": 2.5665313486693732e-05,
"loss": 0.6227,
"step": 570
},
{
"epoch": 0.07849506022465827,
"grad_norm": 25.974857330322266,
"learning_rate": 2.6116373477672534e-05,
"loss": 0.2999,
"step": 580
},
{
"epoch": 0.07984842333197997,
"grad_norm": 29.637929916381836,
"learning_rate": 2.6567433468651332e-05,
"loss": 0.4552,
"step": 590
},
{
"epoch": 0.08120178643930166,
"grad_norm": 22.073013305664062,
"learning_rate": 2.7018493459630133e-05,
"loss": 0.4963,
"step": 600
},
{
"epoch": 0.08120178643930166,
"eval_accuracy": 0.8307458143074582,
"eval_loss": 0.4110361933708191,
"eval_runtime": 586.3068,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 600
},
{
"epoch": 0.08255514954662337,
"grad_norm": 0.12677842378616333,
"learning_rate": 2.746955345060893e-05,
"loss": 0.3773,
"step": 610
},
{
"epoch": 0.08390851265394506,
"grad_norm": 42.65321350097656,
"learning_rate": 2.7920613441587736e-05,
"loss": 0.6737,
"step": 620
},
{
"epoch": 0.08526187576126674,
"grad_norm": 26.458065032958984,
"learning_rate": 2.837167343256653e-05,
"loss": 0.6558,
"step": 630
},
{
"epoch": 0.08661523886858845,
"grad_norm": 5.073329925537109,
"learning_rate": 2.8822733423545335e-05,
"loss": 0.3947,
"step": 640
},
{
"epoch": 0.08796860197591014,
"grad_norm": 2.559771776199341,
"learning_rate": 2.927379341452413e-05,
"loss": 0.2376,
"step": 650
},
{
"epoch": 0.08932196508323183,
"grad_norm": 46.948219299316406,
"learning_rate": 2.9724853405502934e-05,
"loss": 0.6246,
"step": 660
},
{
"epoch": 0.09067532819055353,
"grad_norm": 20.228105545043945,
"learning_rate": 3.0175913396481732e-05,
"loss": 0.4244,
"step": 670
},
{
"epoch": 0.09202869129787522,
"grad_norm": 5.761535167694092,
"learning_rate": 3.0626973387460534e-05,
"loss": 0.356,
"step": 680
},
{
"epoch": 0.09338205440519691,
"grad_norm": 0.10911667346954346,
"learning_rate": 3.107803337843934e-05,
"loss": 0.4661,
"step": 690
},
{
"epoch": 0.09473541751251861,
"grad_norm": 12.342616081237793,
"learning_rate": 3.1529093369418136e-05,
"loss": 0.4579,
"step": 700
},
{
"epoch": 0.09473541751251861,
"eval_accuracy": 0.8584474885844748,
"eval_loss": 0.4246827960014343,
"eval_runtime": 586.4424,
"eval_samples_per_second": 5.602,
"eval_steps_per_second": 5.602,
"step": 700
},
{
"epoch": 0.0960887806198403,
"grad_norm": 42.24137496948242,
"learning_rate": 3.1980153360396934e-05,
"loss": 0.52,
"step": 710
},
{
"epoch": 0.09744214372716199,
"grad_norm": 0.19052202999591827,
"learning_rate": 3.243121335137573e-05,
"loss": 0.3293,
"step": 720
},
{
"epoch": 0.0987955068344837,
"grad_norm": 29.600139617919922,
"learning_rate": 3.288227334235454e-05,
"loss": 0.5003,
"step": 730
},
{
"epoch": 0.10014886994180539,
"grad_norm": 30.022396087646484,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.5022,
"step": 740
},
{
"epoch": 0.10150223304912707,
"grad_norm": 1.689180850982666,
"learning_rate": 3.378439332431213e-05,
"loss": 0.4278,
"step": 750
},
{
"epoch": 0.10285559615644878,
"grad_norm": 0.31317076086997986,
"learning_rate": 3.423545331529093e-05,
"loss": 0.4199,
"step": 760
},
{
"epoch": 0.10420895926377047,
"grad_norm": 20.078535079956055,
"learning_rate": 3.4686513306269736e-05,
"loss": 0.3041,
"step": 770
},
{
"epoch": 0.10556232237109216,
"grad_norm": 20.944692611694336,
"learning_rate": 3.513757329724854e-05,
"loss": 0.8085,
"step": 780
},
{
"epoch": 0.10691568547841386,
"grad_norm": 9.586214065551758,
"learning_rate": 3.558863328822734e-05,
"loss": 0.5523,
"step": 790
},
{
"epoch": 0.10826904858573555,
"grad_norm": 31.01270866394043,
"learning_rate": 3.6039693279206136e-05,
"loss": 0.472,
"step": 800
},
{
"epoch": 0.10826904858573555,
"eval_accuracy": 0.8681887366818873,
"eval_loss": 0.32172152400016785,
"eval_runtime": 586.2592,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 800
},
{
"epoch": 0.10962241169305725,
"grad_norm": 0.7769675850868225,
"learning_rate": 3.6490753270184934e-05,
"loss": 0.2738,
"step": 810
},
{
"epoch": 0.11097577480037894,
"grad_norm": 37.9739875793457,
"learning_rate": 3.694181326116374e-05,
"loss": 0.4221,
"step": 820
},
{
"epoch": 0.11232913790770063,
"grad_norm": 4.82348108291626,
"learning_rate": 3.739287325214254e-05,
"loss": 0.2673,
"step": 830
},
{
"epoch": 0.11368250101502234,
"grad_norm": 0.9005016088485718,
"learning_rate": 3.7843933243121335e-05,
"loss": 0.2568,
"step": 840
},
{
"epoch": 0.11503586412234403,
"grad_norm": 23.65435028076172,
"learning_rate": 3.829499323410013e-05,
"loss": 0.4254,
"step": 850
},
{
"epoch": 0.11638922722966571,
"grad_norm": 0.8585798144340515,
"learning_rate": 3.874605322507894e-05,
"loss": 0.8521,
"step": 860
},
{
"epoch": 0.11774259033698742,
"grad_norm": 33.647430419921875,
"learning_rate": 3.9197113216057735e-05,
"loss": 1.389,
"step": 870
},
{
"epoch": 0.11909595344430911,
"grad_norm": 14.677156448364258,
"learning_rate": 3.964817320703654e-05,
"loss": 0.1391,
"step": 880
},
{
"epoch": 0.1204493165516308,
"grad_norm": 1.6325709819793701,
"learning_rate": 4.009923319801534e-05,
"loss": 0.7501,
"step": 890
},
{
"epoch": 0.1218026796589525,
"grad_norm": 0.28640714287757874,
"learning_rate": 4.0550293188994136e-05,
"loss": 0.4304,
"step": 900
},
{
"epoch": 0.1218026796589525,
"eval_accuracy": 0.8639269406392694,
"eval_loss": 0.3807564675807953,
"eval_runtime": 586.1022,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 900
},
{
"epoch": 0.12315604276627419,
"grad_norm": 25.761606216430664,
"learning_rate": 4.100135317997294e-05,
"loss": 0.2124,
"step": 910
},
{
"epoch": 0.12450940587359588,
"grad_norm": 4.299468994140625,
"learning_rate": 4.145241317095174e-05,
"loss": 0.443,
"step": 920
},
{
"epoch": 0.12586276898091758,
"grad_norm": 2.907810688018799,
"learning_rate": 4.190347316193054e-05,
"loss": 0.3628,
"step": 930
},
{
"epoch": 0.1272161320882393,
"grad_norm": 0.42966228723526,
"learning_rate": 4.2354533152909335e-05,
"loss": 0.5665,
"step": 940
},
{
"epoch": 0.12856949519556096,
"grad_norm": 0.043091725558042526,
"learning_rate": 4.280559314388814e-05,
"loss": 0.6238,
"step": 950
},
{
"epoch": 0.12992285830288267,
"grad_norm": 4.317753791809082,
"learning_rate": 4.325665313486694e-05,
"loss": 0.4853,
"step": 960
},
{
"epoch": 0.13127622141020437,
"grad_norm": 6.607635974884033,
"learning_rate": 4.370771312584574e-05,
"loss": 0.5516,
"step": 970
},
{
"epoch": 0.13262958451752604,
"grad_norm": 5.1537861824035645,
"learning_rate": 4.415877311682454e-05,
"loss": 0.2681,
"step": 980
},
{
"epoch": 0.13398294762484775,
"grad_norm": 18.95270538330078,
"learning_rate": 4.460983310780334e-05,
"loss": 0.1933,
"step": 990
},
{
"epoch": 0.13533631073216945,
"grad_norm": 37.21832275390625,
"learning_rate": 4.506089309878214e-05,
"loss": 0.5274,
"step": 1000
},
{
"epoch": 0.13533631073216945,
"eval_accuracy": 0.8748858447488584,
"eval_loss": 0.46359485387802124,
"eval_runtime": 586.3431,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1000
},
{
"epoch": 0.13668967383949113,
"grad_norm": 31.545852661132812,
"learning_rate": 4.551195308976094e-05,
"loss": 0.4664,
"step": 1010
},
{
"epoch": 0.13804303694681283,
"grad_norm": 10.666961669921875,
"learning_rate": 4.596301308073974e-05,
"loss": 0.3493,
"step": 1020
},
{
"epoch": 0.13939640005413453,
"grad_norm": 18.62867546081543,
"learning_rate": 4.641407307171854e-05,
"loss": 0.3411,
"step": 1030
},
{
"epoch": 0.1407497631614562,
"grad_norm": 0.038888320326805115,
"learning_rate": 4.686513306269734e-05,
"loss": 0.2676,
"step": 1040
},
{
"epoch": 0.1421031262687779,
"grad_norm": 0.8312036991119385,
"learning_rate": 4.731619305367614e-05,
"loss": 0.577,
"step": 1050
},
{
"epoch": 0.14345648937609962,
"grad_norm": 0.6841209530830383,
"learning_rate": 4.7767253044654944e-05,
"loss": 0.2822,
"step": 1060
},
{
"epoch": 0.1448098524834213,
"grad_norm": 2.010380983352661,
"learning_rate": 4.821831303563374e-05,
"loss": 0.214,
"step": 1070
},
{
"epoch": 0.146163215590743,
"grad_norm": 0.2284691035747528,
"learning_rate": 4.866937302661254e-05,
"loss": 0.6455,
"step": 1080
},
{
"epoch": 0.1475165786980647,
"grad_norm": 15.199652671813965,
"learning_rate": 4.9120433017591345e-05,
"loss": 0.1528,
"step": 1090
},
{
"epoch": 0.14886994180538637,
"grad_norm": 0.560634195804596,
"learning_rate": 4.957149300857014e-05,
"loss": 0.5175,
"step": 1100
},
{
"epoch": 0.14886994180538637,
"eval_accuracy": 0.8745814307458143,
"eval_loss": 0.37603330612182617,
"eval_runtime": 586.3343,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1100
},
{
"epoch": 0.15022330491270808,
"grad_norm": 0.30973055958747864,
"learning_rate": 5.002255299954894e-05,
"loss": 0.1818,
"step": 1110
},
{
"epoch": 0.15157666802002978,
"grad_norm": 0.012624499388039112,
"learning_rate": 5.047361299052774e-05,
"loss": 0.6186,
"step": 1120
},
{
"epoch": 0.15293003112735146,
"grad_norm": 32.54011917114258,
"learning_rate": 5.0924672981506537e-05,
"loss": 0.5022,
"step": 1130
},
{
"epoch": 0.15428339423467316,
"grad_norm": 3.9204163551330566,
"learning_rate": 5.137573297248535e-05,
"loss": 0.3571,
"step": 1140
},
{
"epoch": 0.15563675734199486,
"grad_norm": 0.25882434844970703,
"learning_rate": 5.1826792963464146e-05,
"loss": 0.4617,
"step": 1150
},
{
"epoch": 0.15699012044931654,
"grad_norm": 0.029281217604875565,
"learning_rate": 5.2277852954442944e-05,
"loss": 0.2459,
"step": 1160
},
{
"epoch": 0.15834348355663824,
"grad_norm": 15.895069122314453,
"learning_rate": 5.272891294542175e-05,
"loss": 0.3078,
"step": 1170
},
{
"epoch": 0.15969684666395995,
"grad_norm": 15.33349323272705,
"learning_rate": 5.317997293640055e-05,
"loss": 0.5131,
"step": 1180
},
{
"epoch": 0.16105020977128162,
"grad_norm": 6.226064205169678,
"learning_rate": 5.3631032927379345e-05,
"loss": 0.1718,
"step": 1190
},
{
"epoch": 0.16240357287860333,
"grad_norm": 20.2957820892334,
"learning_rate": 5.408209291835814e-05,
"loss": 0.671,
"step": 1200
},
{
"epoch": 0.16240357287860333,
"eval_accuracy": 0.871841704718417,
"eval_loss": 0.36159640550613403,
"eval_runtime": 586.2928,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1200
},
{
"epoch": 0.16375693598592503,
"grad_norm": 0.6848016977310181,
"learning_rate": 5.453315290933695e-05,
"loss": 0.7948,
"step": 1210
},
{
"epoch": 0.16511029909324673,
"grad_norm": 2.2201507091522217,
"learning_rate": 5.4984212900315745e-05,
"loss": 0.3166,
"step": 1220
},
{
"epoch": 0.1664636622005684,
"grad_norm": 17.319942474365234,
"learning_rate": 5.543527289129454e-05,
"loss": 0.6259,
"step": 1230
},
{
"epoch": 0.1678170253078901,
"grad_norm": 11.914515495300293,
"learning_rate": 5.588633288227334e-05,
"loss": 0.4776,
"step": 1240
},
{
"epoch": 0.16917038841521181,
"grad_norm": 19.129995346069336,
"learning_rate": 5.6337392873252146e-05,
"loss": 0.3623,
"step": 1250
},
{
"epoch": 0.1705237515225335,
"grad_norm": 6.791560649871826,
"learning_rate": 5.6788452864230944e-05,
"loss": 0.1206,
"step": 1260
},
{
"epoch": 0.1718771146298552,
"grad_norm": 24.70437240600586,
"learning_rate": 5.723951285520974e-05,
"loss": 0.6828,
"step": 1270
},
{
"epoch": 0.1732304777371769,
"grad_norm": 1.6283477544784546,
"learning_rate": 5.769057284618855e-05,
"loss": 0.2961,
"step": 1280
},
{
"epoch": 0.17458384084449857,
"grad_norm": 0.2976061701774597,
"learning_rate": 5.814163283716735e-05,
"loss": 0.2001,
"step": 1290
},
{
"epoch": 0.17593720395182028,
"grad_norm": 5.002806663513184,
"learning_rate": 5.859269282814614e-05,
"loss": 0.7133,
"step": 1300
},
{
"epoch": 0.17593720395182028,
"eval_accuracy": 0.8916286149162862,
"eval_loss": 0.37970247864723206,
"eval_runtime": 586.3256,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1300
},
{
"epoch": 0.17729056705914198,
"grad_norm": 11.172118186950684,
"learning_rate": 5.904375281912494e-05,
"loss": 0.2119,
"step": 1310
},
{
"epoch": 0.17864393016646365,
"grad_norm": 16.503934860229492,
"learning_rate": 5.949481281010375e-05,
"loss": 0.5483,
"step": 1320
},
{
"epoch": 0.17999729327378536,
"grad_norm": 0.00956371147185564,
"learning_rate": 5.994587280108255e-05,
"loss": 0.1664,
"step": 1330
},
{
"epoch": 0.18135065638110706,
"grad_norm": 1.2320352792739868,
"learning_rate": 6.039693279206135e-05,
"loss": 0.5364,
"step": 1340
},
{
"epoch": 0.18270401948842874,
"grad_norm": 2.635582208633423,
"learning_rate": 6.084799278304014e-05,
"loss": 0.2823,
"step": 1350
},
{
"epoch": 0.18405738259575044,
"grad_norm": 0.07258131355047226,
"learning_rate": 6.129905277401894e-05,
"loss": 0.2589,
"step": 1360
},
{
"epoch": 0.18541074570307214,
"grad_norm": 0.6865749359130859,
"learning_rate": 6.175011276499775e-05,
"loss": 0.5216,
"step": 1370
},
{
"epoch": 0.18676410881039382,
"grad_norm": 0.460299015045166,
"learning_rate": 6.220117275597654e-05,
"loss": 0.2386,
"step": 1380
},
{
"epoch": 0.18811747191771552,
"grad_norm": 0.32845044136047363,
"learning_rate": 6.265223274695536e-05,
"loss": 0.6403,
"step": 1390
},
{
"epoch": 0.18947083502503723,
"grad_norm": 19.33603286743164,
"learning_rate": 6.310329273793415e-05,
"loss": 0.3134,
"step": 1400
},
{
"epoch": 0.18947083502503723,
"eval_accuracy": 0.8964992389649924,
"eval_loss": 0.2971131503582001,
"eval_runtime": 586.3223,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1400
},
{
"epoch": 0.1908241981323589,
"grad_norm": 18.192237854003906,
"learning_rate": 6.355435272891294e-05,
"loss": 0.308,
"step": 1410
},
{
"epoch": 0.1921775612396806,
"grad_norm": 0.7239986658096313,
"learning_rate": 6.400541271989174e-05,
"loss": 0.1362,
"step": 1420
},
{
"epoch": 0.1935309243470023,
"grad_norm": 0.13601917028427124,
"learning_rate": 6.445647271087055e-05,
"loss": 0.3244,
"step": 1430
},
{
"epoch": 0.19488428745432398,
"grad_norm": 0.047646623104810715,
"learning_rate": 6.490753270184935e-05,
"loss": 0.4921,
"step": 1440
},
{
"epoch": 0.1962376505616457,
"grad_norm": 0.7094109058380127,
"learning_rate": 6.535859269282815e-05,
"loss": 0.1284,
"step": 1450
},
{
"epoch": 0.1975910136689674,
"grad_norm": 0.036302514374256134,
"learning_rate": 6.580965268380695e-05,
"loss": 0.1962,
"step": 1460
},
{
"epoch": 0.19894437677628907,
"grad_norm": 0.2627946138381958,
"learning_rate": 6.626071267478576e-05,
"loss": 0.3998,
"step": 1470
},
{
"epoch": 0.20029773988361077,
"grad_norm": 11.520143508911133,
"learning_rate": 6.671177266576455e-05,
"loss": 0.311,
"step": 1480
},
{
"epoch": 0.20165110299093247,
"grad_norm": 0.5052374005317688,
"learning_rate": 6.716283265674335e-05,
"loss": 0.4324,
"step": 1490
},
{
"epoch": 0.20300446609825415,
"grad_norm": 0.007840686477720737,
"learning_rate": 6.761389264772216e-05,
"loss": 0.0874,
"step": 1500
},
{
"epoch": 0.20300446609825415,
"eval_accuracy": 0.8898021308980213,
"eval_loss": 0.5496937036514282,
"eval_runtime": 586.3431,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 1500
},
{
"epoch": 0.20435782920557585,
"grad_norm": 3.039323329925537,
"learning_rate": 6.806495263870095e-05,
"loss": 0.4797,
"step": 1510
},
{
"epoch": 0.20571119231289756,
"grad_norm": 0.00013633279013447464,
"learning_rate": 6.851601262967975e-05,
"loss": 0.4127,
"step": 1520
},
{
"epoch": 0.20706455542021923,
"grad_norm": 0.127132847905159,
"learning_rate": 6.896707262065854e-05,
"loss": 0.5609,
"step": 1530
},
{
"epoch": 0.20841791852754094,
"grad_norm": 22.69544792175293,
"learning_rate": 6.941813261163735e-05,
"loss": 0.3238,
"step": 1540
},
{
"epoch": 0.20977128163486264,
"grad_norm": 0.02062298357486725,
"learning_rate": 6.986919260261615e-05,
"loss": 0.4243,
"step": 1550
},
{
"epoch": 0.21112464474218431,
"grad_norm": 23.498069763183594,
"learning_rate": 7.032025259359494e-05,
"loss": 0.9495,
"step": 1560
},
{
"epoch": 0.21247800784950602,
"grad_norm": 0.8411264419555664,
"learning_rate": 7.077131258457376e-05,
"loss": 0.1026,
"step": 1570
},
{
"epoch": 0.21383137095682772,
"grad_norm": 0.030804749578237534,
"learning_rate": 7.122237257555255e-05,
"loss": 0.4295,
"step": 1580
},
{
"epoch": 0.21518473406414942,
"grad_norm": 2.670105218887329,
"learning_rate": 7.167343256653136e-05,
"loss": 0.7808,
"step": 1590
},
{
"epoch": 0.2165380971714711,
"grad_norm": 1.184561014175415,
"learning_rate": 7.212449255751015e-05,
"loss": 0.4684,
"step": 1600
},
{
"epoch": 0.2165380971714711,
"eval_accuracy": 0.8971080669710807,
"eval_loss": 0.4002440571784973,
"eval_runtime": 586.6131,
"eval_samples_per_second": 5.6,
"eval_steps_per_second": 5.6,
"step": 1600
},
{
"epoch": 0.2178914602787928,
"grad_norm": 0.03285335749387741,
"learning_rate": 7.257555254848895e-05,
"loss": 0.0504,
"step": 1610
},
{
"epoch": 0.2192448233861145,
"grad_norm": 0.07612759619951248,
"learning_rate": 7.302661253946776e-05,
"loss": 0.4363,
"step": 1620
},
{
"epoch": 0.22059818649343618,
"grad_norm": 17.422269821166992,
"learning_rate": 7.347767253044655e-05,
"loss": 0.0434,
"step": 1630
},
{
"epoch": 0.22195154960075789,
"grad_norm": 0.026716621592640877,
"learning_rate": 7.392873252142535e-05,
"loss": 0.4682,
"step": 1640
},
{
"epoch": 0.2233049127080796,
"grad_norm": 0.9974814057350159,
"learning_rate": 7.437979251240416e-05,
"loss": 0.3381,
"step": 1650
},
{
"epoch": 0.22465827581540126,
"grad_norm": 0.2876833975315094,
"learning_rate": 7.483085250338295e-05,
"loss": 0.5237,
"step": 1660
},
{
"epoch": 0.22601163892272297,
"grad_norm": 0.009840277023613453,
"learning_rate": 7.528191249436175e-05,
"loss": 0.1314,
"step": 1670
},
{
"epoch": 0.22736500203004467,
"grad_norm": 0.004841567948460579,
"learning_rate": 7.573297248534056e-05,
"loss": 0.6736,
"step": 1680
},
{
"epoch": 0.22871836513736635,
"grad_norm": 5.4699602127075195,
"learning_rate": 7.618403247631935e-05,
"loss": 0.4901,
"step": 1690
},
{
"epoch": 0.23007172824468805,
"grad_norm": 15.60845947265625,
"learning_rate": 7.663509246729816e-05,
"loss": 0.1759,
"step": 1700
},
{
"epoch": 0.23007172824468805,
"eval_accuracy": 0.8943683409436834,
"eval_loss": 0.33465254306793213,
"eval_runtime": 586.0743,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 1700
},
{
"epoch": 0.23142509135200975,
"grad_norm": 25.261932373046875,
"learning_rate": 7.708615245827695e-05,
"loss": 0.347,
"step": 1710
},
{
"epoch": 0.23277845445933143,
"grad_norm": 0.9137635231018066,
"learning_rate": 7.753721244925576e-05,
"loss": 0.5517,
"step": 1720
},
{
"epoch": 0.23413181756665313,
"grad_norm": 1.7071709632873535,
"learning_rate": 7.798827244023456e-05,
"loss": 0.0289,
"step": 1730
},
{
"epoch": 0.23548518067397484,
"grad_norm": 1.1686097383499146,
"learning_rate": 7.843933243121335e-05,
"loss": 0.5705,
"step": 1740
},
{
"epoch": 0.2368385437812965,
"grad_norm": 6.95435094833374,
"learning_rate": 7.889039242219217e-05,
"loss": 0.4207,
"step": 1750
},
{
"epoch": 0.23819190688861822,
"grad_norm": 1.6849822998046875,
"learning_rate": 7.934145241317096e-05,
"loss": 0.1384,
"step": 1760
},
{
"epoch": 0.23954526999593992,
"grad_norm": 19.90876007080078,
"learning_rate": 7.979251240414976e-05,
"loss": 0.4826,
"step": 1770
},
{
"epoch": 0.2408986331032616,
"grad_norm": 19.53013801574707,
"learning_rate": 8.024357239512855e-05,
"loss": 0.4383,
"step": 1780
},
{
"epoch": 0.2422519962105833,
"grad_norm": 0.00940781645476818,
"learning_rate": 8.069463238610736e-05,
"loss": 0.1539,
"step": 1790
},
{
"epoch": 0.243605359317905,
"grad_norm": 0.9285580515861511,
"learning_rate": 8.114569237708616e-05,
"loss": 0.0173,
"step": 1800
},
{
"epoch": 0.243605359317905,
"eval_accuracy": 0.8964992389649924,
"eval_loss": 0.7079048156738281,
"eval_runtime": 586.1865,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 5.604,
"step": 1800
},
{
"epoch": 0.24495872242522668,
"grad_norm": 1.0155157070812493e-07,
"learning_rate": 8.159675236806495e-05,
"loss": 0.6124,
"step": 1810
},
{
"epoch": 0.24631208553254838,
"grad_norm": 26.104318618774414,
"learning_rate": 8.204781235904376e-05,
"loss": 1.1316,
"step": 1820
},
{
"epoch": 0.24766544863987008,
"grad_norm": 36.21076202392578,
"learning_rate": 8.249887235002256e-05,
"loss": 1.1308,
"step": 1830
},
{
"epoch": 0.24901881174719176,
"grad_norm": 6.247893333435059,
"learning_rate": 8.294993234100135e-05,
"loss": 0.3823,
"step": 1840
},
{
"epoch": 0.25037217485451346,
"grad_norm": 2.5336642265319824,
"learning_rate": 8.340099233198016e-05,
"loss": 0.3386,
"step": 1850
},
{
"epoch": 0.25172553796183517,
"grad_norm": 0.1235450878739357,
"learning_rate": 8.385205232295896e-05,
"loss": 0.282,
"step": 1860
},
{
"epoch": 0.25307890106915687,
"grad_norm": 0.0033467705361545086,
"learning_rate": 8.430311231393775e-05,
"loss": 0.0699,
"step": 1870
},
{
"epoch": 0.2544322641764786,
"grad_norm": 46.03356170654297,
"learning_rate": 8.475417230491656e-05,
"loss": 0.6011,
"step": 1880
},
{
"epoch": 0.2557856272838002,
"grad_norm": 32.812896728515625,
"learning_rate": 8.520523229589535e-05,
"loss": 0.4655,
"step": 1890
},
{
"epoch": 0.2571389903911219,
"grad_norm": 0.27037665247917175,
"learning_rate": 8.565629228687417e-05,
"loss": 0.2309,
"step": 1900
},
{
"epoch": 0.2571389903911219,
"eval_accuracy": 0.9007610350076104,
"eval_loss": 0.35448792576789856,
"eval_runtime": 585.6872,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 1900
},
{
"epoch": 0.2584923534984436,
"grad_norm": 0.004123059567064047,
"learning_rate": 8.610735227785296e-05,
"loss": 0.4046,
"step": 1910
},
{
"epoch": 0.25984571660576533,
"grad_norm": 11.693790435791016,
"learning_rate": 8.655841226883175e-05,
"loss": 0.1442,
"step": 1920
},
{
"epoch": 0.26119907971308703,
"grad_norm": 0.026265673339366913,
"learning_rate": 8.700947225981056e-05,
"loss": 0.1297,
"step": 1930
},
{
"epoch": 0.26255244282040874,
"grad_norm": 31.22381019592285,
"learning_rate": 8.746053225078936e-05,
"loss": 0.1566,
"step": 1940
},
{
"epoch": 0.2639058059277304,
"grad_norm": 0.0626770630478859,
"learning_rate": 8.791159224176817e-05,
"loss": 1.2843,
"step": 1950
},
{
"epoch": 0.2652591690350521,
"grad_norm": 6.931087017059326,
"learning_rate": 8.836265223274696e-05,
"loss": 0.8391,
"step": 1960
},
{
"epoch": 0.2666125321423738,
"grad_norm": 1.1776670217514038,
"learning_rate": 8.881371222372576e-05,
"loss": 0.3079,
"step": 1970
},
{
"epoch": 0.2679658952496955,
"grad_norm": 0.06983193755149841,
"learning_rate": 8.926477221470457e-05,
"loss": 0.1029,
"step": 1980
},
{
"epoch": 0.2693192583570172,
"grad_norm": 3.716341972351074,
"learning_rate": 8.971583220568336e-05,
"loss": 0.3396,
"step": 1990
},
{
"epoch": 0.2706726214643389,
"grad_norm": 35.3888053894043,
"learning_rate": 9.016689219666216e-05,
"loss": 0.3427,
"step": 2000
},
{
"epoch": 0.2706726214643389,
"eval_accuracy": 0.8840182648401826,
"eval_loss": 0.6963387727737427,
"eval_runtime": 586.0714,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 2000
},
{
"epoch": 0.27202598457166055,
"grad_norm": 0.007088725455105305,
"learning_rate": 9.061795218764097e-05,
"loss": 0.2362,
"step": 2010
},
{
"epoch": 0.27337934767898225,
"grad_norm": 3.1651086807250977,
"learning_rate": 9.106901217861976e-05,
"loss": 0.2904,
"step": 2020
},
{
"epoch": 0.27473271078630396,
"grad_norm": 0.019235916435718536,
"learning_rate": 9.152007216959856e-05,
"loss": 0.5162,
"step": 2030
},
{
"epoch": 0.27608607389362566,
"grad_norm": 10.593847274780273,
"learning_rate": 9.197113216057735e-05,
"loss": 0.5283,
"step": 2040
},
{
"epoch": 0.27743943700094736,
"grad_norm": 2.6608517169952393,
"learning_rate": 9.242219215155616e-05,
"loss": 0.0297,
"step": 2050
},
{
"epoch": 0.27879280010826907,
"grad_norm": 11.656271934509277,
"learning_rate": 9.287325214253496e-05,
"loss": 0.6071,
"step": 2060
},
{
"epoch": 0.2801461632155907,
"grad_norm": 3.8772504329681396,
"learning_rate": 9.332431213351375e-05,
"loss": 0.3125,
"step": 2070
},
{
"epoch": 0.2814995263229124,
"grad_norm": 2.7373952865600586,
"learning_rate": 9.377537212449257e-05,
"loss": 0.8945,
"step": 2080
},
{
"epoch": 0.2828528894302341,
"grad_norm": 0.14976456761360168,
"learning_rate": 9.422643211547136e-05,
"loss": 0.2199,
"step": 2090
},
{
"epoch": 0.2842062525375558,
"grad_norm": 1.8281391859054565,
"learning_rate": 9.467749210645016e-05,
"loss": 0.5043,
"step": 2100
},
{
"epoch": 0.2842062525375558,
"eval_accuracy": 0.893455098934551,
"eval_loss": 0.4096009433269501,
"eval_runtime": 586.9389,
"eval_samples_per_second": 5.597,
"eval_steps_per_second": 5.597,
"step": 2100
},
{
"epoch": 0.28555961564487753,
"grad_norm": 32.19676208496094,
"learning_rate": 9.512855209742896e-05,
"loss": 0.4847,
"step": 2110
},
{
"epoch": 0.28691297875219923,
"grad_norm": 1.159845232963562,
"learning_rate": 9.557961208840776e-05,
"loss": 0.2801,
"step": 2120
},
{
"epoch": 0.28826634185952094,
"grad_norm": 0.039890531450510025,
"learning_rate": 9.603067207938657e-05,
"loss": 0.3207,
"step": 2130
},
{
"epoch": 0.2896197049668426,
"grad_norm": 0.00016595161287114024,
"learning_rate": 9.648173207036536e-05,
"loss": 0.2653,
"step": 2140
},
{
"epoch": 0.2909730680741643,
"grad_norm": 0.2522432506084442,
"learning_rate": 9.693279206134417e-05,
"loss": 0.4832,
"step": 2150
},
{
"epoch": 0.292326431181486,
"grad_norm": 10.649747848510742,
"learning_rate": 9.738385205232297e-05,
"loss": 0.1198,
"step": 2160
},
{
"epoch": 0.2936797942888077,
"grad_norm": 22.584964752197266,
"learning_rate": 9.783491204330176e-05,
"loss": 0.5245,
"step": 2170
},
{
"epoch": 0.2950331573961294,
"grad_norm": 1.6885179281234741,
"learning_rate": 9.828597203428057e-05,
"loss": 0.3199,
"step": 2180
},
{
"epoch": 0.2963865205034511,
"grad_norm": 4.436751365661621,
"learning_rate": 9.873703202525937e-05,
"loss": 0.5085,
"step": 2190
},
{
"epoch": 0.29773988361077275,
"grad_norm": 0.6824151873588562,
"learning_rate": 9.918809201623816e-05,
"loss": 0.6263,
"step": 2200
},
{
"epoch": 0.29773988361077275,
"eval_accuracy": 0.869406392694064,
"eval_loss": 0.4238719642162323,
"eval_runtime": 586.4265,
"eval_samples_per_second": 5.602,
"eval_steps_per_second": 5.602,
"step": 2200
},
{
"epoch": 0.29909324671809445,
"grad_norm": 0.0010402319021522999,
"learning_rate": 9.963915200721697e-05,
"loss": 0.391,
"step": 2210
},
{
"epoch": 0.30044660982541616,
"grad_norm": 1.8170876502990723,
"learning_rate": 9.999999752021549e-05,
"loss": 0.262,
"step": 2220
},
{
"epoch": 0.30179997293273786,
"grad_norm": 1.5593578815460205,
"learning_rate": 9.999991072778369e-05,
"loss": 0.217,
"step": 2230
},
{
"epoch": 0.30315333604005956,
"grad_norm": 0.04098629578948021,
"learning_rate": 9.999969994637269e-05,
"loss": 0.3085,
"step": 2240
},
{
"epoch": 0.30450669914738127,
"grad_norm": 3.2124032974243164,
"learning_rate": 9.999936517650514e-05,
"loss": 0.1197,
"step": 2250
},
{
"epoch": 0.3058600622547029,
"grad_norm": 16.91176414489746,
"learning_rate": 9.999890641901125e-05,
"loss": 0.2527,
"step": 2260
},
{
"epoch": 0.3072134253620246,
"grad_norm": 39.94670486450195,
"learning_rate": 9.999832367502859e-05,
"loss": 0.5771,
"step": 2270
},
{
"epoch": 0.3085667884693463,
"grad_norm": 5.255732536315918,
"learning_rate": 9.999761694600227e-05,
"loss": 0.7849,
"step": 2280
},
{
"epoch": 0.309920151576668,
"grad_norm": 5.741457462310791,
"learning_rate": 9.999678623368483e-05,
"loss": 0.6328,
"step": 2290
},
{
"epoch": 0.3112735146839897,
"grad_norm": 2.6297736167907715,
"learning_rate": 9.999583154013623e-05,
"loss": 0.2523,
"step": 2300
},
{
"epoch": 0.3112735146839897,
"eval_accuracy": 0.8843226788432268,
"eval_loss": 0.28884410858154297,
"eval_runtime": 586.1248,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 2300
},
{
"epoch": 0.31262687779131143,
"grad_norm": 0.5248340368270874,
"learning_rate": 9.999475286772394e-05,
"loss": 0.2686,
"step": 2310
},
{
"epoch": 0.3139802408986331,
"grad_norm": 0.18005584180355072,
"learning_rate": 9.99935502191228e-05,
"loss": 0.3142,
"step": 2320
},
{
"epoch": 0.3153336040059548,
"grad_norm": 0.26872676610946655,
"learning_rate": 9.999222359731514e-05,
"loss": 0.1423,
"step": 2330
},
{
"epoch": 0.3166869671132765,
"grad_norm": 39.6584587097168,
"learning_rate": 9.99907730055907e-05,
"loss": 0.5064,
"step": 2340
},
{
"epoch": 0.3180403302205982,
"grad_norm": 27.253623962402344,
"learning_rate": 9.998919844754661e-05,
"loss": 0.6555,
"step": 2350
},
{
"epoch": 0.3193936933279199,
"grad_norm": 14.004796981811523,
"learning_rate": 9.998749992708744e-05,
"loss": 0.3537,
"step": 2360
},
{
"epoch": 0.3207470564352416,
"grad_norm": 10.884716033935547,
"learning_rate": 9.998567744842517e-05,
"loss": 0.1047,
"step": 2370
},
{
"epoch": 0.32210041954256324,
"grad_norm": 13.677248001098633,
"learning_rate": 9.998373101607915e-05,
"loss": 0.4014,
"step": 2380
},
{
"epoch": 0.32345378264988495,
"grad_norm": 0.055363189429044724,
"learning_rate": 9.998166063487611e-05,
"loss": 0.3669,
"step": 2390
},
{
"epoch": 0.32480714575720665,
"grad_norm": 2.3912179470062256,
"learning_rate": 9.997946630995013e-05,
"loss": 0.2504,
"step": 2400
},
{
"epoch": 0.32480714575720665,
"eval_accuracy": 0.8952815829528158,
"eval_loss": 0.3796720802783966,
"eval_runtime": 585.6459,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 2400
},
{
"epoch": 0.32616050886452835,
"grad_norm": 25.042545318603516,
"learning_rate": 9.997714804674268e-05,
"loss": 0.5136,
"step": 2410
},
{
"epoch": 0.32751387197185006,
"grad_norm": 22.99011993408203,
"learning_rate": 9.997470585100255e-05,
"loss": 0.225,
"step": 2420
},
{
"epoch": 0.32886723507917176,
"grad_norm": 18.535400390625,
"learning_rate": 9.997213972878586e-05,
"loss": 0.1548,
"step": 2430
},
{
"epoch": 0.33022059818649346,
"grad_norm": 30.402786254882812,
"learning_rate": 9.996944968645603e-05,
"loss": 0.6825,
"step": 2440
},
{
"epoch": 0.3315739612938151,
"grad_norm": 9.916972160339355,
"learning_rate": 9.99666357306838e-05,
"loss": 0.6176,
"step": 2450
},
{
"epoch": 0.3329273244011368,
"grad_norm": 29.573097229003906,
"learning_rate": 9.996369786844714e-05,
"loss": 0.438,
"step": 2460
},
{
"epoch": 0.3342806875084585,
"grad_norm": 2.503023386001587,
"learning_rate": 9.996063610703137e-05,
"loss": 0.464,
"step": 2470
},
{
"epoch": 0.3356340506157802,
"grad_norm": 4.922478675842285,
"learning_rate": 9.995745045402893e-05,
"loss": 0.3029,
"step": 2480
},
{
"epoch": 0.3369874137231019,
"grad_norm": 17.966007232666016,
"learning_rate": 9.99541409173396e-05,
"loss": 0.3314,
"step": 2490
},
{
"epoch": 0.33834077683042363,
"grad_norm": 0.032199665904045105,
"learning_rate": 9.99507075051703e-05,
"loss": 0.3536,
"step": 2500
},
{
"epoch": 0.33834077683042363,
"eval_accuracy": 0.9038051750380518,
"eval_loss": 0.259170264005661,
"eval_runtime": 585.8326,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 2500
},
{
"epoch": 0.3396941399377453,
"grad_norm": 17.7895565032959,
"learning_rate": 9.994715022603514e-05,
"loss": 0.2788,
"step": 2510
},
{
"epoch": 0.341047503045067,
"grad_norm": 0.004291262943297625,
"learning_rate": 9.994346908875543e-05,
"loss": 0.2134,
"step": 2520
},
{
"epoch": 0.3424008661523887,
"grad_norm": 12.855873107910156,
"learning_rate": 9.993966410245957e-05,
"loss": 1.1679,
"step": 2530
},
{
"epoch": 0.3437542292597104,
"grad_norm": 4.638855457305908,
"learning_rate": 9.99357352765831e-05,
"loss": 0.3796,
"step": 2540
},
{
"epoch": 0.3451075923670321,
"grad_norm": 0.11056680232286453,
"learning_rate": 9.99316826208687e-05,
"loss": 0.1339,
"step": 2550
},
{
"epoch": 0.3464609554743538,
"grad_norm": 0.006429149303585291,
"learning_rate": 9.992750614536605e-05,
"loss": 0.1356,
"step": 2560
},
{
"epoch": 0.34781431858167544,
"grad_norm": 0.028468480333685875,
"learning_rate": 9.992320586043192e-05,
"loss": 0.1875,
"step": 2570
},
{
"epoch": 0.34916768168899714,
"grad_norm": 0.16883939504623413,
"learning_rate": 9.991878177673006e-05,
"loss": 0.3505,
"step": 2580
},
{
"epoch": 0.35052104479631885,
"grad_norm": 0.0001893436856335029,
"learning_rate": 9.991423390523126e-05,
"loss": 0.2705,
"step": 2590
},
{
"epoch": 0.35187440790364055,
"grad_norm": 0.9219558835029602,
"learning_rate": 9.990956225721328e-05,
"loss": 0.3312,
"step": 2600
},
{
"epoch": 0.35187440790364055,
"eval_accuracy": 0.898021308980213,
"eval_loss": 0.3198848366737366,
"eval_runtime": 586.1648,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 5.604,
"step": 2600
},
{
"epoch": 0.35322777101096225,
"grad_norm": 0.16585978865623474,
"learning_rate": 9.990476684426075e-05,
"loss": 0.2798,
"step": 2610
},
{
"epoch": 0.35458113411828396,
"grad_norm": 27.86931800842285,
"learning_rate": 9.989984767826532e-05,
"loss": 0.3643,
"step": 2620
},
{
"epoch": 0.3559344972256056,
"grad_norm": 10.681389808654785,
"learning_rate": 9.98948047714254e-05,
"loss": 0.7329,
"step": 2630
},
{
"epoch": 0.3572878603329273,
"grad_norm": 0.3092847168445587,
"learning_rate": 9.988963813624635e-05,
"loss": 0.2119,
"step": 2640
},
{
"epoch": 0.358641223440249,
"grad_norm": 19.003339767456055,
"learning_rate": 9.98843477855403e-05,
"loss": 0.287,
"step": 2650
},
{
"epoch": 0.3599945865475707,
"grad_norm": 13.935131072998047,
"learning_rate": 9.987893373242616e-05,
"loss": 0.2836,
"step": 2660
},
{
"epoch": 0.3613479496548924,
"grad_norm": 0.20323041081428528,
"learning_rate": 9.987339599032964e-05,
"loss": 0.0202,
"step": 2670
},
{
"epoch": 0.3627013127622141,
"grad_norm": 32.1824951171875,
"learning_rate": 9.986773457298311e-05,
"loss": 0.5027,
"step": 2680
},
{
"epoch": 0.36405467586953577,
"grad_norm": 0.03664253652095795,
"learning_rate": 9.986194949442568e-05,
"loss": 0.4831,
"step": 2690
},
{
"epoch": 0.3654080389768575,
"grad_norm": 0.41898396611213684,
"learning_rate": 9.985604076900312e-05,
"loss": 0.2259,
"step": 2700
},
{
"epoch": 0.3654080389768575,
"eval_accuracy": 0.9056316590563166,
"eval_loss": 0.3357527554035187,
"eval_runtime": 585.747,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 2700
},
{
"epoch": 0.3667614020841792,
"grad_norm": 0.08296563476324081,
"learning_rate": 9.985000841136775e-05,
"loss": 0.213,
"step": 2710
},
{
"epoch": 0.3681147651915009,
"grad_norm": 3.9525413513183594,
"learning_rate": 9.984385243647855e-05,
"loss": 0.1392,
"step": 2720
},
{
"epoch": 0.3694681282988226,
"grad_norm": 0.15052391588687897,
"learning_rate": 9.983757285960098e-05,
"loss": 0.7221,
"step": 2730
},
{
"epoch": 0.3708214914061443,
"grad_norm": 7.1570258140563965,
"learning_rate": 9.983116969630706e-05,
"loss": 0.3955,
"step": 2740
},
{
"epoch": 0.37217485451346594,
"grad_norm": 0.4611501395702362,
"learning_rate": 9.982464296247522e-05,
"loss": 0.0577,
"step": 2750
},
{
"epoch": 0.37352821762078764,
"grad_norm": 0.04361600801348686,
"learning_rate": 9.981799267429037e-05,
"loss": 0.3531,
"step": 2760
},
{
"epoch": 0.37488158072810934,
"grad_norm": 5.342017650604248,
"learning_rate": 9.98112188482438e-05,
"loss": 0.174,
"step": 2770
},
{
"epoch": 0.37623494383543105,
"grad_norm": 1.9316107034683228,
"learning_rate": 9.980432150113311e-05,
"loss": 0.5168,
"step": 2780
},
{
"epoch": 0.37758830694275275,
"grad_norm": 0.29214081168174744,
"learning_rate": 9.979730065006225e-05,
"loss": 0.1109,
"step": 2790
},
{
"epoch": 0.37894167005007445,
"grad_norm": 24.477149963378906,
"learning_rate": 9.97901563124414e-05,
"loss": 0.2328,
"step": 2800
},
{
"epoch": 0.37894167005007445,
"eval_accuracy": 0.8964992389649924,
"eval_loss": 0.5654892325401306,
"eval_runtime": 586.152,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 5.604,
"step": 2800
},
{
"epoch": 0.38029503315739616,
"grad_norm": 0.38575196266174316,
"learning_rate": 9.978288850598698e-05,
"loss": 0.6395,
"step": 2810
},
{
"epoch": 0.3816483962647178,
"grad_norm": 2.8028533458709717,
"learning_rate": 9.97754972487216e-05,
"loss": 0.1854,
"step": 2820
},
{
"epoch": 0.3830017593720395,
"grad_norm": 0.018388153985142708,
"learning_rate": 9.976798255897394e-05,
"loss": 0.3204,
"step": 2830
},
{
"epoch": 0.3843551224793612,
"grad_norm": 0.017660358920693398,
"learning_rate": 9.976034445537885e-05,
"loss": 0.3979,
"step": 2840
},
{
"epoch": 0.3857084855866829,
"grad_norm": 0.19985133409500122,
"learning_rate": 9.975258295687715e-05,
"loss": 0.1326,
"step": 2850
},
{
"epoch": 0.3870618486940046,
"grad_norm": 25.679462432861328,
"learning_rate": 9.97446980827157e-05,
"loss": 0.6361,
"step": 2860
},
{
"epoch": 0.3884152118013263,
"grad_norm": 25.165712356567383,
"learning_rate": 9.973668985244724e-05,
"loss": 0.5958,
"step": 2870
},
{
"epoch": 0.38976857490864797,
"grad_norm": 14.897231101989746,
"learning_rate": 9.972855828593051e-05,
"loss": 0.462,
"step": 2880
},
{
"epoch": 0.3911219380159697,
"grad_norm": 7.232959747314453,
"learning_rate": 9.972030340333001e-05,
"loss": 0.3582,
"step": 2890
},
{
"epoch": 0.3924753011232914,
"grad_norm": 19.949562072753906,
"learning_rate": 9.971192522511608e-05,
"loss": 0.2967,
"step": 2900
},
{
"epoch": 0.3924753011232914,
"eval_accuracy": 0.9068493150684932,
"eval_loss": 0.2848670482635498,
"eval_runtime": 585.5038,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 2900
},
{
"epoch": 0.3938286642306131,
"grad_norm": 6.787762641906738,
"learning_rate": 9.970342377206477e-05,
"loss": 0.3068,
"step": 2910
},
{
"epoch": 0.3951820273379348,
"grad_norm": 0.035039156675338745,
"learning_rate": 9.969479906525785e-05,
"loss": 0.1438,
"step": 2920
},
{
"epoch": 0.3965353904452565,
"grad_norm": 0.027156801894307137,
"learning_rate": 9.968605112608273e-05,
"loss": 0.4841,
"step": 2930
},
{
"epoch": 0.39788875355257813,
"grad_norm": 16.14340591430664,
"learning_rate": 9.967717997623245e-05,
"loss": 0.1882,
"step": 2940
},
{
"epoch": 0.39924211665989984,
"grad_norm": 3.19144606590271,
"learning_rate": 9.966818563770548e-05,
"loss": 0.3626,
"step": 2950
},
{
"epoch": 0.40059547976722154,
"grad_norm": 1.6469515562057495,
"learning_rate": 9.96590681328059e-05,
"loss": 0.1981,
"step": 2960
},
{
"epoch": 0.40194884287454324,
"grad_norm": 5.4307092796079814e-06,
"learning_rate": 9.96498274841431e-05,
"loss": 0.3645,
"step": 2970
},
{
"epoch": 0.40330220598186495,
"grad_norm": 0.009263657964766026,
"learning_rate": 9.964046371463193e-05,
"loss": 0.1692,
"step": 2980
},
{
"epoch": 0.40465556908918665,
"grad_norm": 12.660831451416016,
"learning_rate": 9.963097684749251e-05,
"loss": 0.2142,
"step": 2990
},
{
"epoch": 0.4060089321965083,
"grad_norm": 3.7643866539001465,
"learning_rate": 9.962136690625019e-05,
"loss": 0.2978,
"step": 3000
},
{
"epoch": 0.4060089321965083,
"eval_accuracy": 0.8995433789954338,
"eval_loss": 0.41810643672943115,
"eval_runtime": 585.6633,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 3000
},
{
"epoch": 0.40736229530383,
"grad_norm": 25.727134704589844,
"learning_rate": 9.96116339147356e-05,
"loss": 0.2822,
"step": 3010
},
{
"epoch": 0.4087156584111517,
"grad_norm": 0.004231706261634827,
"learning_rate": 9.96017778970844e-05,
"loss": 0.1751,
"step": 3020
},
{
"epoch": 0.4100690215184734,
"grad_norm": 0.3602442741394043,
"learning_rate": 9.959179887773744e-05,
"loss": 0.1602,
"step": 3030
},
{
"epoch": 0.4114223846257951,
"grad_norm": 46.329593658447266,
"learning_rate": 9.95816968814405e-05,
"loss": 1.1669,
"step": 3040
},
{
"epoch": 0.4127757477331168,
"grad_norm": 19.935781478881836,
"learning_rate": 9.957147193324434e-05,
"loss": 0.7264,
"step": 3050
},
{
"epoch": 0.41412911084043846,
"grad_norm": 3.980362892150879,
"learning_rate": 9.956112405850466e-05,
"loss": 0.2222,
"step": 3060
},
{
"epoch": 0.41548247394776017,
"grad_norm": 24.50153160095215,
"learning_rate": 9.955065328288193e-05,
"loss": 0.3385,
"step": 3070
},
{
"epoch": 0.41683583705508187,
"grad_norm": 28.30177879333496,
"learning_rate": 9.954005963234141e-05,
"loss": 0.5514,
"step": 3080
},
{
"epoch": 0.4181892001624036,
"grad_norm": 1.61983323097229,
"learning_rate": 9.952934313315306e-05,
"loss": 0.0859,
"step": 3090
},
{
"epoch": 0.4195425632697253,
"grad_norm": 6.002326011657715,
"learning_rate": 9.95185038118915e-05,
"loss": 0.4642,
"step": 3100
},
{
"epoch": 0.4195425632697253,
"eval_accuracy": 0.9007610350076104,
"eval_loss": 0.6233646869659424,
"eval_runtime": 585.9277,
"eval_samples_per_second": 5.606,
"eval_steps_per_second": 5.606,
"step": 3100
},
{
"epoch": 0.420895926377047,
"grad_norm": 1.9925620555877686,
"learning_rate": 9.95075416954359e-05,
"loss": 0.4925,
"step": 3110
},
{
"epoch": 0.42224928948436863,
"grad_norm": 18.721527099609375,
"learning_rate": 9.949645681096995e-05,
"loss": 0.3548,
"step": 3120
},
{
"epoch": 0.42360265259169033,
"grad_norm": 0.3973216712474823,
"learning_rate": 9.948524918598175e-05,
"loss": 0.085,
"step": 3130
},
{
"epoch": 0.42495601569901204,
"grad_norm": 17.404512405395508,
"learning_rate": 9.947391884826381e-05,
"loss": 0.5475,
"step": 3140
},
{
"epoch": 0.42630937880633374,
"grad_norm": 0.08265689015388489,
"learning_rate": 9.94624658259129e-05,
"loss": 0.6296,
"step": 3150
},
{
"epoch": 0.42766274191365544,
"grad_norm": 1.8543760776519775,
"learning_rate": 9.945089014733005e-05,
"loss": 0.6776,
"step": 3160
},
{
"epoch": 0.42901610502097715,
"grad_norm": 4.1883063316345215,
"learning_rate": 9.943919184122043e-05,
"loss": 0.2573,
"step": 3170
},
{
"epoch": 0.43036946812829885,
"grad_norm": 0.24804548919200897,
"learning_rate": 9.942737093659335e-05,
"loss": 0.2905,
"step": 3180
},
{
"epoch": 0.4317228312356205,
"grad_norm": 0.36752820014953613,
"learning_rate": 9.941542746276207e-05,
"loss": 0.5516,
"step": 3190
},
{
"epoch": 0.4330761943429422,
"grad_norm": 0.18298649787902832,
"learning_rate": 9.940336144934383e-05,
"loss": 0.0876,
"step": 3200
},
{
"epoch": 0.4330761943429422,
"eval_accuracy": 0.9068493150684932,
"eval_loss": 0.3489275276660919,
"eval_runtime": 586.2418,
"eval_samples_per_second": 5.603,
"eval_steps_per_second": 5.603,
"step": 3200
},
{
"epoch": 0.4344295574502639,
"grad_norm": 6.957592972867133e-07,
"learning_rate": 9.939117292625972e-05,
"loss": 0.4436,
"step": 3210
},
{
"epoch": 0.4357829205575856,
"grad_norm": 26.52113914489746,
"learning_rate": 9.937886192373469e-05,
"loss": 0.6185,
"step": 3220
},
{
"epoch": 0.4371362836649073,
"grad_norm": 8.334674835205078,
"learning_rate": 9.936642847229734e-05,
"loss": 0.1342,
"step": 3230
},
{
"epoch": 0.438489646772229,
"grad_norm": 0.1016705185174942,
"learning_rate": 9.935387260277993e-05,
"loss": 0.6313,
"step": 3240
},
{
"epoch": 0.43984300987955066,
"grad_norm": 7.029394149780273,
"learning_rate": 9.934119434631832e-05,
"loss": 0.5876,
"step": 3250
},
{
"epoch": 0.44119637298687236,
"grad_norm": 0.10425078123807907,
"learning_rate": 9.932839373435185e-05,
"loss": 0.1288,
"step": 3260
},
{
"epoch": 0.44254973609419407,
"grad_norm": 10.366453170776367,
"learning_rate": 9.931547079862329e-05,
"loss": 0.4045,
"step": 3270
},
{
"epoch": 0.44390309920151577,
"grad_norm": 0.6578564643859863,
"learning_rate": 9.930242557117869e-05,
"loss": 0.5181,
"step": 3280
},
{
"epoch": 0.4452564623088375,
"grad_norm": 6.488555908203125,
"learning_rate": 9.928925808436743e-05,
"loss": 0.1018,
"step": 3290
},
{
"epoch": 0.4466098254161592,
"grad_norm": 1.798299789428711,
"learning_rate": 9.927596837084198e-05,
"loss": 0.1241,
"step": 3300
},
{
"epoch": 0.4466098254161592,
"eval_accuracy": 0.9156773211567732,
"eval_loss": 0.298028826713562,
"eval_runtime": 585.8439,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 3300
},
{
"epoch": 0.4479631885234808,
"grad_norm": 8.091084480285645,
"learning_rate": 9.926255646355804e-05,
"loss": 0.7205,
"step": 3310
},
{
"epoch": 0.44931655163080253,
"grad_norm": 1.7898259162902832,
"learning_rate": 9.924902239577418e-05,
"loss": 0.3658,
"step": 3320
},
{
"epoch": 0.45066991473812423,
"grad_norm": 7.857369422912598,
"learning_rate": 9.923536620105201e-05,
"loss": 0.1974,
"step": 3330
},
{
"epoch": 0.45202327784544594,
"grad_norm": 1.4473187923431396,
"learning_rate": 9.922158791325588e-05,
"loss": 0.5274,
"step": 3340
},
{
"epoch": 0.45337664095276764,
"grad_norm": 20.294761657714844,
"learning_rate": 9.920768756655304e-05,
"loss": 0.4609,
"step": 3350
},
{
"epoch": 0.45473000406008934,
"grad_norm": 0.015980269759893417,
"learning_rate": 9.919366519541332e-05,
"loss": 0.1363,
"step": 3360
},
{
"epoch": 0.456083367167411,
"grad_norm": 7.722431659698486,
"learning_rate": 9.917952083460916e-05,
"loss": 0.3419,
"step": 3370
},
{
"epoch": 0.4574367302747327,
"grad_norm": 0.9992192387580872,
"learning_rate": 9.91652545192155e-05,
"loss": 0.6367,
"step": 3380
},
{
"epoch": 0.4587900933820544,
"grad_norm": 30.987764358520508,
"learning_rate": 9.915086628460977e-05,
"loss": 0.813,
"step": 3390
},
{
"epoch": 0.4601434564893761,
"grad_norm": 16.1995906829834,
"learning_rate": 9.913635616647166e-05,
"loss": 0.6461,
"step": 3400
},
{
"epoch": 0.4601434564893761,
"eval_accuracy": 0.908675799086758,
"eval_loss": 0.25029391050338745,
"eval_runtime": 607.2706,
"eval_samples_per_second": 5.409,
"eval_steps_per_second": 5.409,
"step": 3400
},
{
"epoch": 0.4614968195966978,
"grad_norm": 3.102372407913208,
"learning_rate": 9.912172420078312e-05,
"loss": 0.2572,
"step": 3410
},
{
"epoch": 0.4628501827040195,
"grad_norm": 11.124046325683594,
"learning_rate": 9.910697042382829e-05,
"loss": 0.2574,
"step": 3420
},
{
"epoch": 0.46420354581134116,
"grad_norm": 8.01544189453125,
"learning_rate": 9.909209487219333e-05,
"loss": 0.5683,
"step": 3430
},
{
"epoch": 0.46555690891866286,
"grad_norm": 9.192936897277832,
"learning_rate": 9.90770975827664e-05,
"loss": 0.2297,
"step": 3440
},
{
"epoch": 0.46691027202598456,
"grad_norm": 7.2200117111206055,
"learning_rate": 9.906197859273753e-05,
"loss": 0.169,
"step": 3450
},
{
"epoch": 0.46826363513330627,
"grad_norm": 19.230791091918945,
"learning_rate": 9.904673793959857e-05,
"loss": 0.5687,
"step": 3460
},
{
"epoch": 0.46961699824062797,
"grad_norm": 5.5020012855529785,
"learning_rate": 9.903137566114304e-05,
"loss": 0.5738,
"step": 3470
},
{
"epoch": 0.4709703613479497,
"grad_norm": 8.84924030303955,
"learning_rate": 9.901589179546606e-05,
"loss": 0.2117,
"step": 3480
},
{
"epoch": 0.4723237244552714,
"grad_norm": 17.913070678710938,
"learning_rate": 9.900028638096428e-05,
"loss": 0.3447,
"step": 3490
},
{
"epoch": 0.473677087562593,
"grad_norm": 12.876395225524902,
"learning_rate": 9.898455945633576e-05,
"loss": 0.1405,
"step": 3500
},
{
"epoch": 0.473677087562593,
"eval_accuracy": 0.9077625570776255,
"eval_loss": 0.2597663700580597,
"eval_runtime": 586.0918,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 3500
},
{
"epoch": 0.47503045066991473,
"grad_norm": 13.94190502166748,
"learning_rate": 9.896871106057989e-05,
"loss": 0.1196,
"step": 3510
},
{
"epoch": 0.47638381377723643,
"grad_norm": 0.00028016060241498053,
"learning_rate": 9.895274123299723e-05,
"loss": 0.3356,
"step": 3520
},
{
"epoch": 0.47773717688455813,
"grad_norm": 21.0711669921875,
"learning_rate": 9.893665001318954e-05,
"loss": 0.3105,
"step": 3530
},
{
"epoch": 0.47909053999187984,
"grad_norm": 2.196962594985962,
"learning_rate": 9.892043744105957e-05,
"loss": 0.3484,
"step": 3540
},
{
"epoch": 0.48044390309920154,
"grad_norm": 9.535185813903809,
"learning_rate": 9.890410355681097e-05,
"loss": 0.5164,
"step": 3550
},
{
"epoch": 0.4817972662065232,
"grad_norm": 10.87157917022705,
"learning_rate": 9.888764840094825e-05,
"loss": 0.2638,
"step": 3560
},
{
"epoch": 0.4831506293138449,
"grad_norm": 3.433655023574829,
"learning_rate": 9.887107201427666e-05,
"loss": 0.0772,
"step": 3570
},
{
"epoch": 0.4845039924211666,
"grad_norm": 18.490392684936523,
"learning_rate": 9.885437443790204e-05,
"loss": 0.224,
"step": 3580
},
{
"epoch": 0.4858573555284883,
"grad_norm": 36.07349395751953,
"learning_rate": 9.88375557132308e-05,
"loss": 0.1838,
"step": 3590
},
{
"epoch": 0.48721071863581,
"grad_norm": 0.00971564557403326,
"learning_rate": 9.882061588196971e-05,
"loss": 0.0814,
"step": 3600
},
{
"epoch": 0.48721071863581,
"eval_accuracy": 0.9053272450532724,
"eval_loss": 0.39746227860450745,
"eval_runtime": 585.9258,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 3600
},
{
"epoch": 0.4885640817431317,
"grad_norm": 8.707074165344238,
"learning_rate": 9.880355498612593e-05,
"loss": 0.1846,
"step": 3610
},
{
"epoch": 0.48991744485045335,
"grad_norm": 0.2467634230852127,
"learning_rate": 9.878637306800676e-05,
"loss": 0.5841,
"step": 3620
},
{
"epoch": 0.49127080795777506,
"grad_norm": 11.655877113342285,
"learning_rate": 9.876907017021967e-05,
"loss": 0.6614,
"step": 3630
},
{
"epoch": 0.49262417106509676,
"grad_norm": 2.5910391807556152,
"learning_rate": 9.87516463356721e-05,
"loss": 0.3747,
"step": 3640
},
{
"epoch": 0.49397753417241846,
"grad_norm": 0.08143145591020584,
"learning_rate": 9.873410160757139e-05,
"loss": 0.2551,
"step": 3650
},
{
"epoch": 0.49533089727974017,
"grad_norm": 0.006202420219779015,
"learning_rate": 9.871643602942469e-05,
"loss": 0.1721,
"step": 3660
},
{
"epoch": 0.49668426038706187,
"grad_norm": 0.4420149028301239,
"learning_rate": 9.869864964503881e-05,
"loss": 0.2613,
"step": 3670
},
{
"epoch": 0.4980376234943835,
"grad_norm": 4.616814613342285,
"learning_rate": 9.868074249852016e-05,
"loss": 0.3163,
"step": 3680
},
{
"epoch": 0.4993909866017052,
"grad_norm": 1.4008029699325562,
"learning_rate": 9.866271463427457e-05,
"loss": 0.392,
"step": 3690
},
{
"epoch": 0.5007443497090269,
"grad_norm": 10.468450546264648,
"learning_rate": 9.864456609700726e-05,
"loss": 0.2675,
"step": 3700
},
{
"epoch": 0.5007443497090269,
"eval_accuracy": 0.9092846270928463,
"eval_loss": 0.31681403517723083,
"eval_runtime": 585.9869,
"eval_samples_per_second": 5.606,
"eval_steps_per_second": 5.606,
"step": 3700
},
{
"epoch": 0.5020977128163486,
"grad_norm": 3.0164525508880615,
"learning_rate": 9.862629693172267e-05,
"loss": 0.3723,
"step": 3710
},
{
"epoch": 0.5034510759236703,
"grad_norm": 0.2863682210445404,
"learning_rate": 9.860790718372441e-05,
"loss": 0.3251,
"step": 3720
},
{
"epoch": 0.504804439030992,
"grad_norm": 0.02388608641922474,
"learning_rate": 9.858939689861506e-05,
"loss": 0.2996,
"step": 3730
},
{
"epoch": 0.5061578021383137,
"grad_norm": 0.42931804060935974,
"learning_rate": 9.857076612229614e-05,
"loss": 0.4254,
"step": 3740
},
{
"epoch": 0.5075111652456354,
"grad_norm": 0.24269169569015503,
"learning_rate": 9.855201490096795e-05,
"loss": 0.2688,
"step": 3750
},
{
"epoch": 0.5088645283529571,
"grad_norm": 3.338435649871826,
"learning_rate": 9.853314328112947e-05,
"loss": 0.0494,
"step": 3760
},
{
"epoch": 0.5102178914602787,
"grad_norm": 0.04143134132027626,
"learning_rate": 9.851415130957824e-05,
"loss": 0.2798,
"step": 3770
},
{
"epoch": 0.5115712545676004,
"grad_norm": 0.18607792258262634,
"learning_rate": 9.849503903341024e-05,
"loss": 0.2892,
"step": 3780
},
{
"epoch": 0.5129246176749221,
"grad_norm": 11.773528099060059,
"learning_rate": 9.84758065000198e-05,
"loss": 0.4357,
"step": 3790
},
{
"epoch": 0.5142779807822438,
"grad_norm": 0.7222903370857239,
"learning_rate": 9.845645375709945e-05,
"loss": 0.3765,
"step": 3800
},
{
"epoch": 0.5142779807822438,
"eval_accuracy": 0.919634703196347,
"eval_loss": 0.2469821274280548,
"eval_runtime": 585.5969,
"eval_samples_per_second": 5.61,
"eval_steps_per_second": 5.61,
"step": 3800
},
{
"epoch": 0.5156313438895656,
"grad_norm": 0.587492823600769,
"learning_rate": 9.84369808526398e-05,
"loss": 0.2611,
"step": 3810
},
{
"epoch": 0.5169847069968873,
"grad_norm": 1.6213396787643433,
"learning_rate": 9.841738783492944e-05,
"loss": 0.4305,
"step": 3820
},
{
"epoch": 0.518338070104209,
"grad_norm": 25.090078353881836,
"learning_rate": 9.839767475255484e-05,
"loss": 0.3079,
"step": 3830
},
{
"epoch": 0.5196914332115307,
"grad_norm": 1.8611633777618408,
"learning_rate": 9.837784165440018e-05,
"loss": 0.312,
"step": 3840
},
{
"epoch": 0.5210447963188524,
"grad_norm": 16.361417770385742,
"learning_rate": 9.835788858964726e-05,
"loss": 0.1514,
"step": 3850
},
{
"epoch": 0.5223981594261741,
"grad_norm": 0.027297548949718475,
"learning_rate": 9.833781560777537e-05,
"loss": 0.8527,
"step": 3860
},
{
"epoch": 0.5237515225334958,
"grad_norm": 9.58652400970459,
"learning_rate": 9.831762275856118e-05,
"loss": 0.3958,
"step": 3870
},
{
"epoch": 0.5251048856408175,
"grad_norm": 0.11664781719446182,
"learning_rate": 9.829731009207859e-05,
"loss": 0.105,
"step": 3880
},
{
"epoch": 0.5264582487481392,
"grad_norm": 17.347599029541016,
"learning_rate": 9.82768776586986e-05,
"loss": 0.5082,
"step": 3890
},
{
"epoch": 0.5278116118554608,
"grad_norm": 10.610013961791992,
"learning_rate": 9.825632550908926e-05,
"loss": 0.3189,
"step": 3900
},
{
"epoch": 0.5278116118554608,
"eval_accuracy": 0.9147640791476408,
"eval_loss": 0.21581518650054932,
"eval_runtime": 585.6064,
"eval_samples_per_second": 5.61,
"eval_steps_per_second": 5.61,
"step": 3900
},
{
"epoch": 0.5291649749627825,
"grad_norm": 5.391573905944824,
"learning_rate": 9.823565369421545e-05,
"loss": 0.2908,
"step": 3910
},
{
"epoch": 0.5305183380701042,
"grad_norm": 18.12822723388672,
"learning_rate": 9.821486226533882e-05,
"loss": 0.3401,
"step": 3920
},
{
"epoch": 0.5318717011774259,
"grad_norm": 0.004009630065411329,
"learning_rate": 9.819395127401762e-05,
"loss": 0.1484,
"step": 3930
},
{
"epoch": 0.5332250642847476,
"grad_norm": 0.21845737099647522,
"learning_rate": 9.817292077210659e-05,
"loss": 0.2242,
"step": 3940
},
{
"epoch": 0.5345784273920693,
"grad_norm": 0.21392692625522614,
"learning_rate": 9.81517708117568e-05,
"loss": 0.3041,
"step": 3950
},
{
"epoch": 0.535931790499391,
"grad_norm": 0.029498528689146042,
"learning_rate": 9.813050144541562e-05,
"loss": 0.0472,
"step": 3960
},
{
"epoch": 0.5372851536067127,
"grad_norm": 0.0002741254575084895,
"learning_rate": 9.81091127258265e-05,
"loss": 0.0588,
"step": 3970
},
{
"epoch": 0.5386385167140344,
"grad_norm": 0.002227282151579857,
"learning_rate": 9.808760470602879e-05,
"loss": 0.183,
"step": 3980
},
{
"epoch": 0.5399918798213561,
"grad_norm": 11.413102149963379,
"learning_rate": 9.806597743935778e-05,
"loss": 0.0947,
"step": 3990
},
{
"epoch": 0.5413452429286778,
"grad_norm": 2.3081635447397275e-07,
"learning_rate": 9.804423097944439e-05,
"loss": 0.1807,
"step": 4000
},
{
"epoch": 0.5413452429286778,
"eval_accuracy": 0.9141552511415525,
"eval_loss": 0.40402328968048096,
"eval_runtime": 586.0899,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 4000
},
{
"epoch": 0.5426986060359995,
"grad_norm": 24.99509620666504,
"learning_rate": 9.802236538021518e-05,
"loss": 0.2574,
"step": 4010
},
{
"epoch": 0.5440519691433211,
"grad_norm": 0.0018494591349735856,
"learning_rate": 9.800038069589208e-05,
"loss": 0.2827,
"step": 4020
},
{
"epoch": 0.5454053322506428,
"grad_norm": 0.6751343011856079,
"learning_rate": 9.797827698099238e-05,
"loss": 0.2763,
"step": 4030
},
{
"epoch": 0.5467586953579645,
"grad_norm": 3.1831467151641846,
"learning_rate": 9.795605429032851e-05,
"loss": 0.1686,
"step": 4040
},
{
"epoch": 0.5481120584652862,
"grad_norm": 3.9633137930650264e-05,
"learning_rate": 9.793371267900793e-05,
"loss": 0.1481,
"step": 4050
},
{
"epoch": 0.5494654215726079,
"grad_norm": 0.24243293702602386,
"learning_rate": 9.791125220243303e-05,
"loss": 0.1069,
"step": 4060
},
{
"epoch": 0.5508187846799296,
"grad_norm": 28.994619369506836,
"learning_rate": 9.788867291630091e-05,
"loss": 0.7202,
"step": 4070
},
{
"epoch": 0.5521721477872513,
"grad_norm": 23.049583435058594,
"learning_rate": 9.786597487660337e-05,
"loss": 0.1929,
"step": 4080
},
{
"epoch": 0.553525510894573,
"grad_norm": 0.003061707131564617,
"learning_rate": 9.784315813962662e-05,
"loss": 0.0455,
"step": 4090
},
{
"epoch": 0.5548788740018947,
"grad_norm": 0.8015981912612915,
"learning_rate": 9.782022276195124e-05,
"loss": 0.1293,
"step": 4100
},
{
"epoch": 0.5548788740018947,
"eval_accuracy": 0.9080669710806697,
"eval_loss": 0.5554362535476685,
"eval_runtime": 585.9472,
"eval_samples_per_second": 5.606,
"eval_steps_per_second": 5.606,
"step": 4100
},
{
"epoch": 0.5562322371092164,
"grad_norm": 10.175394058227539,
"learning_rate": 9.7797168800452e-05,
"loss": 0.7186,
"step": 4110
},
{
"epoch": 0.5575856002165381,
"grad_norm": 2.411634341115132e-07,
"learning_rate": 9.777399631229777e-05,
"loss": 0.4884,
"step": 4120
},
{
"epoch": 0.5589389633238598,
"grad_norm": 0.3643926978111267,
"learning_rate": 9.775070535495132e-05,
"loss": 0.453,
"step": 4130
},
{
"epoch": 0.5602923264311814,
"grad_norm": 3.4061479254887672e-06,
"learning_rate": 9.772729598616916e-05,
"loss": 0.2994,
"step": 4140
},
{
"epoch": 0.5616456895385031,
"grad_norm": 0.7406808137893677,
"learning_rate": 9.77037682640015e-05,
"loss": 0.428,
"step": 4150
},
{
"epoch": 0.5629990526458248,
"grad_norm": 0.026358220726251602,
"learning_rate": 9.768012224679198e-05,
"loss": 0.1555,
"step": 4160
},
{
"epoch": 0.5643524157531465,
"grad_norm": 0.08676059544086456,
"learning_rate": 9.765635799317765e-05,
"loss": 0.2118,
"step": 4170
},
{
"epoch": 0.5657057788604682,
"grad_norm": 0.06001422554254532,
"learning_rate": 9.76324755620887e-05,
"loss": 0.1356,
"step": 4180
},
{
"epoch": 0.56705914196779,
"grad_norm": 4.249134063720703,
"learning_rate": 9.76084750127484e-05,
"loss": 0.3016,
"step": 4190
},
{
"epoch": 0.5684125050751117,
"grad_norm": 8.967272758483887,
"learning_rate": 9.758435640467295e-05,
"loss": 0.2838,
"step": 4200
},
{
"epoch": 0.5684125050751117,
"eval_accuracy": 0.9105022831050228,
"eval_loss": 0.32533395290374756,
"eval_runtime": 585.8922,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 4200
},
{
"epoch": 0.5697658681824334,
"grad_norm": 7.07581090927124,
"learning_rate": 9.756011979767128e-05,
"loss": 0.4945,
"step": 4210
},
{
"epoch": 0.5711192312897551,
"grad_norm": 2.3176674842834473,
"learning_rate": 9.753576525184492e-05,
"loss": 0.3317,
"step": 4220
},
{
"epoch": 0.5724725943970768,
"grad_norm": 0.14022812247276306,
"learning_rate": 9.751129282758791e-05,
"loss": 0.2113,
"step": 4230
},
{
"epoch": 0.5738259575043985,
"grad_norm": 5.834623516420834e-05,
"learning_rate": 9.748670258558656e-05,
"loss": 0.644,
"step": 4240
},
{
"epoch": 0.5751793206117202,
"grad_norm": 7.6325764656066895,
"learning_rate": 9.746199458681938e-05,
"loss": 0.0884,
"step": 4250
},
{
"epoch": 0.5765326837190419,
"grad_norm": 1.895483374595642,
"learning_rate": 9.743716889255684e-05,
"loss": 0.258,
"step": 4260
},
{
"epoch": 0.5778860468263635,
"grad_norm": 1.6728992462158203,
"learning_rate": 9.741222556436132e-05,
"loss": 0.0853,
"step": 4270
},
{
"epoch": 0.5792394099336852,
"grad_norm": 0.009905705228447914,
"learning_rate": 9.738716466408688e-05,
"loss": 0.4743,
"step": 4280
},
{
"epoch": 0.5805927730410069,
"grad_norm": 0.7430686354637146,
"learning_rate": 9.736198625387916e-05,
"loss": 0.7162,
"step": 4290
},
{
"epoch": 0.5819461361483286,
"grad_norm": 7.7744269371032715,
"learning_rate": 9.733669039617513e-05,
"loss": 0.2627,
"step": 4300
},
{
"epoch": 0.5819461361483286,
"eval_accuracy": 0.915068493150685,
"eval_loss": 0.23285283148288727,
"eval_runtime": 585.6291,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 4300
},
{
"epoch": 0.5832994992556503,
"grad_norm": 18.253538131713867,
"learning_rate": 9.731127715370311e-05,
"loss": 0.5606,
"step": 4310
},
{
"epoch": 0.584652862362972,
"grad_norm": 12.1471529006958,
"learning_rate": 9.728574658948242e-05,
"loss": 0.5073,
"step": 4320
},
{
"epoch": 0.5860062254702937,
"grad_norm": 13.163832664489746,
"learning_rate": 9.726009876682333e-05,
"loss": 0.2834,
"step": 4330
},
{
"epoch": 0.5873595885776154,
"grad_norm": 7.22966194152832,
"learning_rate": 9.723433374932696e-05,
"loss": 0.1887,
"step": 4340
},
{
"epoch": 0.5887129516849371,
"grad_norm": 3.5801568031311035,
"learning_rate": 9.720845160088492e-05,
"loss": 0.1997,
"step": 4350
},
{
"epoch": 0.5900663147922588,
"grad_norm": 14.825773239135742,
"learning_rate": 9.718245238567939e-05,
"loss": 0.3341,
"step": 4360
},
{
"epoch": 0.5914196778995805,
"grad_norm": 0.009115755558013916,
"learning_rate": 9.71563361681828e-05,
"loss": 0.0791,
"step": 4370
},
{
"epoch": 0.5927730410069022,
"grad_norm": 28.929086685180664,
"learning_rate": 9.713010301315772e-05,
"loss": 0.4627,
"step": 4380
},
{
"epoch": 0.5941264041142238,
"grad_norm": 0.04894016310572624,
"learning_rate": 9.710375298565672e-05,
"loss": 0.2501,
"step": 4390
},
{
"epoch": 0.5954797672215455,
"grad_norm": 0.1197153702378273,
"learning_rate": 9.707728615102217e-05,
"loss": 0.1651,
"step": 4400
},
{
"epoch": 0.5954797672215455,
"eval_accuracy": 0.9098934550989346,
"eval_loss": 0.3739698827266693,
"eval_runtime": 585.3122,
"eval_samples_per_second": 5.612,
"eval_steps_per_second": 5.612,
"step": 4400
},
{
"epoch": 0.5968331303288672,
"grad_norm": 18.560256958007812,
"learning_rate": 9.705070257488609e-05,
"loss": 0.2983,
"step": 4410
},
{
"epoch": 0.5981864934361889,
"grad_norm": 1.4426162242889404,
"learning_rate": 9.702400232317003e-05,
"loss": 0.3937,
"step": 4420
},
{
"epoch": 0.5995398565435106,
"grad_norm": 35.894256591796875,
"learning_rate": 9.699718546208484e-05,
"loss": 0.4369,
"step": 4430
},
{
"epoch": 0.6008932196508323,
"grad_norm": 24.106157302856445,
"learning_rate": 9.697025205813054e-05,
"loss": 0.3225,
"step": 4440
},
{
"epoch": 0.602246582758154,
"grad_norm": 20.590375900268555,
"learning_rate": 9.694320217809616e-05,
"loss": 0.342,
"step": 4450
},
{
"epoch": 0.6035999458654757,
"grad_norm": 27.700132369995117,
"learning_rate": 9.691603588905955e-05,
"loss": 0.2763,
"step": 4460
},
{
"epoch": 0.6049533089727974,
"grad_norm": 1.0037168264389038,
"learning_rate": 9.688875325838725e-05,
"loss": 0.1983,
"step": 4470
},
{
"epoch": 0.6063066720801191,
"grad_norm": 2.838923478520883e-07,
"learning_rate": 9.686135435373428e-05,
"loss": 0.3122,
"step": 4480
},
{
"epoch": 0.6076600351874408,
"grad_norm": 1.0174156427383423,
"learning_rate": 9.683383924304401e-05,
"loss": 0.0064,
"step": 4490
},
{
"epoch": 0.6090133982947625,
"grad_norm": 0.011479837819933891,
"learning_rate": 9.6806207994548e-05,
"loss": 0.4422,
"step": 4500
},
{
"epoch": 0.6090133982947625,
"eval_accuracy": 0.9126331811263318,
"eval_loss": 0.4408724009990692,
"eval_runtime": 585.495,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 4500
},
{
"epoch": 0.6103667614020842,
"grad_norm": 25.792768478393555,
"learning_rate": 9.677846067676572e-05,
"loss": 0.4849,
"step": 4510
},
{
"epoch": 0.6117201245094058,
"grad_norm": 0.0019514035666361451,
"learning_rate": 9.675059735850457e-05,
"loss": 0.4136,
"step": 4520
},
{
"epoch": 0.6130734876167275,
"grad_norm": 14.797130584716797,
"learning_rate": 9.672261810885955e-05,
"loss": 0.7369,
"step": 4530
},
{
"epoch": 0.6144268507240492,
"grad_norm": 9.552214622497559,
"learning_rate": 9.669452299721316e-05,
"loss": 0.2675,
"step": 4540
},
{
"epoch": 0.6157802138313709,
"grad_norm": 15.778959274291992,
"learning_rate": 9.66663120932352e-05,
"loss": 0.1837,
"step": 4550
},
{
"epoch": 0.6171335769386926,
"grad_norm": 0.35701024532318115,
"learning_rate": 9.663798546688262e-05,
"loss": 0.0761,
"step": 4560
},
{
"epoch": 0.6184869400460143,
"grad_norm": 31.343671798706055,
"learning_rate": 9.660954318839933e-05,
"loss": 0.2741,
"step": 4570
},
{
"epoch": 0.619840303153336,
"grad_norm": 4.513859676080756e-05,
"learning_rate": 9.658098532831605e-05,
"loss": 0.2051,
"step": 4580
},
{
"epoch": 0.6211936662606578,
"grad_norm": 0.13937516510486603,
"learning_rate": 9.65523119574501e-05,
"loss": 1.4443,
"step": 4590
},
{
"epoch": 0.6225470293679795,
"grad_norm": 0.54905104637146,
"learning_rate": 9.652352314690524e-05,
"loss": 0.125,
"step": 4600
},
{
"epoch": 0.6225470293679795,
"eval_accuracy": 0.9025875190258752,
"eval_loss": 0.39096349477767944,
"eval_runtime": 585.735,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 4600
},
{
"epoch": 0.6239003924753012,
"grad_norm": 0.001251073321327567,
"learning_rate": 9.649461896807153e-05,
"loss": 0.4539,
"step": 4610
},
{
"epoch": 0.6252537555826229,
"grad_norm": 0.060062967240810394,
"learning_rate": 9.646559949262504e-05,
"loss": 0.1364,
"step": 4620
},
{
"epoch": 0.6266071186899446,
"grad_norm": 20.359573364257812,
"learning_rate": 9.643646479252784e-05,
"loss": 0.4804,
"step": 4630
},
{
"epoch": 0.6279604817972662,
"grad_norm": 22.817575454711914,
"learning_rate": 9.640721494002769e-05,
"loss": 0.4074,
"step": 4640
},
{
"epoch": 0.6293138449045879,
"grad_norm": 1.5756419897079468,
"learning_rate": 9.637785000765789e-05,
"loss": 0.1832,
"step": 4650
},
{
"epoch": 0.6306672080119096,
"grad_norm": 0.32080402970314026,
"learning_rate": 9.634837006823714e-05,
"loss": 0.266,
"step": 4660
},
{
"epoch": 0.6320205711192313,
"grad_norm": 0.09690892696380615,
"learning_rate": 9.631877519486934e-05,
"loss": 0.0025,
"step": 4670
},
{
"epoch": 0.633373934226553,
"grad_norm": 26.12261199951172,
"learning_rate": 9.628906546094333e-05,
"loss": 0.1316,
"step": 4680
},
{
"epoch": 0.6347272973338747,
"grad_norm": 1.7195595502853394,
"learning_rate": 9.62592409401329e-05,
"loss": 0.531,
"step": 4690
},
{
"epoch": 0.6360806604411964,
"grad_norm": 20.064390182495117,
"learning_rate": 9.622930170639637e-05,
"loss": 0.4875,
"step": 4700
},
{
"epoch": 0.6360806604411964,
"eval_accuracy": 0.9141552511415525,
"eval_loss": 0.31653645634651184,
"eval_runtime": 585.8969,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 4700
},
{
"epoch": 0.6374340235485181,
"grad_norm": 18.887651443481445,
"learning_rate": 9.619924783397661e-05,
"loss": 0.1611,
"step": 4710
},
{
"epoch": 0.6387873866558398,
"grad_norm": 0.010769748128950596,
"learning_rate": 9.61690793974007e-05,
"loss": 0.3398,
"step": 4720
},
{
"epoch": 0.6401407497631615,
"grad_norm": 7.151393413543701,
"learning_rate": 9.613879647147988e-05,
"loss": 0.0745,
"step": 4730
},
{
"epoch": 0.6414941128704832,
"grad_norm": 0.03005339950323105,
"learning_rate": 9.610839913130923e-05,
"loss": 0.2734,
"step": 4740
},
{
"epoch": 0.6428474759778049,
"grad_norm": 3.4520528316497803,
"learning_rate": 9.60778874522676e-05,
"loss": 0.7906,
"step": 4750
},
{
"epoch": 0.6442008390851265,
"grad_norm": 7.305643558502197,
"learning_rate": 9.604726151001737e-05,
"loss": 0.4846,
"step": 4760
},
{
"epoch": 0.6455542021924482,
"grad_norm": 19.83353042602539,
"learning_rate": 9.601652138050428e-05,
"loss": 0.1922,
"step": 4770
},
{
"epoch": 0.6469075652997699,
"grad_norm": 17.064523696899414,
"learning_rate": 9.598566713995718e-05,
"loss": 0.4546,
"step": 4780
},
{
"epoch": 0.6482609284070916,
"grad_norm": 14.504972457885742,
"learning_rate": 9.595469886488793e-05,
"loss": 0.3455,
"step": 4790
},
{
"epoch": 0.6496142915144133,
"grad_norm": 5.4947285652160645,
"learning_rate": 9.592361663209117e-05,
"loss": 0.1002,
"step": 4800
},
{
"epoch": 0.6496142915144133,
"eval_accuracy": 0.9178082191780822,
"eval_loss": 0.29287853837013245,
"eval_runtime": 585.872,
"eval_samples_per_second": 5.607,
"eval_steps_per_second": 5.607,
"step": 4800
},
{
"epoch": 0.650967654621735,
"grad_norm": 0.10837817192077637,
"learning_rate": 9.589242051864413e-05,
"loss": 0.12,
"step": 4810
},
{
"epoch": 0.6523210177290567,
"grad_norm": 16.458295822143555,
"learning_rate": 9.586111060190641e-05,
"loss": 0.115,
"step": 4820
},
{
"epoch": 0.6536743808363784,
"grad_norm": 3.44444160873536e-05,
"learning_rate": 9.582968695951986e-05,
"loss": 0.4776,
"step": 4830
},
{
"epoch": 0.6550277439437001,
"grad_norm": 27.302371978759766,
"learning_rate": 9.579814966940833e-05,
"loss": 0.9683,
"step": 4840
},
{
"epoch": 0.6563811070510218,
"grad_norm": 0.0014636669075116515,
"learning_rate": 9.576649880977748e-05,
"loss": 0.7764,
"step": 4850
},
{
"epoch": 0.6577344701583435,
"grad_norm": 2.055596113204956,
"learning_rate": 9.573473445911461e-05,
"loss": 0.177,
"step": 4860
},
{
"epoch": 0.6590878332656652,
"grad_norm": 17.790176391601562,
"learning_rate": 9.570285669618842e-05,
"loss": 0.2155,
"step": 4870
},
{
"epoch": 0.6604411963729869,
"grad_norm": 1.0239602327346802,
"learning_rate": 9.567086560004892e-05,
"loss": 0.1939,
"step": 4880
},
{
"epoch": 0.6617945594803085,
"grad_norm": 18.533052444458008,
"learning_rate": 9.563876125002711e-05,
"loss": 0.6398,
"step": 4890
},
{
"epoch": 0.6631479225876302,
"grad_norm": 0.025914179161190987,
"learning_rate": 9.560654372573481e-05,
"loss": 0.1241,
"step": 4900
},
{
"epoch": 0.6631479225876302,
"eval_accuracy": 0.9101978691019786,
"eval_loss": 0.2989567220211029,
"eval_runtime": 585.4692,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 4900
},
{
"epoch": 0.6645012856949519,
"grad_norm": 25.82878875732422,
"learning_rate": 9.557421310706456e-05,
"loss": 0.4629,
"step": 4910
},
{
"epoch": 0.6658546488022736,
"grad_norm": 2.8251757621765137,
"learning_rate": 9.554176947418931e-05,
"loss": 0.5236,
"step": 4920
},
{
"epoch": 0.6672080119095953,
"grad_norm": 1.0843572616577148,
"learning_rate": 9.550921290756222e-05,
"loss": 0.0906,
"step": 4930
},
{
"epoch": 0.668561375016917,
"grad_norm": 2.299241542816162,
"learning_rate": 9.54765434879166e-05,
"loss": 0.1686,
"step": 4940
},
{
"epoch": 0.6699147381242387,
"grad_norm": 0.24002690613269806,
"learning_rate": 9.54437612962655e-05,
"loss": 0.3661,
"step": 4950
},
{
"epoch": 0.6712681012315604,
"grad_norm": 13.394685745239258,
"learning_rate": 9.54108664139017e-05,
"loss": 0.2536,
"step": 4960
},
{
"epoch": 0.6726214643388821,
"grad_norm": 20.657196044921875,
"learning_rate": 9.537785892239743e-05,
"loss": 0.3507,
"step": 4970
},
{
"epoch": 0.6739748274462039,
"grad_norm": 4.67914342880249,
"learning_rate": 9.534473890360412e-05,
"loss": 0.5186,
"step": 4980
},
{
"epoch": 0.6753281905535256,
"grad_norm": 7.856649398803711,
"learning_rate": 9.531150643965223e-05,
"loss": 0.7312,
"step": 4990
},
{
"epoch": 0.6766815536608473,
"grad_norm": 6.752259731292725,
"learning_rate": 9.527816161295114e-05,
"loss": 0.1414,
"step": 5000
},
{
"epoch": 0.6766815536608473,
"eval_accuracy": 0.9117199391171994,
"eval_loss": 0.2604728937149048,
"eval_runtime": 585.4721,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 5000
},
{
"epoch": 0.6780349167681688,
"grad_norm": 1.1008288860321045,
"learning_rate": 9.524470450618879e-05,
"loss": 0.2198,
"step": 5010
},
{
"epoch": 0.6793882798754906,
"grad_norm": 7.540661811828613,
"learning_rate": 9.521113520233158e-05,
"loss": 0.6508,
"step": 5020
},
{
"epoch": 0.6807416429828123,
"grad_norm": 0.06511321663856506,
"learning_rate": 9.517745378462416e-05,
"loss": 0.3953,
"step": 5030
},
{
"epoch": 0.682095006090134,
"grad_norm": 0.5184662342071533,
"learning_rate": 9.514366033658914e-05,
"loss": 0.3206,
"step": 5040
},
{
"epoch": 0.6834483691974557,
"grad_norm": 4.385899543762207,
"learning_rate": 9.510975494202698e-05,
"loss": 0.3306,
"step": 5050
},
{
"epoch": 0.6848017323047774,
"grad_norm": 0.368313729763031,
"learning_rate": 9.507573768501574e-05,
"loss": 0.0516,
"step": 5060
},
{
"epoch": 0.6861550954120991,
"grad_norm": 27.66187286376953,
"learning_rate": 9.504160864991087e-05,
"loss": 0.4828,
"step": 5070
},
{
"epoch": 0.6875084585194208,
"grad_norm": 20.792448043823242,
"learning_rate": 9.500736792134501e-05,
"loss": 0.751,
"step": 5080
},
{
"epoch": 0.6888618216267425,
"grad_norm": 6.490540504455566,
"learning_rate": 9.497301558422776e-05,
"loss": 0.3981,
"step": 5090
},
{
"epoch": 0.6902151847340642,
"grad_norm": 6.3285136222839355,
"learning_rate": 9.493855172374551e-05,
"loss": 0.2298,
"step": 5100
},
{
"epoch": 0.6902151847340642,
"eval_accuracy": 0.9159817351598174,
"eval_loss": 0.22350817918777466,
"eval_runtime": 585.6563,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 5100
},
{
"epoch": 0.6915685478413859,
"grad_norm": 0.9619494676589966,
"learning_rate": 9.490397642536117e-05,
"loss": 0.1571,
"step": 5110
},
{
"epoch": 0.6929219109487076,
"grad_norm": 0.28258052468299866,
"learning_rate": 9.486928977481402e-05,
"loss": 0.1245,
"step": 5120
},
{
"epoch": 0.6942752740560292,
"grad_norm": 0.1793600469827652,
"learning_rate": 9.483449185811948e-05,
"loss": 0.5415,
"step": 5130
},
{
"epoch": 0.6956286371633509,
"grad_norm": 23.72464942932129,
"learning_rate": 9.479958276156884e-05,
"loss": 0.7074,
"step": 5140
},
{
"epoch": 0.6969820002706726,
"grad_norm": 3.1746864318847656,
"learning_rate": 9.476456257172915e-05,
"loss": 0.5878,
"step": 5150
},
{
"epoch": 0.6983353633779943,
"grad_norm": 0.05973691865801811,
"learning_rate": 9.47294313754429e-05,
"loss": 0.0698,
"step": 5160
},
{
"epoch": 0.699688726485316,
"grad_norm": 17.902040481567383,
"learning_rate": 9.469418925982785e-05,
"loss": 0.2651,
"step": 5170
},
{
"epoch": 0.7010420895926377,
"grad_norm": 0.9632867574691772,
"learning_rate": 9.465883631227686e-05,
"loss": 0.3769,
"step": 5180
},
{
"epoch": 0.7023954526999594,
"grad_norm": 1.0752183198928833,
"learning_rate": 9.46233726204576e-05,
"loss": 0.3865,
"step": 5190
},
{
"epoch": 0.7037488158072811,
"grad_norm": 0.0032021338120102882,
"learning_rate": 9.458779827231237e-05,
"loss": 0.0677,
"step": 5200
},
{
"epoch": 0.7037488158072811,
"eval_accuracy": 0.9147640791476408,
"eval_loss": 0.33627045154571533,
"eval_runtime": 585.7425,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 5200
},
{
"epoch": 0.7051021789146028,
"grad_norm": 0.574644148349762,
"learning_rate": 9.455211335605784e-05,
"loss": 0.0694,
"step": 5210
},
{
"epoch": 0.7064555420219245,
"grad_norm": 13.346553802490234,
"learning_rate": 9.451631796018494e-05,
"loss": 0.1663,
"step": 5220
},
{
"epoch": 0.7078089051292462,
"grad_norm": 22.342451095581055,
"learning_rate": 9.448041217345851e-05,
"loss": 0.8053,
"step": 5230
},
{
"epoch": 0.7091622682365679,
"grad_norm": 13.331931114196777,
"learning_rate": 9.444439608491711e-05,
"loss": 0.5684,
"step": 5240
},
{
"epoch": 0.7105156313438896,
"grad_norm": 2.19920015335083,
"learning_rate": 9.440826978387289e-05,
"loss": 0.4047,
"step": 5250
},
{
"epoch": 0.7118689944512112,
"grad_norm": 0.0734083279967308,
"learning_rate": 9.437203335991127e-05,
"loss": 0.4998,
"step": 5260
},
{
"epoch": 0.7132223575585329,
"grad_norm": 20.231857299804688,
"learning_rate": 9.433568690289075e-05,
"loss": 0.1117,
"step": 5270
},
{
"epoch": 0.7145757206658546,
"grad_norm": 1.9291208445793018e-05,
"learning_rate": 9.42992305029427e-05,
"loss": 0.2927,
"step": 5280
},
{
"epoch": 0.7159290837731763,
"grad_norm": 0.6859157681465149,
"learning_rate": 9.426266425047113e-05,
"loss": 0.1162,
"step": 5290
},
{
"epoch": 0.717282446880498,
"grad_norm": 2.1293721199035645,
"learning_rate": 9.42259882361524e-05,
"loss": 0.2384,
"step": 5300
},
{
"epoch": 0.717282446880498,
"eval_accuracy": 0.9205479452054794,
"eval_loss": 0.3333543539047241,
"eval_runtime": 585.504,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 5300
},
{
"epoch": 0.7186358099878197,
"grad_norm": 9.881598472595215,
"learning_rate": 9.418920255093515e-05,
"loss": 0.1674,
"step": 5310
},
{
"epoch": 0.7199891730951414,
"grad_norm": 4.3182172775268555,
"learning_rate": 9.415230728603992e-05,
"loss": 0.477,
"step": 5320
},
{
"epoch": 0.7213425362024631,
"grad_norm": 19.00124168395996,
"learning_rate": 9.4115302532959e-05,
"loss": 0.317,
"step": 5330
},
{
"epoch": 0.7226958993097848,
"grad_norm": 4.142504621995613e-05,
"learning_rate": 9.407818838345619e-05,
"loss": 0.5577,
"step": 5340
},
{
"epoch": 0.7240492624171065,
"grad_norm": 0.0461486354470253,
"learning_rate": 9.404096492956656e-05,
"loss": 0.3604,
"step": 5350
},
{
"epoch": 0.7254026255244282,
"grad_norm": 0.5307131409645081,
"learning_rate": 9.40036322635962e-05,
"loss": 0.1165,
"step": 5360
},
{
"epoch": 0.72675598863175,
"grad_norm": 0.0007177101797424257,
"learning_rate": 9.396619047812213e-05,
"loss": 0.7133,
"step": 5370
},
{
"epoch": 0.7281093517390715,
"grad_norm": 0.09178412705659866,
"learning_rate": 9.392863966599183e-05,
"loss": 0.1665,
"step": 5380
},
{
"epoch": 0.7294627148463932,
"grad_norm": 5.707544551114552e-05,
"learning_rate": 9.389097992032325e-05,
"loss": 0.3494,
"step": 5390
},
{
"epoch": 0.730816077953715,
"grad_norm": 31.593660354614258,
"learning_rate": 9.385321133450438e-05,
"loss": 0.6661,
"step": 5400
},
{
"epoch": 0.730816077953715,
"eval_accuracy": 0.9156773211567732,
"eval_loss": 0.2831815481185913,
"eval_runtime": 585.0863,
"eval_samples_per_second": 5.615,
"eval_steps_per_second": 5.615,
"step": 5400
},
{
"epoch": 0.7321694410610367,
"grad_norm": 5.065242767333984,
"learning_rate": 9.381533400219318e-05,
"loss": 0.1203,
"step": 5410
},
{
"epoch": 0.7335228041683584,
"grad_norm": 23.13774299621582,
"learning_rate": 9.377734801731726e-05,
"loss": 0.9495,
"step": 5420
},
{
"epoch": 0.7348761672756801,
"grad_norm": 1.0966769456863403,
"learning_rate": 9.373925347407364e-05,
"loss": 0.2632,
"step": 5430
},
{
"epoch": 0.7362295303830018,
"grad_norm": 0.010265067219734192,
"learning_rate": 9.370105046692856e-05,
"loss": 0.4109,
"step": 5440
},
{
"epoch": 0.7375828934903235,
"grad_norm": 14.803862571716309,
"learning_rate": 9.366273909061725e-05,
"loss": 0.1273,
"step": 5450
},
{
"epoch": 0.7389362565976452,
"grad_norm": 21.68570899963379,
"learning_rate": 9.362431944014363e-05,
"loss": 0.1676,
"step": 5460
},
{
"epoch": 0.7402896197049669,
"grad_norm": 0.006488638464361429,
"learning_rate": 9.358579161078013e-05,
"loss": 0.512,
"step": 5470
},
{
"epoch": 0.7416429828122886,
"grad_norm": 0.21234115958213806,
"learning_rate": 9.354715569806744e-05,
"loss": 0.5069,
"step": 5480
},
{
"epoch": 0.7429963459196103,
"grad_norm": 13.795744895935059,
"learning_rate": 9.350841179781431e-05,
"loss": 0.48,
"step": 5490
},
{
"epoch": 0.7443497090269319,
"grad_norm": 13.517033576965332,
"learning_rate": 9.346956000609721e-05,
"loss": 0.3189,
"step": 5500
},
{
"epoch": 0.7443497090269319,
"eval_accuracy": 0.9181126331811263,
"eval_loss": 0.21662920713424683,
"eval_runtime": 585.767,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 5500
},
{
"epoch": 0.7457030721342536,
"grad_norm": 0.05359930172562599,
"learning_rate": 9.343060041926021e-05,
"loss": 0.3663,
"step": 5510
},
{
"epoch": 0.7470564352415753,
"grad_norm": 9.406262397766113,
"learning_rate": 9.339153313391469e-05,
"loss": 0.3323,
"step": 5520
},
{
"epoch": 0.748409798348897,
"grad_norm": 1.5440926551818848,
"learning_rate": 9.335235824693904e-05,
"loss": 0.223,
"step": 5530
},
{
"epoch": 0.7497631614562187,
"grad_norm": 2.935401439666748,
"learning_rate": 9.331307585547853e-05,
"loss": 0.2858,
"step": 5540
},
{
"epoch": 0.7511165245635404,
"grad_norm": 0.1924748718738556,
"learning_rate": 9.327368605694502e-05,
"loss": 0.5013,
"step": 5550
},
{
"epoch": 0.7524698876708621,
"grad_norm": 16.0751895904541,
"learning_rate": 9.323418894901669e-05,
"loss": 0.2946,
"step": 5560
},
{
"epoch": 0.7538232507781838,
"grad_norm": 5.284033298492432,
"learning_rate": 9.319458462963785e-05,
"loss": 0.4561,
"step": 5570
},
{
"epoch": 0.7551766138855055,
"grad_norm": 0.00020838312047999352,
"learning_rate": 9.315487319701865e-05,
"loss": 0.1584,
"step": 5580
},
{
"epoch": 0.7565299769928272,
"grad_norm": 6.6507532210380305e-06,
"learning_rate": 9.311505474963484e-05,
"loss": 0.1945,
"step": 5590
},
{
"epoch": 0.7578833401001489,
"grad_norm": 0.00047044234815984964,
"learning_rate": 9.307512938622762e-05,
"loss": 0.5115,
"step": 5600
},
{
"epoch": 0.7578833401001489,
"eval_accuracy": 0.9193302891933028,
"eval_loss": 0.2699982523918152,
"eval_runtime": 585.7552,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 5600
},
{
"epoch": 0.7592367032074706,
"grad_norm": 0.015827298164367676,
"learning_rate": 9.303509720580324e-05,
"loss": 0.131,
"step": 5610
},
{
"epoch": 0.7605900663147923,
"grad_norm": 16.13619613647461,
"learning_rate": 9.299495830763286e-05,
"loss": 0.3044,
"step": 5620
},
{
"epoch": 0.7619434294221139,
"grad_norm": 0.672073245048523,
"learning_rate": 9.295471279125227e-05,
"loss": 0.1588,
"step": 5630
},
{
"epoch": 0.7632967925294356,
"grad_norm": 0.0014772227732464671,
"learning_rate": 9.291436075646168e-05,
"loss": 0.1987,
"step": 5640
},
{
"epoch": 0.7646501556367573,
"grad_norm": 5.766489508118866e-08,
"learning_rate": 9.28739023033254e-05,
"loss": 0.2238,
"step": 5650
},
{
"epoch": 0.766003518744079,
"grad_norm": 0.010482143610715866,
"learning_rate": 9.283333753217167e-05,
"loss": 0.3199,
"step": 5660
},
{
"epoch": 0.7673568818514007,
"grad_norm": 1.591705322265625,
"learning_rate": 9.279266654359235e-05,
"loss": 0.2747,
"step": 5670
},
{
"epoch": 0.7687102449587224,
"grad_norm": 0.0668347105383873,
"learning_rate": 9.275188943844273e-05,
"loss": 0.5941,
"step": 5680
},
{
"epoch": 0.7700636080660441,
"grad_norm": 9.043622016906738,
"learning_rate": 9.27110063178412e-05,
"loss": 0.5263,
"step": 5690
},
{
"epoch": 0.7714169711733658,
"grad_norm": 6.835690498352051,
"learning_rate": 9.267001728316907e-05,
"loss": 0.1861,
"step": 5700
},
{
"epoch": 0.7714169711733658,
"eval_accuracy": 0.9108066971080669,
"eval_loss": 0.26752638816833496,
"eval_runtime": 585.647,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 5700
},
{
"epoch": 0.7727703342806875,
"grad_norm": 20.598922729492188,
"learning_rate": 9.262892243607031e-05,
"loss": 0.5422,
"step": 5710
},
{
"epoch": 0.7741236973880092,
"grad_norm": 0.8921458721160889,
"learning_rate": 9.258772187845123e-05,
"loss": 0.103,
"step": 5720
},
{
"epoch": 0.7754770604953309,
"grad_norm": 0.09386087208986282,
"learning_rate": 9.254641571248035e-05,
"loss": 0.3001,
"step": 5730
},
{
"epoch": 0.7768304236026526,
"grad_norm": 0.05627532675862312,
"learning_rate": 9.250500404058804e-05,
"loss": 0.2537,
"step": 5740
},
{
"epoch": 0.7781837867099742,
"grad_norm": 23.66926383972168,
"learning_rate": 9.246348696546627e-05,
"loss": 0.5345,
"step": 5750
},
{
"epoch": 0.7795371498172959,
"grad_norm": 0.019794423133134842,
"learning_rate": 9.242186459006845e-05,
"loss": 0.227,
"step": 5760
},
{
"epoch": 0.7808905129246176,
"grad_norm": 19.66176986694336,
"learning_rate": 9.238013701760906e-05,
"loss": 0.3494,
"step": 5770
},
{
"epoch": 0.7822438760319393,
"grad_norm": 2.0934901237487793,
"learning_rate": 9.233830435156348e-05,
"loss": 0.0302,
"step": 5780
},
{
"epoch": 0.783597239139261,
"grad_norm": 19.443466186523438,
"learning_rate": 9.229636669566769e-05,
"loss": 0.7859,
"step": 5790
},
{
"epoch": 0.7849506022465828,
"grad_norm": 6.290703374878248e-11,
"learning_rate": 9.2254324153918e-05,
"loss": 0.3958,
"step": 5800
},
{
"epoch": 0.7849506022465828,
"eval_accuracy": 0.9120243531202435,
"eval_loss": 0.33059442043304443,
"eval_runtime": 585.9682,
"eval_samples_per_second": 5.606,
"eval_steps_per_second": 5.606,
"step": 5800
},
{
"epoch": 0.7863039653539045,
"grad_norm": 2.0353949069976807,
"learning_rate": 9.221217683057083e-05,
"loss": 0.2058,
"step": 5810
},
{
"epoch": 0.7876573284612262,
"grad_norm": 30.70398712158203,
"learning_rate": 9.216992483014247e-05,
"loss": 0.4889,
"step": 5820
},
{
"epoch": 0.7890106915685479,
"grad_norm": 5.967011451721191,
"learning_rate": 9.212756825740873e-05,
"loss": 0.1419,
"step": 5830
},
{
"epoch": 0.7903640546758696,
"grad_norm": 0.024264423176646233,
"learning_rate": 9.208510721740479e-05,
"loss": 0.0395,
"step": 5840
},
{
"epoch": 0.7917174177831913,
"grad_norm": 0.05600500851869583,
"learning_rate": 9.204254181542483e-05,
"loss": 0.482,
"step": 5850
},
{
"epoch": 0.793070780890513,
"grad_norm": 2.5246126651763916,
"learning_rate": 9.199987215702184e-05,
"loss": 0.2441,
"step": 5860
},
{
"epoch": 0.7944241439978346,
"grad_norm": 29.7937068939209,
"learning_rate": 9.195709834800739e-05,
"loss": 0.2949,
"step": 5870
},
{
"epoch": 0.7957775071051563,
"grad_norm": 0.23492039740085602,
"learning_rate": 9.191422049445128e-05,
"loss": 0.2055,
"step": 5880
},
{
"epoch": 0.797130870212478,
"grad_norm": 0.0021444044541567564,
"learning_rate": 9.187123870268133e-05,
"loss": 0.1847,
"step": 5890
},
{
"epoch": 0.7984842333197997,
"grad_norm": 1.0205862963630352e-06,
"learning_rate": 9.182815307928307e-05,
"loss": 0.3423,
"step": 5900
},
{
"epoch": 0.7984842333197997,
"eval_accuracy": 0.9126331811263318,
"eval_loss": 0.34658023715019226,
"eval_runtime": 585.7249,
"eval_samples_per_second": 5.608,
"eval_steps_per_second": 5.608,
"step": 5900
},
{
"epoch": 0.7998375964271214,
"grad_norm": 0.04354476556181908,
"learning_rate": 9.178496373109958e-05,
"loss": 0.4167,
"step": 5910
},
{
"epoch": 0.8011909595344431,
"grad_norm": 14.758959770202637,
"learning_rate": 9.174167076523108e-05,
"loss": 0.1091,
"step": 5920
},
{
"epoch": 0.8025443226417648,
"grad_norm": 2.380498170852661,
"learning_rate": 9.169827428903479e-05,
"loss": 0.8495,
"step": 5930
},
{
"epoch": 0.8038976857490865,
"grad_norm": 10.680726051330566,
"learning_rate": 9.16547744101246e-05,
"loss": 0.1548,
"step": 5940
},
{
"epoch": 0.8052510488564082,
"grad_norm": 2.218698263168335,
"learning_rate": 9.161117123637081e-05,
"loss": 0.4776,
"step": 5950
},
{
"epoch": 0.8066044119637299,
"grad_norm": 1.6176288681890583e-06,
"learning_rate": 9.15674648758999e-05,
"loss": 0.2087,
"step": 5960
},
{
"epoch": 0.8079577750710516,
"grad_norm": 0.03002401813864708,
"learning_rate": 9.152365543709416e-05,
"loss": 0.4413,
"step": 5970
},
{
"epoch": 0.8093111381783733,
"grad_norm": 29.92314910888672,
"learning_rate": 9.147974302859157e-05,
"loss": 0.58,
"step": 5980
},
{
"epoch": 0.810664501285695,
"grad_norm": 0.7911025881767273,
"learning_rate": 9.143572775928539e-05,
"loss": 0.4153,
"step": 5990
},
{
"epoch": 0.8120178643930166,
"grad_norm": 0.0276151355355978,
"learning_rate": 9.139160973832399e-05,
"loss": 0.2627,
"step": 6000
},
{
"epoch": 0.8120178643930166,
"eval_accuracy": 0.9199391171993911,
"eval_loss": 0.2898274064064026,
"eval_runtime": 585.4903,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 6000
},
{
"epoch": 0.8133712275003383,
"grad_norm": 1.8124691247940063,
"learning_rate": 9.134738907511055e-05,
"loss": 0.1485,
"step": 6010
},
{
"epoch": 0.81472459060766,
"grad_norm": 24.3751220703125,
"learning_rate": 9.130306587930275e-05,
"loss": 0.1473,
"step": 6020
},
{
"epoch": 0.8160779537149817,
"grad_norm": 26.1238956451416,
"learning_rate": 9.125864026081255e-05,
"loss": 0.279,
"step": 6030
},
{
"epoch": 0.8174313168223034,
"grad_norm": 0.0028976276516914368,
"learning_rate": 9.121411232980588e-05,
"loss": 0.1216,
"step": 6040
},
{
"epoch": 0.8187846799296251,
"grad_norm": 0.4264237582683563,
"learning_rate": 9.116948219670239e-05,
"loss": 0.2344,
"step": 6050
},
{
"epoch": 0.8201380430369468,
"grad_norm": 0.1696048080921173,
"learning_rate": 9.112474997217517e-05,
"loss": 1.0503,
"step": 6060
},
{
"epoch": 0.8214914061442685,
"grad_norm": 0.2018238753080368,
"learning_rate": 9.107991576715049e-05,
"loss": 0.4023,
"step": 6070
},
{
"epoch": 0.8228447692515902,
"grad_norm": 4.706511974334717,
"learning_rate": 9.103497969280748e-05,
"loss": 0.4097,
"step": 6080
},
{
"epoch": 0.8241981323589119,
"grad_norm": 5.938671588897705,
"learning_rate": 9.09899418605779e-05,
"loss": 0.1898,
"step": 6090
},
{
"epoch": 0.8255514954662336,
"grad_norm": 17.77438735961914,
"learning_rate": 9.094480238214586e-05,
"loss": 0.2257,
"step": 6100
},
{
"epoch": 0.8255514954662336,
"eval_accuracy": 0.9187214611872146,
"eval_loss": 0.3077405095100403,
"eval_runtime": 586.1068,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 5.605,
"step": 6100
},
{
"epoch": 0.8269048585735553,
"grad_norm": 1.0022088289260864,
"learning_rate": 9.089956136944751e-05,
"loss": 0.0093,
"step": 6110
},
{
"epoch": 0.8282582216808769,
"grad_norm": 0.00016064182273112237,
"learning_rate": 9.085421893467076e-05,
"loss": 0.0219,
"step": 6120
},
{
"epoch": 0.8296115847881986,
"grad_norm": 0.17109623551368713,
"learning_rate": 9.08087751902551e-05,
"loss": 0.3479,
"step": 6130
},
{
"epoch": 0.8309649478955203,
"grad_norm": 20.761587142944336,
"learning_rate": 9.076323024889119e-05,
"loss": 0.6529,
"step": 6140
},
{
"epoch": 0.832318311002842,
"grad_norm": 0.00015795385115779936,
"learning_rate": 9.071758422352063e-05,
"loss": 0.0424,
"step": 6150
},
{
"epoch": 0.8336716741101637,
"grad_norm": 14.546133041381836,
"learning_rate": 9.06718372273357e-05,
"loss": 0.298,
"step": 6160
},
{
"epoch": 0.8350250372174854,
"grad_norm": 0.030780814588069916,
"learning_rate": 9.06259893737791e-05,
"loss": 0.1547,
"step": 6170
},
{
"epoch": 0.8363784003248071,
"grad_norm": 10.233747482299805,
"learning_rate": 9.058004077654359e-05,
"loss": 0.5607,
"step": 6180
},
{
"epoch": 0.8377317634321289,
"grad_norm": 0.13436900079250336,
"learning_rate": 9.053399154957176e-05,
"loss": 0.5464,
"step": 6190
},
{
"epoch": 0.8390851265394506,
"grad_norm": 0.34480223059654236,
"learning_rate": 9.048784180705573e-05,
"loss": 0.1525,
"step": 6200
},
{
"epoch": 0.8390851265394506,
"eval_accuracy": 0.9190258751902588,
"eval_loss": 0.24334192276000977,
"eval_runtime": 585.6983,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 6200
},
{
"epoch": 0.8404384896467723,
"grad_norm": 0.0004358985461294651,
"learning_rate": 9.044159166343695e-05,
"loss": 0.2265,
"step": 6210
},
{
"epoch": 0.841791852754094,
"grad_norm": 5.295452117919922,
"learning_rate": 9.039524123340572e-05,
"loss": 0.3064,
"step": 6220
},
{
"epoch": 0.8431452158614157,
"grad_norm": 3.436157203395851e-05,
"learning_rate": 9.034879063190116e-05,
"loss": 0.212,
"step": 6230
},
{
"epoch": 0.8444985789687373,
"grad_norm": 14.672189712524414,
"learning_rate": 9.030223997411067e-05,
"loss": 0.444,
"step": 6240
},
{
"epoch": 0.845851942076059,
"grad_norm": 15.262385368347168,
"learning_rate": 9.025558937546988e-05,
"loss": 0.4573,
"step": 6250
},
{
"epoch": 0.8472053051833807,
"grad_norm": 13.051666259765625,
"learning_rate": 9.020883895166214e-05,
"loss": 0.4719,
"step": 6260
},
{
"epoch": 0.8485586682907024,
"grad_norm": 4.082529544830322,
"learning_rate": 9.016198881861845e-05,
"loss": 0.0876,
"step": 6270
},
{
"epoch": 0.8499120313980241,
"grad_norm": 0.01626618765294552,
"learning_rate": 9.0115039092517e-05,
"loss": 0.2108,
"step": 6280
},
{
"epoch": 0.8512653945053458,
"grad_norm": 2.203218698501587,
"learning_rate": 9.006798988978298e-05,
"loss": 0.0862,
"step": 6290
},
{
"epoch": 0.8526187576126675,
"grad_norm": 7.0966997146606445,
"learning_rate": 9.002084132708823e-05,
"loss": 0.5941,
"step": 6300
},
{
"epoch": 0.8526187576126675,
"eval_accuracy": 0.9123287671232877,
"eval_loss": 0.4010595679283142,
"eval_runtime": 585.622,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 6300
},
{
"epoch": 0.8539721207199892,
"grad_norm": 0.00025126567925326526,
"learning_rate": 8.997359352135105e-05,
"loss": 0.3468,
"step": 6310
},
{
"epoch": 0.8553254838273109,
"grad_norm": 0.005354021675884724,
"learning_rate": 8.992624658973574e-05,
"loss": 0.2055,
"step": 6320
},
{
"epoch": 0.8566788469346326,
"grad_norm": 20.462688446044922,
"learning_rate": 8.987880064965249e-05,
"loss": 0.316,
"step": 6330
},
{
"epoch": 0.8580322100419543,
"grad_norm": 6.252931118011475,
"learning_rate": 8.983125581875698e-05,
"loss": 0.348,
"step": 6340
},
{
"epoch": 0.859385573149276,
"grad_norm": 12.182239532470703,
"learning_rate": 8.978361221495013e-05,
"loss": 0.1299,
"step": 6350
},
{
"epoch": 0.8607389362565977,
"grad_norm": 16.952531814575195,
"learning_rate": 8.973586995637778e-05,
"loss": 0.1622,
"step": 6360
},
{
"epoch": 0.8620922993639193,
"grad_norm": 1.5264373359968886e-05,
"learning_rate": 8.968802916143039e-05,
"loss": 0.2455,
"step": 6370
},
{
"epoch": 0.863445662471241,
"grad_norm": 0.003773150034248829,
"learning_rate": 8.964008994874285e-05,
"loss": 0.3139,
"step": 6380
},
{
"epoch": 0.8647990255785627,
"grad_norm": 5.219977378845215,
"learning_rate": 8.959205243719402e-05,
"loss": 0.5018,
"step": 6390
},
{
"epoch": 0.8661523886858844,
"grad_norm": 0.11269356310367584,
"learning_rate": 8.954391674590656e-05,
"loss": 0.0723,
"step": 6400
},
{
"epoch": 0.8661523886858844,
"eval_accuracy": 0.9208523592085236,
"eval_loss": 0.26359477639198303,
"eval_runtime": 585.3471,
"eval_samples_per_second": 5.612,
"eval_steps_per_second": 5.612,
"step": 6400
},
{
"epoch": 0.8675057517932061,
"grad_norm": 0.05086831748485565,
"learning_rate": 8.949568299424659e-05,
"loss": 0.1954,
"step": 6410
},
{
"epoch": 0.8688591149005278,
"grad_norm": 2.221594149887096e-05,
"learning_rate": 8.94473513018234e-05,
"loss": 0.352,
"step": 6420
},
{
"epoch": 0.8702124780078495,
"grad_norm": 0.014133933931589127,
"learning_rate": 8.939892178848914e-05,
"loss": 0.054,
"step": 6430
},
{
"epoch": 0.8715658411151712,
"grad_norm": 3.5625367164611816,
"learning_rate": 8.935039457433857e-05,
"loss": 0.217,
"step": 6440
},
{
"epoch": 0.8729192042224929,
"grad_norm": 0.15607240796089172,
"learning_rate": 8.930176977970864e-05,
"loss": 0.2095,
"step": 6450
},
{
"epoch": 0.8742725673298146,
"grad_norm": 47.42460250854492,
"learning_rate": 8.92530475251784e-05,
"loss": 0.7065,
"step": 6460
},
{
"epoch": 0.8756259304371363,
"grad_norm": 21.992969512939453,
"learning_rate": 8.920422793156847e-05,
"loss": 0.5967,
"step": 6470
},
{
"epoch": 0.876979293544458,
"grad_norm": 12.470071792602539,
"learning_rate": 8.915531111994094e-05,
"loss": 0.2819,
"step": 6480
},
{
"epoch": 0.8783326566517796,
"grad_norm": 12.152610778808594,
"learning_rate": 8.910629721159892e-05,
"loss": 0.2951,
"step": 6490
},
{
"epoch": 0.8796860197591013,
"grad_norm": 18.442096710205078,
"learning_rate": 8.90571863280863e-05,
"loss": 0.2415,
"step": 6500
},
{
"epoch": 0.8796860197591013,
"eval_accuracy": 0.9178082191780822,
"eval_loss": 0.24678540229797363,
"eval_runtime": 585.4998,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 6500
},
{
"epoch": 0.881039382866423,
"grad_norm": 14.306652069091797,
"learning_rate": 8.900797859118748e-05,
"loss": 0.4111,
"step": 6510
},
{
"epoch": 0.8823927459737447,
"grad_norm": 15.398430824279785,
"learning_rate": 8.895867412292702e-05,
"loss": 0.1817,
"step": 6520
},
{
"epoch": 0.8837461090810664,
"grad_norm": 0.8829298615455627,
"learning_rate": 8.890927304556935e-05,
"loss": 0.0695,
"step": 6530
},
{
"epoch": 0.8850994721883881,
"grad_norm": 19.324243545532227,
"learning_rate": 8.885977548161848e-05,
"loss": 0.2991,
"step": 6540
},
{
"epoch": 0.8864528352957098,
"grad_norm": 4.953662872314453,
"learning_rate": 8.881018155381766e-05,
"loss": 0.6572,
"step": 6550
},
{
"epoch": 0.8878061984030315,
"grad_norm": 5.32732629776001,
"learning_rate": 8.876049138514912e-05,
"loss": 0.3841,
"step": 6560
},
{
"epoch": 0.8891595615103532,
"grad_norm": 9.986154556274414,
"learning_rate": 8.871070509883377e-05,
"loss": 0.1994,
"step": 6570
},
{
"epoch": 0.890512924617675,
"grad_norm": 16.808300018310547,
"learning_rate": 8.866082281833081e-05,
"loss": 0.2473,
"step": 6580
},
{
"epoch": 0.8918662877249967,
"grad_norm": 12.585075378417969,
"learning_rate": 8.861084466733757e-05,
"loss": 0.7061,
"step": 6590
},
{
"epoch": 0.8932196508323184,
"grad_norm": 1.7754485607147217,
"learning_rate": 8.856077076978902e-05,
"loss": 0.214,
"step": 6600
},
{
"epoch": 0.8932196508323184,
"eval_accuracy": 0.9114155251141552,
"eval_loss": 0.24676522612571716,
"eval_runtime": 585.6426,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 6600
},
{
"epoch": 0.8945730139396401,
"grad_norm": 21.269773483276367,
"learning_rate": 8.851060124985765e-05,
"loss": 0.3384,
"step": 6610
},
{
"epoch": 0.8959263770469617,
"grad_norm": 12.510165214538574,
"learning_rate": 8.8460336231953e-05,
"loss": 0.4515,
"step": 6620
},
{
"epoch": 0.8972797401542834,
"grad_norm": 0.671404242515564,
"learning_rate": 8.840997584072149e-05,
"loss": 0.2762,
"step": 6630
},
{
"epoch": 0.8986331032616051,
"grad_norm": 0.46358880400657654,
"learning_rate": 8.835952020104598e-05,
"loss": 0.0521,
"step": 6640
},
{
"epoch": 0.8999864663689268,
"grad_norm": 18.224842071533203,
"learning_rate": 8.830896943804557e-05,
"loss": 0.4169,
"step": 6650
},
{
"epoch": 0.9013398294762485,
"grad_norm": 0.3978288471698761,
"learning_rate": 8.825832367707525e-05,
"loss": 0.2784,
"step": 6660
},
{
"epoch": 0.9026931925835702,
"grad_norm": 20.021671295166016,
"learning_rate": 8.820758304372557e-05,
"loss": 0.2109,
"step": 6670
},
{
"epoch": 0.9040465556908919,
"grad_norm": 1.5909063816070557,
"learning_rate": 8.815674766382232e-05,
"loss": 0.2103,
"step": 6680
},
{
"epoch": 0.9053999187982136,
"grad_norm": 1.8711028099060059,
"learning_rate": 8.810581766342626e-05,
"loss": 0.4238,
"step": 6690
},
{
"epoch": 0.9067532819055353,
"grad_norm": 31.497356414794922,
"learning_rate": 8.80547931688328e-05,
"loss": 0.2431,
"step": 6700
},
{
"epoch": 0.9067532819055353,
"eval_accuracy": 0.919634703196347,
"eval_loss": 0.2998688519001007,
"eval_runtime": 585.4623,
"eval_samples_per_second": 5.611,
"eval_steps_per_second": 5.611,
"step": 6700
},
{
"epoch": 0.908106645012857,
"grad_norm": 12.628376960754395,
"learning_rate": 8.800367430657168e-05,
"loss": 0.1661,
"step": 6710
},
{
"epoch": 0.9094600081201787,
"grad_norm": 0.07297774404287338,
"learning_rate": 8.795246120340663e-05,
"loss": 0.0459,
"step": 6720
},
{
"epoch": 0.9108133712275004,
"grad_norm": 28.95281982421875,
"learning_rate": 8.790115398633507e-05,
"loss": 0.567,
"step": 6730
},
{
"epoch": 0.912166734334822,
"grad_norm": 12.253477096557617,
"learning_rate": 8.784975278258783e-05,
"loss": 0.226,
"step": 6740
},
{
"epoch": 0.9135200974421437,
"grad_norm": 19.2222900390625,
"learning_rate": 8.779825771962879e-05,
"loss": 0.1918,
"step": 6750
},
{
"epoch": 0.9148734605494654,
"grad_norm": 0.03731995075941086,
"learning_rate": 8.774666892515458e-05,
"loss": 0.1789,
"step": 6760
},
{
"epoch": 0.9162268236567871,
"grad_norm": 1.9367414712905884,
"learning_rate": 8.769498652709427e-05,
"loss": 0.1,
"step": 6770
},
{
"epoch": 0.9175801867641088,
"grad_norm": 4.96419534101733e-06,
"learning_rate": 8.764321065360905e-05,
"loss": 0.9138,
"step": 6780
},
{
"epoch": 0.9189335498714305,
"grad_norm": 9.806981086730957,
"learning_rate": 8.759134143309189e-05,
"loss": 0.2375,
"step": 6790
},
{
"epoch": 0.9202869129787522,
"grad_norm": 9.017184493131936e-05,
"learning_rate": 8.753937899416727e-05,
"loss": 0.5315,
"step": 6800
},
{
"epoch": 0.9202869129787522,
"eval_accuracy": 0.9229832572298325,
"eval_loss": 0.3090192675590515,
"eval_runtime": 585.6276,
"eval_samples_per_second": 5.609,
"eval_steps_per_second": 5.609,
"step": 6800
},
{
"epoch": 0.9216402760860739,
"grad_norm": 21.234371185302734,
"learning_rate": 8.74873234656908e-05,
"loss": 0.3668,
"step": 6810
},
{
"epoch": 0.9229936391933956,
"grad_norm": 4.857106685638428,
"learning_rate": 8.743517497674896e-05,
"loss": 0.3368,
"step": 6820
},
{
"epoch": 0.9243470023007173,
"grad_norm": 0.0022790771909058094,
"learning_rate": 8.738293365665872e-05,
"loss": 0.0965,
"step": 6830
},
{
"epoch": 0.925700365408039,
"grad_norm": 25.66297149658203,
"learning_rate": 8.73305996349673e-05,
"loss": 0.5797,
"step": 6840
},
{
"epoch": 0.9270537285153607,
"grad_norm": 0.9596951603889465,
"learning_rate": 8.727817304145176e-05,
"loss": 0.2861,
"step": 6850
},
{
"epoch": 0.9284070916226823,
"grad_norm": 0.6504172086715698,
"learning_rate": 8.722565400611872e-05,
"loss": 0.1645,
"step": 6860
},
{
"epoch": 0.929760454730004,
"grad_norm": 0.5617231130599976,
"learning_rate": 8.717304265920402e-05,
"loss": 0.5579,
"step": 6870
},
{
"epoch": 0.9311138178373257,
"grad_norm": 8.67705249786377,
"learning_rate": 8.71203391311725e-05,
"loss": 0.6897,
"step": 6880
},
{
"epoch": 0.9324671809446474,
"grad_norm": 8.441317558288574,
"learning_rate": 8.70675435527175e-05,
"loss": 0.1798,
"step": 6890
},
{
"epoch": 0.9338205440519691,
"grad_norm": 11.730077743530273,
"learning_rate": 8.70146560547606e-05,
"loss": 0.1287,
"step": 6900
},
{
"epoch": 0.9338205440519691,
"eval_accuracy": 0.9187214611872146,
"eval_loss": 0.22067341208457947,
"eval_runtime": 585.0326,
"eval_samples_per_second": 5.615,
"eval_steps_per_second": 5.615,
"step": 6900
}
],
"logging_steps": 10,
"max_steps": 22167,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}