{
"best_metric": 0.06736895442008972,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 1.2364760432766615,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0061823802163833074,
"grad_norm": 0.7046732306480408,
"learning_rate": 1e-05,
"loss": 0.9014,
"step": 1
},
{
"epoch": 0.0061823802163833074,
"eval_loss": 1.4344593286514282,
"eval_runtime": 5.499,
"eval_samples_per_second": 49.646,
"eval_steps_per_second": 12.548,
"step": 1
},
{
"epoch": 0.012364760432766615,
"grad_norm": 0.7162360548973083,
"learning_rate": 2e-05,
"loss": 0.9038,
"step": 2
},
{
"epoch": 0.01854714064914992,
"grad_norm": 0.7014511823654175,
"learning_rate": 3e-05,
"loss": 0.9134,
"step": 3
},
{
"epoch": 0.02472952086553323,
"grad_norm": 0.704330325126648,
"learning_rate": 4e-05,
"loss": 0.9097,
"step": 4
},
{
"epoch": 0.030911901081916538,
"grad_norm": 0.7009629607200623,
"learning_rate": 5e-05,
"loss": 0.8809,
"step": 5
},
{
"epoch": 0.03709428129829984,
"grad_norm": 0.6580602526664734,
"learning_rate": 6e-05,
"loss": 0.8479,
"step": 6
},
{
"epoch": 0.04327666151468315,
"grad_norm": 0.5885988473892212,
"learning_rate": 7e-05,
"loss": 0.7923,
"step": 7
},
{
"epoch": 0.04945904173106646,
"grad_norm": 0.5091739296913147,
"learning_rate": 8e-05,
"loss": 0.7074,
"step": 8
},
{
"epoch": 0.05564142194744977,
"grad_norm": 0.4579506814479828,
"learning_rate": 9e-05,
"loss": 0.6569,
"step": 9
},
{
"epoch": 0.061823802163833076,
"grad_norm": 0.4280666410923004,
"learning_rate": 0.0001,
"loss": 0.6187,
"step": 10
},
{
"epoch": 0.06800618238021638,
"grad_norm": 0.45351147651672363,
"learning_rate": 9.999316524962345e-05,
"loss": 0.4839,
"step": 11
},
{
"epoch": 0.07418856259659969,
"grad_norm": 0.4613342881202698,
"learning_rate": 9.997266286704631e-05,
"loss": 0.4408,
"step": 12
},
{
"epoch": 0.080370942812983,
"grad_norm": 0.7251604795455933,
"learning_rate": 9.993849845741524e-05,
"loss": 0.3717,
"step": 13
},
{
"epoch": 0.0865533230293663,
"grad_norm": 0.6855277419090271,
"learning_rate": 9.989068136093873e-05,
"loss": 0.6104,
"step": 14
},
{
"epoch": 0.09273570324574962,
"grad_norm": 1.5776333808898926,
"learning_rate": 9.98292246503335e-05,
"loss": 1.1992,
"step": 15
},
{
"epoch": 0.09891808346213292,
"grad_norm": 1.2704423666000366,
"learning_rate": 9.975414512725057e-05,
"loss": 1.1392,
"step": 16
},
{
"epoch": 0.10510046367851623,
"grad_norm": 1.2776545286178589,
"learning_rate": 9.966546331768191e-05,
"loss": 1.0187,
"step": 17
},
{
"epoch": 0.11128284389489954,
"grad_norm": 1.3643944263458252,
"learning_rate": 9.956320346634876e-05,
"loss": 0.9102,
"step": 18
},
{
"epoch": 0.11746522411128284,
"grad_norm": 1.7785018682479858,
"learning_rate": 9.944739353007344e-05,
"loss": 0.6558,
"step": 19
},
{
"epoch": 0.12364760432766615,
"grad_norm": 1.257607102394104,
"learning_rate": 9.931806517013612e-05,
"loss": 0.5436,
"step": 20
},
{
"epoch": 0.12982998454404945,
"grad_norm": 1.1651414632797241,
"learning_rate": 9.917525374361912e-05,
"loss": 0.3782,
"step": 21
},
{
"epoch": 0.13601236476043277,
"grad_norm": 0.8770898580551147,
"learning_rate": 9.901899829374047e-05,
"loss": 0.2996,
"step": 22
},
{
"epoch": 0.14219474497681608,
"grad_norm": 0.7041931748390198,
"learning_rate": 9.884934153917997e-05,
"loss": 0.2509,
"step": 23
},
{
"epoch": 0.14837712519319937,
"grad_norm": 0.41992565989494324,
"learning_rate": 9.86663298624003e-05,
"loss": 0.1883,
"step": 24
},
{
"epoch": 0.1545595054095827,
"grad_norm": 0.543204665184021,
"learning_rate": 9.847001329696653e-05,
"loss": 0.2283,
"step": 25
},
{
"epoch": 0.160741885625966,
"grad_norm": 0.3455043137073517,
"learning_rate": 9.826044551386744e-05,
"loss": 0.1566,
"step": 26
},
{
"epoch": 0.16692426584234932,
"grad_norm": 0.38509395718574524,
"learning_rate": 9.803768380684242e-05,
"loss": 0.1338,
"step": 27
},
{
"epoch": 0.1731066460587326,
"grad_norm": 0.3171514868736267,
"learning_rate": 9.780178907671789e-05,
"loss": 0.1165,
"step": 28
},
{
"epoch": 0.17928902627511592,
"grad_norm": 0.3930993974208832,
"learning_rate": 9.755282581475769e-05,
"loss": 0.1189,
"step": 29
},
{
"epoch": 0.18547140649149924,
"grad_norm": 0.42565444111824036,
"learning_rate": 9.729086208503174e-05,
"loss": 0.1265,
"step": 30
},
{
"epoch": 0.19165378670788252,
"grad_norm": 0.4470174312591553,
"learning_rate": 9.701596950580806e-05,
"loss": 0.1534,
"step": 31
},
{
"epoch": 0.19783616692426584,
"grad_norm": 0.3107547163963318,
"learning_rate": 9.672822322997305e-05,
"loss": 0.1768,
"step": 32
},
{
"epoch": 0.20401854714064915,
"grad_norm": 0.3041976988315582,
"learning_rate": 9.642770192448536e-05,
"loss": 0.1344,
"step": 33
},
{
"epoch": 0.21020092735703247,
"grad_norm": 0.3155347406864166,
"learning_rate": 9.611448774886924e-05,
"loss": 0.1546,
"step": 34
},
{
"epoch": 0.21638330757341576,
"grad_norm": 0.2196689397096634,
"learning_rate": 9.578866633275288e-05,
"loss": 0.1661,
"step": 35
},
{
"epoch": 0.22256568778979907,
"grad_norm": 0.2337467223405838,
"learning_rate": 9.545032675245813e-05,
"loss": 0.1684,
"step": 36
},
{
"epoch": 0.2287480680061824,
"grad_norm": 0.2257539927959442,
"learning_rate": 9.509956150664796e-05,
"loss": 0.1481,
"step": 37
},
{
"epoch": 0.23493044822256567,
"grad_norm": 0.1821281909942627,
"learning_rate": 9.473646649103818e-05,
"loss": 0.1748,
"step": 38
},
{
"epoch": 0.241112828438949,
"grad_norm": 0.16585831344127655,
"learning_rate": 9.43611409721806e-05,
"loss": 0.1309,
"step": 39
},
{
"epoch": 0.2472952086553323,
"grad_norm": 0.12429387122392654,
"learning_rate": 9.397368756032445e-05,
"loss": 0.1453,
"step": 40
},
{
"epoch": 0.2534775888717156,
"grad_norm": 7.558984756469727,
"learning_rate": 9.357421218136386e-05,
"loss": 0.911,
"step": 41
},
{
"epoch": 0.2596599690880989,
"grad_norm": 5.058866024017334,
"learning_rate": 9.316282404787871e-05,
"loss": 0.516,
"step": 42
},
{
"epoch": 0.26584234930448225,
"grad_norm": 2.027780532836914,
"learning_rate": 9.273963562927695e-05,
"loss": 0.2534,
"step": 43
},
{
"epoch": 0.27202472952086554,
"grad_norm": 0.9123561978340149,
"learning_rate": 9.230476262104677e-05,
"loss": 0.1345,
"step": 44
},
{
"epoch": 0.2782071097372488,
"grad_norm": 0.3587602376937866,
"learning_rate": 9.185832391312644e-05,
"loss": 0.0774,
"step": 45
},
{
"epoch": 0.28438948995363217,
"grad_norm": 0.21639949083328247,
"learning_rate": 9.140044155740101e-05,
"loss": 0.0637,
"step": 46
},
{
"epoch": 0.29057187017001546,
"grad_norm": 0.18078401684761047,
"learning_rate": 9.093124073433463e-05,
"loss": 0.0398,
"step": 47
},
{
"epoch": 0.29675425038639874,
"grad_norm": 0.15149831771850586,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0507,
"step": 48
},
{
"epoch": 0.3029366306027821,
"grad_norm": 0.13558559119701385,
"learning_rate": 8.995939984474624e-05,
"loss": 0.0564,
"step": 49
},
{
"epoch": 0.3091190108191654,
"grad_norm": 0.13863137364387512,
"learning_rate": 8.945702546981969e-05,
"loss": 0.045,
"step": 50
},
{
"epoch": 0.3091190108191654,
"eval_loss": 0.08772445470094681,
"eval_runtime": 5.5012,
"eval_samples_per_second": 49.626,
"eval_steps_per_second": 12.543,
"step": 50
},
{
"epoch": 0.31530139103554866,
"grad_norm": 0.09701208025217056,
"learning_rate": 8.894386393810563e-05,
"loss": 0.0436,
"step": 51
},
{
"epoch": 0.321483771251932,
"grad_norm": 0.09931997954845428,
"learning_rate": 8.842005554284296e-05,
"loss": 0.036,
"step": 52
},
{
"epoch": 0.3276661514683153,
"grad_norm": 0.1121208667755127,
"learning_rate": 8.788574348801675e-05,
"loss": 0.039,
"step": 53
},
{
"epoch": 0.33384853168469864,
"grad_norm": 0.2304030805826187,
"learning_rate": 8.73410738492077e-05,
"loss": 0.0666,
"step": 54
},
{
"epoch": 0.3400309119010819,
"grad_norm": 0.45196354389190674,
"learning_rate": 8.678619553365659e-05,
"loss": 0.0818,
"step": 55
},
{
"epoch": 0.3462132921174652,
"grad_norm": 0.20915654301643372,
"learning_rate": 8.622126023955446e-05,
"loss": 0.0785,
"step": 56
},
{
"epoch": 0.35239567233384855,
"grad_norm": 0.10331236571073532,
"learning_rate": 8.564642241456986e-05,
"loss": 0.0947,
"step": 57
},
{
"epoch": 0.35857805255023184,
"grad_norm": 0.09945619106292725,
"learning_rate": 8.506183921362443e-05,
"loss": 0.0841,
"step": 58
},
{
"epoch": 0.36476043276661513,
"grad_norm": 0.08608353137969971,
"learning_rate": 8.44676704559283e-05,
"loss": 0.099,
"step": 59
},
{
"epoch": 0.37094281298299847,
"grad_norm": 0.06971795856952667,
"learning_rate": 8.386407858128706e-05,
"loss": 0.0909,
"step": 60
},
{
"epoch": 0.37712519319938176,
"grad_norm": 0.08237221837043762,
"learning_rate": 8.32512286056924e-05,
"loss": 0.0613,
"step": 61
},
{
"epoch": 0.38330757341576505,
"grad_norm": 0.08594591915607452,
"learning_rate": 8.262928807620843e-05,
"loss": 0.0792,
"step": 62
},
{
"epoch": 0.3894899536321484,
"grad_norm": 0.06339319050312042,
"learning_rate": 8.199842702516583e-05,
"loss": 0.061,
"step": 63
},
{
"epoch": 0.3956723338485317,
"grad_norm": 0.0899580791592598,
"learning_rate": 8.135881792367686e-05,
"loss": 0.0738,
"step": 64
},
{
"epoch": 0.401854714064915,
"grad_norm": 0.06145675107836723,
"learning_rate": 8.07106356344834e-05,
"loss": 0.0988,
"step": 65
},
{
"epoch": 0.4080370942812983,
"grad_norm": 0.05957213416695595,
"learning_rate": 8.005405736415126e-05,
"loss": 0.0752,
"step": 66
},
{
"epoch": 0.4142194744976816,
"grad_norm": 0.08227390795946121,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0973,
"step": 67
},
{
"epoch": 0.42040185471406494,
"grad_norm": 0.05563255399465561,
"learning_rate": 7.871643313414718e-05,
"loss": 0.0708,
"step": 68
},
{
"epoch": 0.4265842349304482,
"grad_norm": 0.09192028641700745,
"learning_rate": 7.803575286758364e-05,
"loss": 0.0843,
"step": 69
},
{
"epoch": 0.4327666151468315,
"grad_norm": 0.05860935151576996,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0711,
"step": 70
},
{
"epoch": 0.43894899536321486,
"grad_norm": 0.09385299682617188,
"learning_rate": 7.66515864363997e-05,
"loss": 0.0991,
"step": 71
},
{
"epoch": 0.44513137557959814,
"grad_norm": 0.12418725341558456,
"learning_rate": 7.594847868906076e-05,
"loss": 0.1394,
"step": 72
},
{
"epoch": 0.45131375579598143,
"grad_norm": 0.11011894792318344,
"learning_rate": 7.52382768867422e-05,
"loss": 0.1307,
"step": 73
},
{
"epoch": 0.4574961360123648,
"grad_norm": 0.14699475467205048,
"learning_rate": 7.452117519152542e-05,
"loss": 0.1735,
"step": 74
},
{
"epoch": 0.46367851622874806,
"grad_norm": 0.1061205342411995,
"learning_rate": 7.379736965185368e-05,
"loss": 0.1722,
"step": 75
},
{
"epoch": 0.46986089644513135,
"grad_norm": 0.13778509199619293,
"learning_rate": 7.30670581489344e-05,
"loss": 0.1887,
"step": 76
},
{
"epoch": 0.4760432766615147,
"grad_norm": 0.14930473268032074,
"learning_rate": 7.233044034264034e-05,
"loss": 0.1915,
"step": 77
},
{
"epoch": 0.482225656877898,
"grad_norm": 0.12359173595905304,
"learning_rate": 7.158771761692464e-05,
"loss": 0.1521,
"step": 78
},
{
"epoch": 0.4884080370942813,
"grad_norm": 0.16283033788204193,
"learning_rate": 7.083909302476453e-05,
"loss": 0.1192,
"step": 79
},
{
"epoch": 0.4945904173106646,
"grad_norm": 0.17497539520263672,
"learning_rate": 7.008477123264848e-05,
"loss": 0.1121,
"step": 80
},
{
"epoch": 0.500772797527048,
"grad_norm": 0.5904685258865356,
"learning_rate": 6.932495846462261e-05,
"loss": 0.0451,
"step": 81
},
{
"epoch": 0.5069551777434312,
"grad_norm": 0.060611702501773834,
"learning_rate": 6.855986244591104e-05,
"loss": 0.0322,
"step": 82
},
{
"epoch": 0.5131375579598145,
"grad_norm": 0.03139381855726242,
"learning_rate": 6.778969234612584e-05,
"loss": 0.0281,
"step": 83
},
{
"epoch": 0.5193199381761978,
"grad_norm": 0.03067929297685623,
"learning_rate": 6.701465872208216e-05,
"loss": 0.0328,
"step": 84
},
{
"epoch": 0.5255023183925811,
"grad_norm": 0.026165900751948357,
"learning_rate": 6.623497346023418e-05,
"loss": 0.0367,
"step": 85
},
{
"epoch": 0.5316846986089645,
"grad_norm": 0.030481263995170593,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0337,
"step": 86
},
{
"epoch": 0.5378670788253478,
"grad_norm": 0.028614336624741554,
"learning_rate": 6.466250186922325e-05,
"loss": 0.0266,
"step": 87
},
{
"epoch": 0.5440494590417311,
"grad_norm": 0.029182005673646927,
"learning_rate": 6.387014543809223e-05,
"loss": 0.0336,
"step": 88
},
{
"epoch": 0.5502318392581144,
"grad_norm": 0.03444831073284149,
"learning_rate": 6.307399704769099e-05,
"loss": 0.0318,
"step": 89
},
{
"epoch": 0.5564142194744977,
"grad_norm": 0.024809561669826508,
"learning_rate": 6.227427435703997e-05,
"loss": 0.0351,
"step": 90
},
{
"epoch": 0.5625965996908809,
"grad_norm": 0.036650341004133224,
"learning_rate": 6.147119600233758e-05,
"loss": 0.0423,
"step": 91
},
{
"epoch": 0.5687789799072643,
"grad_norm": 0.02630997821688652,
"learning_rate": 6.066498153718735e-05,
"loss": 0.0235,
"step": 92
},
{
"epoch": 0.5749613601236476,
"grad_norm": 0.023355038836598396,
"learning_rate": 5.985585137257401e-05,
"loss": 0.0256,
"step": 93
},
{
"epoch": 0.5811437403400309,
"grad_norm": 0.03157835826277733,
"learning_rate": 5.90440267166055e-05,
"loss": 0.0377,
"step": 94
},
{
"epoch": 0.5873261205564142,
"grad_norm": 0.031935982406139374,
"learning_rate": 5.8229729514036705e-05,
"loss": 0.037,
"step": 95
},
{
"epoch": 0.5935085007727975,
"grad_norm": 0.07638601213693619,
"learning_rate": 5.74131823855921e-05,
"loss": 0.0734,
"step": 96
},
{
"epoch": 0.5996908809891809,
"grad_norm": 0.07356469333171844,
"learning_rate": 5.6594608567103456e-05,
"loss": 0.0647,
"step": 97
},
{
"epoch": 0.6058732612055642,
"grad_norm": 0.08144511282444,
"learning_rate": 5.577423184847932e-05,
"loss": 0.0822,
"step": 98
},
{
"epoch": 0.6120556414219475,
"grad_norm": 0.08834308385848999,
"learning_rate": 5.495227651252315e-05,
"loss": 0.0773,
"step": 99
},
{
"epoch": 0.6182380216383307,
"grad_norm": 0.06142331287264824,
"learning_rate": 5.4128967273616625e-05,
"loss": 0.0692,
"step": 100
},
{
"epoch": 0.6182380216383307,
"eval_loss": 0.07065095752477646,
"eval_runtime": 5.5131,
"eval_samples_per_second": 49.518,
"eval_steps_per_second": 12.516,
"step": 100
},
{
"epoch": 0.624420401854714,
"grad_norm": 0.0861298069357872,
"learning_rate": 5.330452921628497e-05,
"loss": 0.0919,
"step": 101
},
{
"epoch": 0.6306027820710973,
"grad_norm": 0.04875423386693001,
"learning_rate": 5.247918773366112e-05,
"loss": 0.0512,
"step": 102
},
{
"epoch": 0.6367851622874807,
"grad_norm": 0.08116843551397324,
"learning_rate": 5.165316846586541e-05,
"loss": 0.0914,
"step": 103
},
{
"epoch": 0.642967542503864,
"grad_norm": 0.07430719584226608,
"learning_rate": 5.0826697238317935e-05,
"loss": 0.0905,
"step": 104
},
{
"epoch": 0.6491499227202473,
"grad_norm": 0.0565401054918766,
"learning_rate": 5e-05,
"loss": 0.08,
"step": 105
},
{
"epoch": 0.6553323029366306,
"grad_norm": 0.07614312320947647,
"learning_rate": 4.917330276168208e-05,
"loss": 0.0687,
"step": 106
},
{
"epoch": 0.6615146831530139,
"grad_norm": 0.05415325611829758,
"learning_rate": 4.834683153413459e-05,
"loss": 0.1018,
"step": 107
},
{
"epoch": 0.6676970633693973,
"grad_norm": 0.05803875997662544,
"learning_rate": 4.7520812266338885e-05,
"loss": 0.0738,
"step": 108
},
{
"epoch": 0.6738794435857806,
"grad_norm": 0.07506942749023438,
"learning_rate": 4.669547078371504e-05,
"loss": 0.0585,
"step": 109
},
{
"epoch": 0.6800618238021638,
"grad_norm": 0.08233385533094406,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.1079,
"step": 110
},
{
"epoch": 0.6862442040185471,
"grad_norm": 0.1070331260561943,
"learning_rate": 4.504772348747687e-05,
"loss": 0.1187,
"step": 111
},
{
"epoch": 0.6924265842349304,
"grad_norm": 0.07188671827316284,
"learning_rate": 4.4225768151520694e-05,
"loss": 0.1203,
"step": 112
},
{
"epoch": 0.6986089644513137,
"grad_norm": 0.10996810346841812,
"learning_rate": 4.3405391432896555e-05,
"loss": 0.1151,
"step": 113
},
{
"epoch": 0.7047913446676971,
"grad_norm": 0.10571317374706268,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.1444,
"step": 114
},
{
"epoch": 0.7109737248840804,
"grad_norm": 0.12080641835927963,
"learning_rate": 4.17702704859633e-05,
"loss": 0.1631,
"step": 115
},
{
"epoch": 0.7171561051004637,
"grad_norm": 0.12236452102661133,
"learning_rate": 4.095597328339452e-05,
"loss": 0.1392,
"step": 116
},
{
"epoch": 0.723338485316847,
"grad_norm": 0.10123718529939651,
"learning_rate": 4.0144148627425993e-05,
"loss": 0.1507,
"step": 117
},
{
"epoch": 0.7295208655332303,
"grad_norm": 0.10937009751796722,
"learning_rate": 3.933501846281267e-05,
"loss": 0.1077,
"step": 118
},
{
"epoch": 0.7357032457496137,
"grad_norm": 0.10601434111595154,
"learning_rate": 3.852880399766243e-05,
"loss": 0.0933,
"step": 119
},
{
"epoch": 0.7418856259659969,
"grad_norm": 0.12326169013977051,
"learning_rate": 3.772572564296005e-05,
"loss": 0.1243,
"step": 120
},
{
"epoch": 0.7480680061823802,
"grad_norm": 0.03345995396375656,
"learning_rate": 3.6926002952309016e-05,
"loss": 0.0396,
"step": 121
},
{
"epoch": 0.7542503863987635,
"grad_norm": 0.02183113619685173,
"learning_rate": 3.612985456190778e-05,
"loss": 0.0241,
"step": 122
},
{
"epoch": 0.7604327666151468,
"grad_norm": 0.034045446664094925,
"learning_rate": 3.533749813077677e-05,
"loss": 0.0311,
"step": 123
},
{
"epoch": 0.7666151468315301,
"grad_norm": 0.02918507158756256,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0324,
"step": 124
},
{
"epoch": 0.7727975270479135,
"grad_norm": 0.03378276899456978,
"learning_rate": 3.3765026539765834e-05,
"loss": 0.0376,
"step": 125
},
{
"epoch": 0.7789799072642968,
"grad_norm": 0.034735891968011856,
"learning_rate": 3.298534127791785e-05,
"loss": 0.0361,
"step": 126
},
{
"epoch": 0.7851622874806801,
"grad_norm": 0.02379162609577179,
"learning_rate": 3.221030765387417e-05,
"loss": 0.03,
"step": 127
},
{
"epoch": 0.7913446676970634,
"grad_norm": 0.03231632709503174,
"learning_rate": 3.144013755408895e-05,
"loss": 0.0378,
"step": 128
},
{
"epoch": 0.7975270479134466,
"grad_norm": 0.026890065521001816,
"learning_rate": 3.0675041535377405e-05,
"loss": 0.0321,
"step": 129
},
{
"epoch": 0.80370942812983,
"grad_norm": 0.02761516161262989,
"learning_rate": 2.991522876735154e-05,
"loss": 0.0322,
"step": 130
},
{
"epoch": 0.8098918083462133,
"grad_norm": 0.03337293490767479,
"learning_rate": 2.916090697523549e-05,
"loss": 0.0264,
"step": 131
},
{
"epoch": 0.8160741885625966,
"grad_norm": 0.02159067615866661,
"learning_rate": 2.8412282383075363e-05,
"loss": 0.0279,
"step": 132
},
{
"epoch": 0.8222565687789799,
"grad_norm": 0.024149876087903976,
"learning_rate": 2.766955965735968e-05,
"loss": 0.0258,
"step": 133
},
{
"epoch": 0.8284389489953632,
"grad_norm": 0.04995239898562431,
"learning_rate": 2.693294185106562e-05,
"loss": 0.0492,
"step": 134
},
{
"epoch": 0.8346213292117465,
"grad_norm": 0.055764585733413696,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.0626,
"step": 135
},
{
"epoch": 0.8408037094281299,
"grad_norm": 0.057029642164707184,
"learning_rate": 2.547882480847461e-05,
"loss": 0.055,
"step": 136
},
{
"epoch": 0.8469860896445132,
"grad_norm": 0.08656567335128784,
"learning_rate": 2.476172311325783e-05,
"loss": 0.0723,
"step": 137
},
{
"epoch": 0.8531684698608965,
"grad_norm": 0.0557289682328701,
"learning_rate": 2.405152131093926e-05,
"loss": 0.0699,
"step": 138
},
{
"epoch": 0.8593508500772797,
"grad_norm": 0.06906598061323166,
"learning_rate": 2.3348413563600325e-05,
"loss": 0.0703,
"step": 139
},
{
"epoch": 0.865533230293663,
"grad_norm": 0.06671453267335892,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.0716,
"step": 140
},
{
"epoch": 0.8717156105100463,
"grad_norm": 0.0541725680232048,
"learning_rate": 2.196424713241637e-05,
"loss": 0.0554,
"step": 141
},
{
"epoch": 0.8778979907264297,
"grad_norm": 0.05449613183736801,
"learning_rate": 2.128356686585282e-05,
"loss": 0.0554,
"step": 142
},
{
"epoch": 0.884080370942813,
"grad_norm": 0.06350822746753693,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0831,
"step": 143
},
{
"epoch": 0.8902627511591963,
"grad_norm": 0.058875709772109985,
"learning_rate": 1.9945942635848748e-05,
"loss": 0.0787,
"step": 144
},
{
"epoch": 0.8964451313755796,
"grad_norm": 0.0638042464852333,
"learning_rate": 1.928936436551661e-05,
"loss": 0.078,
"step": 145
},
{
"epoch": 0.9026275115919629,
"grad_norm": 0.0712515190243721,
"learning_rate": 1.8641182076323148e-05,
"loss": 0.0957,
"step": 146
},
{
"epoch": 0.9088098918083463,
"grad_norm": 0.07816080749034882,
"learning_rate": 1.800157297483417e-05,
"loss": 0.1148,
"step": 147
},
{
"epoch": 0.9149922720247295,
"grad_norm": 0.059558551758527756,
"learning_rate": 1.7370711923791567e-05,
"loss": 0.0572,
"step": 148
},
{
"epoch": 0.9211746522411128,
"grad_norm": 0.05272268131375313,
"learning_rate": 1.6748771394307585e-05,
"loss": 0.0562,
"step": 149
},
{
"epoch": 0.9273570324574961,
"grad_norm": 0.11955559998750687,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.135,
"step": 150
},
{
"epoch": 0.9273570324574961,
"eval_loss": 0.06857249140739441,
"eval_runtime": 5.514,
"eval_samples_per_second": 49.511,
"eval_steps_per_second": 12.514,
"step": 150
},
{
"epoch": 0.9335394126738794,
"grad_norm": 0.07927730679512024,
"learning_rate": 1.553232954407171e-05,
"loss": 0.1002,
"step": 151
},
{
"epoch": 0.9397217928902627,
"grad_norm": 0.11686775088310242,
"learning_rate": 1.4938160786375572e-05,
"loss": 0.1481,
"step": 152
},
{
"epoch": 0.9459041731066461,
"grad_norm": 0.09868727624416351,
"learning_rate": 1.435357758543015e-05,
"loss": 0.0949,
"step": 153
},
{
"epoch": 0.9520865533230294,
"grad_norm": 0.08376035839319229,
"learning_rate": 1.3778739760445552e-05,
"loss": 0.1368,
"step": 154
},
{
"epoch": 0.9582689335394127,
"grad_norm": 0.10080867260694504,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.0974,
"step": 155
},
{
"epoch": 0.964451313755796,
"grad_norm": 0.09424585849046707,
"learning_rate": 1.2658926150792322e-05,
"loss": 0.1178,
"step": 156
},
{
"epoch": 0.9706336939721792,
"grad_norm": 0.12882882356643677,
"learning_rate": 1.2114256511983274e-05,
"loss": 0.19,
"step": 157
},
{
"epoch": 0.9768160741885626,
"grad_norm": 0.10375382006168365,
"learning_rate": 1.157994445715706e-05,
"loss": 0.0978,
"step": 158
},
{
"epoch": 0.9829984544049459,
"grad_norm": 0.10177513211965561,
"learning_rate": 1.1056136061894384e-05,
"loss": 0.1095,
"step": 159
},
{
"epoch": 0.9891808346213292,
"grad_norm": 0.1246931403875351,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.1929,
"step": 160
},
{
"epoch": 0.9953632148377125,
"grad_norm": 0.03230239450931549,
"learning_rate": 1.0040600155253765e-05,
"loss": 0.0366,
"step": 161
},
{
"epoch": 1.001545595054096,
"grad_norm": 0.07059313356876373,
"learning_rate": 9.549150281252633e-06,
"loss": 0.0696,
"step": 162
},
{
"epoch": 1.007727975270479,
"grad_norm": 0.03946077451109886,
"learning_rate": 9.068759265665384e-06,
"loss": 0.0278,
"step": 163
},
{
"epoch": 1.0139103554868625,
"grad_norm": 0.026573503389954567,
"learning_rate": 8.599558442598998e-06,
"loss": 0.0282,
"step": 164
},
{
"epoch": 1.0200927357032457,
"grad_norm": 0.028409384191036224,
"learning_rate": 8.141676086873572e-06,
"loss": 0.03,
"step": 165
},
{
"epoch": 1.026275115919629,
"grad_norm": 0.02546733431518078,
"learning_rate": 7.695237378953223e-06,
"loss": 0.0313,
"step": 166
},
{
"epoch": 1.0324574961360125,
"grad_norm": 0.025494728237390518,
"learning_rate": 7.260364370723044e-06,
"loss": 0.0332,
"step": 167
},
{
"epoch": 1.0386398763523956,
"grad_norm": 0.027573363855481148,
"learning_rate": 6.837175952121306e-06,
"loss": 0.0325,
"step": 168
},
{
"epoch": 1.044822256568779,
"grad_norm": 0.03296926990151405,
"learning_rate": 6.425787818636131e-06,
"loss": 0.0256,
"step": 169
},
{
"epoch": 1.0510046367851622,
"grad_norm": 0.03755498304963112,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0475,
"step": 170
},
{
"epoch": 1.0571870170015456,
"grad_norm": 0.03160750865936279,
"learning_rate": 5.6388590278194096e-06,
"loss": 0.0294,
"step": 171
},
{
"epoch": 1.063369397217929,
"grad_norm": 0.030579503625631332,
"learning_rate": 5.263533508961827e-06,
"loss": 0.0346,
"step": 172
},
{
"epoch": 1.0695517774343122,
"grad_norm": 0.025405822321772575,
"learning_rate": 4.900438493352055e-06,
"loss": 0.0371,
"step": 173
},
{
"epoch": 1.0757341576506956,
"grad_norm": 0.025918610394001007,
"learning_rate": 4.549673247541875e-06,
"loss": 0.0268,
"step": 174
},
{
"epoch": 1.0819165378670788,
"grad_norm": 0.032863494008779526,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.0236,
"step": 175
},
{
"epoch": 1.0880989180834622,
"grad_norm": 0.034572046250104904,
"learning_rate": 3.885512251130763e-06,
"loss": 0.0365,
"step": 176
},
{
"epoch": 1.0942812982998453,
"grad_norm": 0.046874385327100754,
"learning_rate": 3.5722980755146517e-06,
"loss": 0.0557,
"step": 177
},
{
"epoch": 1.1004636785162287,
"grad_norm": 0.04901492968201637,
"learning_rate": 3.271776770026963e-06,
"loss": 0.0593,
"step": 178
},
{
"epoch": 1.1066460587326121,
"grad_norm": 0.05979045107960701,
"learning_rate": 2.9840304941919415e-06,
"loss": 0.0651,
"step": 179
},
{
"epoch": 1.1128284389489953,
"grad_norm": 0.05601680278778076,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.0819,
"step": 180
},
{
"epoch": 1.1190108191653787,
"grad_norm": 0.0761902779340744,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.0713,
"step": 181
},
{
"epoch": 1.125193199381762,
"grad_norm": 0.05143177509307861,
"learning_rate": 2.1982109232821178e-06,
"loss": 0.0791,
"step": 182
},
{
"epoch": 1.1313755795981453,
"grad_norm": 0.06408868730068207,
"learning_rate": 1.962316193157593e-06,
"loss": 0.0717,
"step": 183
},
{
"epoch": 1.1375579598145287,
"grad_norm": 0.05212021991610527,
"learning_rate": 1.7395544861325718e-06,
"loss": 0.0557,
"step": 184
},
{
"epoch": 1.1437403400309119,
"grad_norm": 0.05530843511223793,
"learning_rate": 1.5299867030334814e-06,
"loss": 0.074,
"step": 185
},
{
"epoch": 1.1499227202472952,
"grad_norm": 0.07621589303016663,
"learning_rate": 1.333670137599713e-06,
"loss": 0.1034,
"step": 186
},
{
"epoch": 1.1561051004636784,
"grad_norm": 0.052267253398895264,
"learning_rate": 1.1506584608200367e-06,
"loss": 0.0692,
"step": 187
},
{
"epoch": 1.1622874806800618,
"grad_norm": 0.05556660518050194,
"learning_rate": 9.810017062595322e-07,
"loss": 0.0701,
"step": 188
},
{
"epoch": 1.1684698608964452,
"grad_norm": 0.07082415372133255,
"learning_rate": 8.247462563808817e-07,
"loss": 0.0924,
"step": 189
},
{
"epoch": 1.1746522411128284,
"grad_norm": 0.05150453373789787,
"learning_rate": 6.819348298638839e-07,
"loss": 0.0879,
"step": 190
},
{
"epoch": 1.1808346213292118,
"grad_norm": 0.05410479009151459,
"learning_rate": 5.526064699265753e-07,
"loss": 0.0681,
"step": 191
},
{
"epoch": 1.187017001545595,
"grad_norm": 0.07340589910745621,
"learning_rate": 4.367965336512403e-07,
"loss": 0.0703,
"step": 192
},
{
"epoch": 1.1931993817619784,
"grad_norm": 0.06983581185340881,
"learning_rate": 3.3453668231809286e-07,
"loss": 0.0868,
"step": 193
},
{
"epoch": 1.1993817619783615,
"grad_norm": 0.09015076607465744,
"learning_rate": 2.458548727494292e-07,
"loss": 0.1295,
"step": 194
},
{
"epoch": 1.205564142194745,
"grad_norm": 0.08907996863126755,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.1115,
"step": 195
},
{
"epoch": 1.2117465224111283,
"grad_norm": 0.0957234650850296,
"learning_rate": 1.0931863906127327e-07,
"loss": 0.1177,
"step": 196
},
{
"epoch": 1.2179289026275115,
"grad_norm": 0.13698334991931915,
"learning_rate": 6.150154258476315e-08,
"loss": 0.2054,
"step": 197
},
{
"epoch": 1.224111282843895,
"grad_norm": 0.10444821417331696,
"learning_rate": 2.7337132953697554e-08,
"loss": 0.1576,
"step": 198
},
{
"epoch": 1.2302936630602783,
"grad_norm": 0.10824877768754959,
"learning_rate": 6.834750376549792e-09,
"loss": 0.1621,
"step": 199
},
{
"epoch": 1.2364760432766615,
"grad_norm": 0.08922790735960007,
"learning_rate": 0.0,
"loss": 0.1465,
"step": 200
},
{
"epoch": 1.2364760432766615,
"eval_loss": 0.06736895442008972,
"eval_runtime": 5.5074,
"eval_samples_per_second": 49.57,
"eval_steps_per_second": 12.529,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.26629570494464e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
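
For reference, a trainer_state.json like the one above can be inspected with a few lines of standard-library Python. This is a minimal sketch, assuming the file has been saved locally as checkpoint-200/trainer_state.json (the path is illustrative, not taken from the repository); it separates the per-step training entries from the evaluation entries in log_history and reports the best checkpoint.

import json

# Load the trainer state exported by the Hugging Face Trainer
# (path is an assumed local copy of the file shown above).
with open("checkpoint-200/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

for e in eval_log:
    print(f"step {e['step']:>3}: eval_loss={e['eval_loss']:.4f}")

print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])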