{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 318,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031446540880503146,
"grad_norm": 0.06697794049978256,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.3911,
"step": 1
},
{
"epoch": 0.006289308176100629,
"grad_norm": 0.06842195242643356,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.3973,
"step": 2
},
{
"epoch": 0.009433962264150943,
"grad_norm": 0.06460106372833252,
"learning_rate": 4e-05,
"loss": 1.4096,
"step": 3
},
{
"epoch": 0.012578616352201259,
"grad_norm": 0.0661928579211235,
"learning_rate": 5.333333333333333e-05,
"loss": 1.4273,
"step": 4
},
{
"epoch": 0.015723270440251572,
"grad_norm": 0.06443281471729279,
"learning_rate": 6.666666666666667e-05,
"loss": 1.4056,
"step": 5
},
{
"epoch": 0.018867924528301886,
"grad_norm": 0.06869488209486008,
"learning_rate": 8e-05,
"loss": 1.3854,
"step": 6
},
{
"epoch": 0.0220125786163522,
"grad_norm": 0.06870010495185852,
"learning_rate": 9.333333333333334e-05,
"loss": 1.4054,
"step": 7
},
{
"epoch": 0.025157232704402517,
"grad_norm": 0.072841115295887,
"learning_rate": 0.00010666666666666667,
"loss": 1.3904,
"step": 8
},
{
"epoch": 0.02830188679245283,
"grad_norm": 0.06708419322967529,
"learning_rate": 0.00012,
"loss": 1.4035,
"step": 9
},
{
"epoch": 0.031446540880503145,
"grad_norm": 0.06746525317430496,
"learning_rate": 0.00013333333333333334,
"loss": 1.3638,
"step": 10
},
{
"epoch": 0.03459119496855346,
"grad_norm": 0.06512407958507538,
"learning_rate": 0.00014666666666666666,
"loss": 1.3506,
"step": 11
},
{
"epoch": 0.03773584905660377,
"grad_norm": 0.05993957445025444,
"learning_rate": 0.00016,
"loss": 1.3772,
"step": 12
},
{
"epoch": 0.040880503144654086,
"grad_norm": 0.059435583651065826,
"learning_rate": 0.00017333333333333334,
"loss": 1.3517,
"step": 13
},
{
"epoch": 0.0440251572327044,
"grad_norm": 0.05579576641321182,
"learning_rate": 0.0001866666666666667,
"loss": 1.4105,
"step": 14
},
{
"epoch": 0.04716981132075472,
"grad_norm": 0.04974433407187462,
"learning_rate": 0.0002,
"loss": 1.3162,
"step": 15
},
{
"epoch": 0.050314465408805034,
"grad_norm": 0.04716808721423149,
"learning_rate": 0.00019999462497359466,
"loss": 1.3283,
"step": 16
},
{
"epoch": 0.05345911949685535,
"grad_norm": 0.04468343406915665,
"learning_rate": 0.0001999785004721968,
"loss": 1.359,
"step": 17
},
{
"epoch": 0.05660377358490566,
"grad_norm": 0.05088884010910988,
"learning_rate": 0.00019995162822919883,
"loss": 1.3484,
"step": 18
},
{
"epoch": 0.059748427672955975,
"grad_norm": 0.05735902860760689,
"learning_rate": 0.00019991401113338104,
"loss": 1.3326,
"step": 19
},
{
"epoch": 0.06289308176100629,
"grad_norm": 0.06404463201761246,
"learning_rate": 0.00019986565322860115,
"loss": 1.3403,
"step": 20
},
{
"epoch": 0.0660377358490566,
"grad_norm": 0.0681706890463829,
"learning_rate": 0.00019980655971335945,
"loss": 1.3228,
"step": 21
},
{
"epoch": 0.06918238993710692,
"grad_norm": 0.07040446251630783,
"learning_rate": 0.00019973673694024,
"loss": 1.3087,
"step": 22
},
{
"epoch": 0.07232704402515723,
"grad_norm": 0.0640912875533104,
"learning_rate": 0.0001996561924152278,
"loss": 1.2798,
"step": 23
},
{
"epoch": 0.07547169811320754,
"grad_norm": 0.05613941699266434,
"learning_rate": 0.0001995649347969019,
"loss": 1.2888,
"step": 24
},
{
"epoch": 0.07861635220125786,
"grad_norm": 0.051170893013477325,
"learning_rate": 0.00019946297389550433,
"loss": 1.2697,
"step": 25
},
{
"epoch": 0.08176100628930817,
"grad_norm": 0.044640567153692245,
"learning_rate": 0.0001993503206718859,
"loss": 1.2898,
"step": 26
},
{
"epoch": 0.08490566037735849,
"grad_norm": 0.040906600654125214,
"learning_rate": 0.00019922698723632767,
"loss": 1.247,
"step": 27
},
{
"epoch": 0.0880503144654088,
"grad_norm": 0.03780093416571617,
"learning_rate": 0.00019909298684723904,
"loss": 1.2751,
"step": 28
},
{
"epoch": 0.09119496855345911,
"grad_norm": 0.03710748627781868,
"learning_rate": 0.00019894833390973266,
"loss": 1.287,
"step": 29
},
{
"epoch": 0.09433962264150944,
"grad_norm": 0.03594716638326645,
"learning_rate": 0.0001987930439740757,
"loss": 1.2385,
"step": 30
},
{
"epoch": 0.09748427672955975,
"grad_norm": 0.03679339960217476,
"learning_rate": 0.0001986271337340182,
"loss": 1.2415,
"step": 31
},
{
"epoch": 0.10062893081761007,
"grad_norm": 0.03725181892514229,
"learning_rate": 0.0001984506210249986,
"loss": 1.2268,
"step": 32
},
{
"epoch": 0.10377358490566038,
"grad_norm": 0.037984397262334824,
"learning_rate": 0.00019826352482222638,
"loss": 1.2402,
"step": 33
},
{
"epoch": 0.1069182389937107,
"grad_norm": 0.037509895861148834,
"learning_rate": 0.0001980658652386421,
"loss": 1.2221,
"step": 34
},
{
"epoch": 0.11006289308176101,
"grad_norm": 0.03687283396720886,
"learning_rate": 0.00019785766352275542,
"loss": 1.2386,
"step": 35
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.03444783389568329,
"learning_rate": 0.00019763894205636072,
"loss": 1.2427,
"step": 36
},
{
"epoch": 0.11635220125786164,
"grad_norm": 0.032733093947172165,
"learning_rate": 0.00019740972435213115,
"loss": 1.2309,
"step": 37
},
{
"epoch": 0.11949685534591195,
"grad_norm": 0.029699521139264107,
"learning_rate": 0.00019717003505109095,
"loss": 1.2479,
"step": 38
},
{
"epoch": 0.12264150943396226,
"grad_norm": 0.02603563852608204,
"learning_rate": 0.00019691989991996663,
"loss": 1.2196,
"step": 39
},
{
"epoch": 0.12578616352201258,
"grad_norm": 0.02650611288845539,
"learning_rate": 0.00019665934584841682,
"loss": 1.2269,
"step": 40
},
{
"epoch": 0.1289308176100629,
"grad_norm": 0.027458857744932175,
"learning_rate": 0.00019638840084614182,
"loss": 1.2625,
"step": 41
},
{
"epoch": 0.1320754716981132,
"grad_norm": 0.027038419619202614,
"learning_rate": 0.00019610709403987246,
"loss": 1.231,
"step": 42
},
{
"epoch": 0.13522012578616352,
"grad_norm": 0.02573474682867527,
"learning_rate": 0.000195815455670239,
"loss": 1.1967,
"step": 43
},
{
"epoch": 0.13836477987421383,
"grad_norm": 0.026413045823574066,
"learning_rate": 0.0001955135170885202,
"loss": 1.1999,
"step": 44
},
{
"epoch": 0.14150943396226415,
"grad_norm": 0.02456706203520298,
"learning_rate": 0.00019520131075327298,
"loss": 1.1724,
"step": 45
},
{
"epoch": 0.14465408805031446,
"grad_norm": 0.02324003167450428,
"learning_rate": 0.00019487887022684336,
"loss": 1.1732,
"step": 46
},
{
"epoch": 0.14779874213836477,
"grad_norm": 0.02492634579539299,
"learning_rate": 0.00019454623017175812,
"loss": 1.1922,
"step": 47
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.026481660082936287,
"learning_rate": 0.0001942034263469989,
"loss": 1.1889,
"step": 48
},
{
"epoch": 0.1540880503144654,
"grad_norm": 0.023594651371240616,
"learning_rate": 0.00019385049560415794,
"loss": 1.1819,
"step": 49
},
{
"epoch": 0.15723270440251572,
"grad_norm": 0.024192512035369873,
"learning_rate": 0.00019348747588347637,
"loss": 1.1691,
"step": 50
},
{
"epoch": 0.16037735849056603,
"grad_norm": 0.023232240229845047,
"learning_rate": 0.00019311440620976597,
"loss": 1.1819,
"step": 51
},
{
"epoch": 0.16352201257861634,
"grad_norm": 0.02279943972826004,
"learning_rate": 0.00019273132668821364,
"loss": 1.2022,
"step": 52
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.06585303694009781,
"learning_rate": 0.00019233827850007027,
"loss": 1.135,
"step": 53
},
{
"epoch": 0.16981132075471697,
"grad_norm": 0.024625560268759727,
"learning_rate": 0.00019193530389822363,
"loss": 1.1774,
"step": 54
},
{
"epoch": 0.17295597484276728,
"grad_norm": 0.024765564128756523,
"learning_rate": 0.0001915224462026563,
"loss": 1.1848,
"step": 55
},
{
"epoch": 0.1761006289308176,
"grad_norm": 0.0233647171407938,
"learning_rate": 0.0001910997497957885,
"loss": 1.1821,
"step": 56
},
{
"epoch": 0.1792452830188679,
"grad_norm": 0.02151089534163475,
"learning_rate": 0.00019066726011770726,
"loss": 1.1458,
"step": 57
},
{
"epoch": 0.18238993710691823,
"grad_norm": 0.022214526310563087,
"learning_rate": 0.00019022502366128135,
"loss": 1.1492,
"step": 58
},
{
"epoch": 0.18553459119496854,
"grad_norm": 0.0223999060690403,
"learning_rate": 0.0001897730879671634,
"loss": 1.1703,
"step": 59
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.023374218493700027,
"learning_rate": 0.00018931150161867916,
"loss": 1.1797,
"step": 60
},
{
"epoch": 0.1918238993710692,
"grad_norm": 0.02511228248476982,
"learning_rate": 0.0001888403142366049,
"loss": 1.2078,
"step": 61
},
{
"epoch": 0.1949685534591195,
"grad_norm": 0.02414465881884098,
"learning_rate": 0.00018835957647383303,
"loss": 1.1902,
"step": 62
},
{
"epoch": 0.19811320754716982,
"grad_norm": 0.02244570665061474,
"learning_rate": 0.00018786934000992688,
"loss": 1.1541,
"step": 63
},
{
"epoch": 0.20125786163522014,
"grad_norm": 0.023515688255429268,
"learning_rate": 0.00018736965754556528,
"loss": 1.1401,
"step": 64
},
{
"epoch": 0.20440251572327045,
"grad_norm": 0.02403687871992588,
"learning_rate": 0.00018686058279687698,
"loss": 1.1526,
"step": 65
},
{
"epoch": 0.20754716981132076,
"grad_norm": 0.022151008248329163,
"learning_rate": 0.00018634217048966637,
"loss": 1.164,
"step": 66
},
{
"epoch": 0.21069182389937108,
"grad_norm": 0.022764768451452255,
"learning_rate": 0.0001858144763535302,
"loss": 1.1572,
"step": 67
},
{
"epoch": 0.2138364779874214,
"grad_norm": 0.024172818288207054,
"learning_rate": 0.00018527755711586678,
"loss": 1.1561,
"step": 68
},
{
"epoch": 0.2169811320754717,
"grad_norm": 0.023120006546378136,
"learning_rate": 0.00018473147049577774,
"loss": 1.1264,
"step": 69
},
{
"epoch": 0.22012578616352202,
"grad_norm": 0.022340824827551842,
"learning_rate": 0.00018417627519786315,
"loss": 1.1471,
"step": 70
},
{
"epoch": 0.22327044025157233,
"grad_norm": 0.02570510096848011,
"learning_rate": 0.00018361203090591071,
"loss": 1.1302,
"step": 71
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.02528996579349041,
"learning_rate": 0.00018303879827647975,
"loss": 1.1347,
"step": 72
},
{
"epoch": 0.22955974842767296,
"grad_norm": 0.02298339456319809,
"learning_rate": 0.00018245663893238075,
"loss": 1.1202,
"step": 73
},
{
"epoch": 0.23270440251572327,
"grad_norm": 0.023198647424578667,
"learning_rate": 0.00018186561545605054,
"loss": 1.1285,
"step": 74
},
{
"epoch": 0.2358490566037736,
"grad_norm": 0.02332969196140766,
"learning_rate": 0.00018126579138282503,
"loss": 1.1382,
"step": 75
},
{
"epoch": 0.2389937106918239,
"grad_norm": 0.023921016603708267,
"learning_rate": 0.00018065723119410884,
"loss": 1.1508,
"step": 76
},
{
"epoch": 0.24213836477987422,
"grad_norm": 0.027694478631019592,
"learning_rate": 0.0001800400003104436,
"loss": 1.1209,
"step": 77
},
{
"epoch": 0.24528301886792453,
"grad_norm": 0.02638174593448639,
"learning_rate": 0.00017941416508447536,
"loss": 1.1551,
"step": 78
},
{
"epoch": 0.24842767295597484,
"grad_norm": 0.024342985823750496,
"learning_rate": 0.00017877979279382135,
"loss": 1.1033,
"step": 79
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.0234859399497509,
"learning_rate": 0.0001781369516338378,
"loss": 1.0988,
"step": 80
},
{
"epoch": 0.25471698113207547,
"grad_norm": 0.025128323584794998,
"learning_rate": 0.000177485710710289,
"loss": 1.1248,
"step": 81
},
{
"epoch": 0.2578616352201258,
"grad_norm": 0.025595176964998245,
"learning_rate": 0.00017682614003191807,
"loss": 1.1429,
"step": 82
},
{
"epoch": 0.2610062893081761,
"grad_norm": 0.02447207272052765,
"learning_rate": 0.0001761583105029213,
"loss": 1.0941,
"step": 83
},
{
"epoch": 0.2641509433962264,
"grad_norm": 0.026511628180742264,
"learning_rate": 0.00017548229391532572,
"loss": 1.1529,
"step": 84
},
{
"epoch": 0.2672955974842767,
"grad_norm": 0.026698730885982513,
"learning_rate": 0.00017479816294127152,
"loss": 1.0938,
"step": 85
},
{
"epoch": 0.27044025157232704,
"grad_norm": 0.028718404471874237,
"learning_rate": 0.0001741059911251997,
"loss": 1.1071,
"step": 86
},
{
"epoch": 0.27358490566037735,
"grad_norm": 0.02812567539513111,
"learning_rate": 0.00017340585287594604,
"loss": 1.1382,
"step": 87
},
{
"epoch": 0.27672955974842767,
"grad_norm": 0.025351839140057564,
"learning_rate": 0.00017269782345874203,
"loss": 1.1061,
"step": 88
},
{
"epoch": 0.279874213836478,
"grad_norm": 0.02537315897643566,
"learning_rate": 0.00017198197898712404,
"loss": 1.0935,
"step": 89
},
{
"epoch": 0.2830188679245283,
"grad_norm": 0.027423014864325523,
"learning_rate": 0.00017125839641475072,
"loss": 1.0954,
"step": 90
},
{
"epoch": 0.2861635220125786,
"grad_norm": 0.027652902528643608,
"learning_rate": 0.00017052715352713075,
"loss": 1.0975,
"step": 91
},
{
"epoch": 0.2893081761006289,
"grad_norm": 0.029060475528240204,
"learning_rate": 0.00016978832893326074,
"loss": 1.1008,
"step": 92
},
{
"epoch": 0.29245283018867924,
"grad_norm": 0.02606775052845478,
"learning_rate": 0.0001690420020571747,
"loss": 1.1125,
"step": 93
},
{
"epoch": 0.29559748427672955,
"grad_norm": 0.025361906737089157,
"learning_rate": 0.00016828825312940592,
"loss": 1.1225,
"step": 94
},
{
"epoch": 0.29874213836477986,
"grad_norm": 0.029504677280783653,
"learning_rate": 0.00016752716317836229,
"loss": 1.1281,
"step": 95
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.027163010090589523,
"learning_rate": 0.00016675881402161536,
"loss": 1.0891,
"step": 96
},
{
"epoch": 0.3050314465408805,
"grad_norm": 0.028238749131560326,
"learning_rate": 0.00016598328825710533,
"loss": 1.0732,
"step": 97
},
{
"epoch": 0.3081761006289308,
"grad_norm": 0.02860194444656372,
"learning_rate": 0.00016520066925426144,
"loss": 1.1109,
"step": 98
},
{
"epoch": 0.3113207547169811,
"grad_norm": 0.027443770319223404,
"learning_rate": 0.0001644110411450398,
"loss": 1.1037,
"step": 99
},
{
"epoch": 0.31446540880503143,
"grad_norm": 0.02937367372214794,
"learning_rate": 0.00016361448881487914,
"loss": 1.1614,
"step": 100
},
{
"epoch": 0.31761006289308175,
"grad_norm": 0.028245460242033005,
"learning_rate": 0.0001628110978935756,
"loss": 1.1193,
"step": 101
},
{
"epoch": 0.32075471698113206,
"grad_norm": 0.03091912530362606,
"learning_rate": 0.00016200095474607753,
"loss": 1.0811,
"step": 102
},
{
"epoch": 0.3238993710691824,
"grad_norm": 0.029428910464048386,
"learning_rate": 0.0001611841464632011,
"loss": 1.0946,
"step": 103
},
{
"epoch": 0.3270440251572327,
"grad_norm": 0.02842988260090351,
"learning_rate": 0.00016036076085226814,
"loss": 1.0921,
"step": 104
},
{
"epoch": 0.330188679245283,
"grad_norm": 0.028155898675322533,
"learning_rate": 0.0001595308864276666,
"loss": 1.0929,
"step": 105
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.03080359846353531,
"learning_rate": 0.0001586946124013354,
"loss": 1.1039,
"step": 106
},
{
"epoch": 0.33647798742138363,
"grad_norm": 0.03177150338888168,
"learning_rate": 0.00015785202867317407,
"loss": 1.0986,
"step": 107
},
{
"epoch": 0.33962264150943394,
"grad_norm": 0.026763366535305977,
"learning_rate": 0.00015700322582137827,
"loss": 1.0686,
"step": 108
},
{
"epoch": 0.34276729559748426,
"grad_norm": 0.027751443907618523,
"learning_rate": 0.0001561482950927029,
"loss": 1.1177,
"step": 109
},
{
"epoch": 0.34591194968553457,
"grad_norm": 0.031205786392092705,
"learning_rate": 0.00015528732839265272,
"loss": 1.1045,
"step": 110
},
{
"epoch": 0.3490566037735849,
"grad_norm": 0.029671067371964455,
"learning_rate": 0.00015442041827560274,
"loss": 1.0815,
"step": 111
},
{
"epoch": 0.3522012578616352,
"grad_norm": 0.03158772736787796,
"learning_rate": 0.00015354765793484834,
"loss": 1.0811,
"step": 112
},
{
"epoch": 0.3553459119496855,
"grad_norm": 0.03245990723371506,
"learning_rate": 0.000152669141192587,
"loss": 1.0923,
"step": 113
},
{
"epoch": 0.3584905660377358,
"grad_norm": 0.030182786285877228,
"learning_rate": 0.00015178496248983254,
"loss": 1.0831,
"step": 114
},
{
"epoch": 0.36163522012578614,
"grad_norm": 0.03249813988804817,
"learning_rate": 0.00015089521687626243,
"loss": 1.0955,
"step": 115
},
{
"epoch": 0.36477987421383645,
"grad_norm": 0.029901932924985886,
"learning_rate": 0.00015000000000000001,
"loss": 1.0567,
"step": 116
},
{
"epoch": 0.36792452830188677,
"grad_norm": 0.0314863882958889,
"learning_rate": 0.00014909940809733222,
"loss": 1.0759,
"step": 117
},
{
"epoch": 0.3710691823899371,
"grad_norm": 0.03128151595592499,
"learning_rate": 0.00014819353798236427,
"loss": 1.0878,
"step": 118
},
{
"epoch": 0.3742138364779874,
"grad_norm": 0.03267417103052139,
"learning_rate": 0.00014728248703661182,
"loss": 1.094,
"step": 119
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.031713493168354034,
"learning_rate": 0.00014636635319853275,
"loss": 1.0845,
"step": 120
},
{
"epoch": 0.3805031446540881,
"grad_norm": 0.029322847723960876,
"learning_rate": 0.00014544523495299842,
"loss": 1.0683,
"step": 121
},
{
"epoch": 0.3836477987421384,
"grad_norm": 0.03310471028089523,
"learning_rate": 0.0001445192313207067,
"loss": 1.0798,
"step": 122
},
{
"epoch": 0.3867924528301887,
"grad_norm": 0.03195233270525932,
"learning_rate": 0.00014358844184753712,
"loss": 1.0697,
"step": 123
},
{
"epoch": 0.389937106918239,
"grad_norm": 0.034240156412124634,
"learning_rate": 0.00014265296659384956,
"loss": 1.0885,
"step": 124
},
{
"epoch": 0.39308176100628933,
"grad_norm": 0.03241978958249092,
"learning_rate": 0.0001417129061237278,
"loss": 1.0647,
"step": 125
},
{
"epoch": 0.39622641509433965,
"grad_norm": 0.03068430908024311,
"learning_rate": 0.00014076836149416887,
"loss": 1.0647,
"step": 126
},
{
"epoch": 0.39937106918238996,
"grad_norm": 0.03438032045960426,
"learning_rate": 0.00013981943424421932,
"loss": 1.0939,
"step": 127
},
{
"epoch": 0.4025157232704403,
"grad_norm": 0.031215351074934006,
"learning_rate": 0.00013886622638405952,
"loss": 1.0694,
"step": 128
},
{
"epoch": 0.4056603773584906,
"grad_norm": 0.035429947078228,
"learning_rate": 0.00013790884038403795,
"loss": 1.1149,
"step": 129
},
{
"epoch": 0.4088050314465409,
"grad_norm": 0.03237266466021538,
"learning_rate": 0.00013694737916365517,
"loss": 1.0778,
"step": 130
},
{
"epoch": 0.4119496855345912,
"grad_norm": 0.034300774335861206,
"learning_rate": 0.0001359819460805001,
"loss": 1.0872,
"step": 131
},
{
"epoch": 0.41509433962264153,
"grad_norm": 0.03338664770126343,
"learning_rate": 0.00013501264491913906,
"loss": 1.0809,
"step": 132
},
{
"epoch": 0.41823899371069184,
"grad_norm": 0.03169442340731621,
"learning_rate": 0.00013403957987995882,
"loss": 1.0436,
"step": 133
},
{
"epoch": 0.42138364779874216,
"grad_norm": 0.03536612167954445,
"learning_rate": 0.00013306285556796495,
"loss": 1.0643,
"step": 134
},
{
"epoch": 0.42452830188679247,
"grad_norm": 0.03312570974230766,
"learning_rate": 0.00013208257698153677,
"loss": 1.0879,
"step": 135
},
{
"epoch": 0.4276729559748428,
"grad_norm": 0.035323478281497955,
"learning_rate": 0.00013109884950114007,
"loss": 1.0946,
"step": 136
},
{
"epoch": 0.4308176100628931,
"grad_norm": 0.03307751566171646,
"learning_rate": 0.00013011177887799845,
"loss": 1.0574,
"step": 137
},
{
"epoch": 0.4339622641509434,
"grad_norm": 0.03208519518375397,
"learning_rate": 0.00012912147122272523,
"loss": 1.0563,
"step": 138
},
{
"epoch": 0.4371069182389937,
"grad_norm": 0.03215700760483742,
"learning_rate": 0.00012812803299391628,
"loss": 1.0615,
"step": 139
},
{
"epoch": 0.44025157232704404,
"grad_norm": 0.03540361300110817,
"learning_rate": 0.0001271315709867059,
"loss": 1.0903,
"step": 140
},
{
"epoch": 0.44339622641509435,
"grad_norm": 0.03418035805225372,
"learning_rate": 0.00012613219232128608,
"loss": 1.0589,
"step": 141
},
{
"epoch": 0.44654088050314467,
"grad_norm": 0.032720983028411865,
"learning_rate": 0.00012513000443139112,
"loss": 1.0394,
"step": 142
},
{
"epoch": 0.449685534591195,
"grad_norm": 0.03251456469297409,
"learning_rate": 0.00012412511505274844,
"loss": 1.0459,
"step": 143
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.03547577187418938,
"learning_rate": 0.000123117632211497,
"loss": 1.0919,
"step": 144
},
{
"epoch": 0.4559748427672956,
"grad_norm": 0.03621995821595192,
"learning_rate": 0.0001221076642125742,
"loss": 1.0428,
"step": 145
},
{
"epoch": 0.4591194968553459,
"grad_norm": 0.03383413329720497,
"learning_rate": 0.00012109531962807332,
"loss": 1.0704,
"step": 146
},
{
"epoch": 0.46226415094339623,
"grad_norm": 0.031702034175395966,
"learning_rate": 0.00012008070728557186,
"loss": 1.0328,
"step": 147
},
{
"epoch": 0.46540880503144655,
"grad_norm": 0.039653629064559937,
"learning_rate": 0.00011906393625643244,
"loss": 1.0568,
"step": 148
},
{
"epoch": 0.46855345911949686,
"grad_norm": 0.037315912544727325,
"learning_rate": 0.00011804511584407763,
"loss": 1.0668,
"step": 149
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.03531115874648094,
"learning_rate": 0.00011702435557223987,
"loss": 1.0827,
"step": 150
},
{
"epoch": 0.4748427672955975,
"grad_norm": 0.03649010509252548,
"learning_rate": 0.00011600176517318741,
"loss": 1.0796,
"step": 151
},
{
"epoch": 0.4779874213836478,
"grad_norm": 0.04164504259824753,
"learning_rate": 0.00011497745457592816,
"loss": 1.0314,
"step": 152
},
{
"epoch": 0.4811320754716981,
"grad_norm": 0.037900954484939575,
"learning_rate": 0.00011395153389439233,
"loss": 1.0668,
"step": 153
},
{
"epoch": 0.48427672955974843,
"grad_norm": 0.034743502736091614,
"learning_rate": 0.0001129241134155949,
"loss": 1.0575,
"step": 154
},
{
"epoch": 0.48742138364779874,
"grad_norm": 0.05526720732450485,
"learning_rate": 0.00011189530358778005,
"loss": 1.0537,
"step": 155
},
{
"epoch": 0.49056603773584906,
"grad_norm": 0.03674091398715973,
"learning_rate": 0.00011086521500854745,
"loss": 1.0612,
"step": 156
},
{
"epoch": 0.4937106918238994,
"grad_norm": 0.03560490161180496,
"learning_rate": 0.00010983395841296348,
"loss": 1.0461,
"step": 157
},
{
"epoch": 0.4968553459119497,
"grad_norm": 0.03683093190193176,
"learning_rate": 0.00010880164466165674,
"loss": 1.0489,
"step": 158
},
{
"epoch": 0.5,
"grad_norm": 0.034947801381349564,
"learning_rate": 0.00010776838472890065,
"loss": 1.0908,
"step": 159
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.034155167639255524,
"learning_rate": 0.00010673428969068364,
"loss": 1.0883,
"step": 160
},
{
"epoch": 0.5062893081761006,
"grad_norm": 0.03542330116033554,
"learning_rate": 0.00010569947071276847,
"loss": 1.0629,
"step": 161
},
{
"epoch": 0.5094339622641509,
"grad_norm": 0.0372898206114769,
"learning_rate": 0.00010466403903874176,
"loss": 1.0515,
"step": 162
},
{
"epoch": 0.5125786163522013,
"grad_norm": 0.03636344522237778,
"learning_rate": 0.00010362810597805526,
"loss": 1.0905,
"step": 163
},
{
"epoch": 0.5157232704402516,
"grad_norm": 0.035335466265678406,
"learning_rate": 0.00010259178289406011,
"loss": 1.0698,
"step": 164
},
{
"epoch": 0.5188679245283019,
"grad_norm": 0.036180030554533005,
"learning_rate": 0.0001015551811920351,
"loss": 1.0487,
"step": 165
},
{
"epoch": 0.5220125786163522,
"grad_norm": 0.03546663746237755,
"learning_rate": 0.00010051841230721065,
"loss": 1.0336,
"step": 166
},
{
"epoch": 0.5251572327044025,
"grad_norm": 0.03683155030012131,
"learning_rate": 9.948158769278939e-05,
"loss": 1.0628,
"step": 167
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.03633348271250725,
"learning_rate": 9.844481880796491e-05,
"loss": 1.0646,
"step": 168
},
{
"epoch": 0.5314465408805031,
"grad_norm": 0.03651515021920204,
"learning_rate": 9.740821710593989e-05,
"loss": 1.0584,
"step": 169
},
{
"epoch": 0.5345911949685535,
"grad_norm": 0.03433886170387268,
"learning_rate": 9.637189402194476e-05,
"loss": 1.0537,
"step": 170
},
{
"epoch": 0.5377358490566038,
"grad_norm": 0.046192716807127,
"learning_rate": 9.533596096125825e-05,
"loss": 1.0409,
"step": 171
},
{
"epoch": 0.5408805031446541,
"grad_norm": 0.03568156436085701,
"learning_rate": 9.430052928723153e-05,
"loss": 1.0278,
"step": 172
},
{
"epoch": 0.5440251572327044,
"grad_norm": 0.040810681879520416,
"learning_rate": 9.326571030931637e-05,
"loss": 1.0405,
"step": 173
},
{
"epoch": 0.5471698113207547,
"grad_norm": 0.03588728979229927,
"learning_rate": 9.223161527109937e-05,
"loss": 1.065,
"step": 174
},
{
"epoch": 0.550314465408805,
"grad_norm": 0.03548993915319443,
"learning_rate": 9.119835533834331e-05,
"loss": 1.0065,
"step": 175
},
{
"epoch": 0.5534591194968553,
"grad_norm": 0.04264102876186371,
"learning_rate": 9.016604158703654e-05,
"loss": 1.0668,
"step": 176
},
{
"epoch": 0.5566037735849056,
"grad_norm": 0.03986184671521187,
"learning_rate": 8.913478499145254e-05,
"loss": 1.0512,
"step": 177
},
{
"epoch": 0.559748427672956,
"grad_norm": 0.03871089220046997,
"learning_rate": 8.810469641222001e-05,
"loss": 1.0413,
"step": 178
},
{
"epoch": 0.5628930817610063,
"grad_norm": 0.03574568033218384,
"learning_rate": 8.707588658440511e-05,
"loss": 1.0293,
"step": 179
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.037175796926021576,
"learning_rate": 8.604846610560771e-05,
"loss": 1.0246,
"step": 180
},
{
"epoch": 0.5691823899371069,
"grad_norm": 0.04187128692865372,
"learning_rate": 8.502254542407186e-05,
"loss": 1.023,
"step": 181
},
{
"epoch": 0.5723270440251572,
"grad_norm": 0.04172036051750183,
"learning_rate": 8.399823482681262e-05,
"loss": 1.0455,
"step": 182
},
{
"epoch": 0.5754716981132075,
"grad_norm": 0.03626122325658798,
"learning_rate": 8.297564442776014e-05,
"loss": 1.0457,
"step": 183
},
{
"epoch": 0.5786163522012578,
"grad_norm": 0.03596337512135506,
"learning_rate": 8.195488415592238e-05,
"loss": 1.0521,
"step": 184
},
{
"epoch": 0.5817610062893082,
"grad_norm": 0.03914599120616913,
"learning_rate": 8.093606374356759e-05,
"loss": 1.0645,
"step": 185
},
{
"epoch": 0.5849056603773585,
"grad_norm": 0.044063687324523926,
"learning_rate": 7.991929271442817e-05,
"loss": 1.0677,
"step": 186
},
{
"epoch": 0.5880503144654088,
"grad_norm": 0.04163552075624466,
"learning_rate": 7.89046803719267e-05,
"loss": 1.0568,
"step": 187
},
{
"epoch": 0.5911949685534591,
"grad_norm": 0.036366574466228485,
"learning_rate": 7.789233578742582e-05,
"loss": 1.0038,
"step": 188
},
{
"epoch": 0.5943396226415094,
"grad_norm": 0.04061400517821312,
"learning_rate": 7.688236778850306e-05,
"loss": 1.0462,
"step": 189
},
{
"epoch": 0.5974842767295597,
"grad_norm": 0.03604275360703468,
"learning_rate": 7.587488494725157e-05,
"loss": 1.0275,
"step": 190
},
{
"epoch": 0.60062893081761,
"grad_norm": 0.03972569853067398,
"learning_rate": 7.48699955686089e-05,
"loss": 1.0402,
"step": 191
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.04172028228640556,
"learning_rate": 7.386780767871397e-05,
"loss": 1.0416,
"step": 192
},
{
"epoch": 0.6069182389937107,
"grad_norm": 0.03570333123207092,
"learning_rate": 7.286842901329412e-05,
"loss": 1.0459,
"step": 193
},
{
"epoch": 0.610062893081761,
"grad_norm": 0.037412162870168686,
"learning_rate": 7.187196700608373e-05,
"loss": 1.0556,
"step": 194
},
{
"epoch": 0.6132075471698113,
"grad_norm": 0.038102056831121445,
"learning_rate": 7.087852877727481e-05,
"loss": 1.0301,
"step": 195
},
{
"epoch": 0.6163522012578616,
"grad_norm": 0.037487804889678955,
"learning_rate": 6.988822112200156e-05,
"loss": 1.0494,
"step": 196
},
{
"epoch": 0.6194968553459119,
"grad_norm": 0.03777475655078888,
"learning_rate": 6.890115049885994e-05,
"loss": 0.9972,
"step": 197
},
{
"epoch": 0.6226415094339622,
"grad_norm": 0.04026506096124649,
"learning_rate": 6.791742301846326e-05,
"loss": 1.0068,
"step": 198
},
{
"epoch": 0.6257861635220126,
"grad_norm": 0.03857170045375824,
"learning_rate": 6.693714443203507e-05,
"loss": 1.0468,
"step": 199
},
{
"epoch": 0.6289308176100629,
"grad_norm": 0.038687944412231445,
"learning_rate": 6.59604201200412e-05,
"loss": 1.021,
"step": 200
},
{
"epoch": 0.6320754716981132,
"grad_norm": 0.03843434900045395,
"learning_rate": 6.498735508086093e-05,
"loss": 1.0443,
"step": 201
},
{
"epoch": 0.6352201257861635,
"grad_norm": 0.03765735775232315,
"learning_rate": 6.40180539194999e-05,
"loss": 1.0068,
"step": 202
},
{
"epoch": 0.6383647798742138,
"grad_norm": 0.038186896592378616,
"learning_rate": 6.305262083634488e-05,
"loss": 1.0368,
"step": 203
},
{
"epoch": 0.6415094339622641,
"grad_norm": 0.03744081035256386,
"learning_rate": 6.209115961596208e-05,
"loss": 1.0035,
"step": 204
},
{
"epoch": 0.6446540880503144,
"grad_norm": 0.03738857060670853,
"learning_rate": 6.113377361594049e-05,
"loss": 1.0343,
"step": 205
},
{
"epoch": 0.6477987421383647,
"grad_norm": 0.03938114643096924,
"learning_rate": 6.018056575578075e-05,
"loss": 1.041,
"step": 206
},
{
"epoch": 0.6509433962264151,
"grad_norm": 0.0429544560611248,
"learning_rate": 5.923163850583113e-05,
"loss": 1.0455,
"step": 207
},
{
"epoch": 0.6540880503144654,
"grad_norm": 0.03791610524058342,
"learning_rate": 5.828709387627218e-05,
"loss": 1.0284,
"step": 208
},
{
"epoch": 0.6572327044025157,
"grad_norm": 0.038352545350790024,
"learning_rate": 5.73470334061505e-05,
"loss": 1.0279,
"step": 209
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.03907958045601845,
"learning_rate": 5.6411558152462894e-05,
"loss": 1.0711,
"step": 210
},
{
"epoch": 0.6635220125786163,
"grad_norm": 0.03748472407460213,
"learning_rate": 5.54807686792933e-05,
"loss": 1.0187,
"step": 211
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.13872897624969482,
"learning_rate": 5.4554765047001613e-05,
"loss": 1.0482,
"step": 212
},
{
"epoch": 0.6698113207547169,
"grad_norm": 0.04007211700081825,
"learning_rate": 5.363364680146725e-05,
"loss": 1.0525,
"step": 213
},
{
"epoch": 0.6729559748427673,
"grad_norm": 0.038152776658535004,
"learning_rate": 5.271751296338823e-05,
"loss": 1.0222,
"step": 214
},
{
"epoch": 0.6761006289308176,
"grad_norm": 0.03928610309958458,
"learning_rate": 5.180646201763577e-05,
"loss": 1.06,
"step": 215
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.03823390603065491,
"learning_rate": 5.090059190266779e-05,
"loss": 1.006,
"step": 216
},
{
"epoch": 0.6823899371069182,
"grad_norm": 0.03753795474767685,
"learning_rate": 5.000000000000002e-05,
"loss": 1.0471,
"step": 217
},
{
"epoch": 0.6855345911949685,
"grad_norm": 0.03927240148186684,
"learning_rate": 4.9104783123737566e-05,
"loss": 1.0211,
"step": 218
},
{
"epoch": 0.6886792452830188,
"grad_norm": 0.038637347519397736,
"learning_rate": 4.821503751016746e-05,
"loss": 1.0393,
"step": 219
},
{
"epoch": 0.6918238993710691,
"grad_norm": 0.04003263637423515,
"learning_rate": 4.733085880741301e-05,
"loss": 1.0387,
"step": 220
},
{
"epoch": 0.6949685534591195,
"grad_norm": 0.037788983434438705,
"learning_rate": 4.645234206515171e-05,
"loss": 1.0395,
"step": 221
},
{
"epoch": 0.6981132075471698,
"grad_norm": 0.037437207996845245,
"learning_rate": 4.5579581724397255e-05,
"loss": 1.002,
"step": 222
},
{
"epoch": 0.7012578616352201,
"grad_norm": 0.03973449021577835,
"learning_rate": 4.471267160734731e-05,
"loss": 1.0101,
"step": 223
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.04157485440373421,
"learning_rate": 4.385170490729712e-05,
"loss": 1.0547,
"step": 224
},
{
"epoch": 0.7075471698113207,
"grad_norm": 0.03971412032842636,
"learning_rate": 4.2996774178621736e-05,
"loss": 1.0327,
"step": 225
},
{
"epoch": 0.710691823899371,
"grad_norm": 0.042363688349723816,
"learning_rate": 4.2147971326825966e-05,
"loss": 1.0115,
"step": 226
},
{
"epoch": 0.7138364779874213,
"grad_norm": 0.03927742689847946,
"learning_rate": 4.130538759866457e-05,
"loss": 1.037,
"step": 227
},
{
"epoch": 0.7169811320754716,
"grad_norm": 0.04383242875337601,
"learning_rate": 4.046911357233343e-05,
"loss": 1.0336,
"step": 228
},
{
"epoch": 0.720125786163522,
"grad_norm": 0.041160885244607925,
"learning_rate": 3.963923914773187e-05,
"loss": 1.0453,
"step": 229
},
{
"epoch": 0.7232704402515723,
"grad_norm": 0.038153354078531265,
"learning_rate": 3.8815853536798904e-05,
"loss": 1.0438,
"step": 230
},
{
"epoch": 0.7264150943396226,
"grad_norm": 0.039117470383644104,
"learning_rate": 3.79990452539225e-05,
"loss": 1.0131,
"step": 231
},
{
"epoch": 0.7295597484276729,
"grad_norm": 0.037614606320858,
"learning_rate": 3.7188902106424416e-05,
"loss": 1.0308,
"step": 232
},
{
"epoch": 0.7327044025157232,
"grad_norm": 0.03742281720042229,
"learning_rate": 3.638551118512089e-05,
"loss": 1.0343,
"step": 233
},
{
"epoch": 0.7358490566037735,
"grad_norm": 0.040659379214048386,
"learning_rate": 3.558895885496023e-05,
"loss": 1.0206,
"step": 234
},
{
"epoch": 0.7389937106918238,
"grad_norm": 0.039581410586833954,
"learning_rate": 3.479933074573858e-05,
"loss": 1.0209,
"step": 235
},
{
"epoch": 0.7421383647798742,
"grad_norm": 0.03877450153231621,
"learning_rate": 3.401671174289469e-05,
"loss": 1.0242,
"step": 236
},
{
"epoch": 0.7452830188679245,
"grad_norm": 0.03689349815249443,
"learning_rate": 3.324118597838464e-05,
"loss": 1.0064,
"step": 237
},
{
"epoch": 0.7484276729559748,
"grad_norm": 0.039353396743535995,
"learning_rate": 3.2472836821637744e-05,
"loss": 1.0392,
"step": 238
},
{
"epoch": 0.7515723270440252,
"grad_norm": 0.04024632275104523,
"learning_rate": 3.1711746870594086e-05,
"loss": 1.0398,
"step": 239
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.0384189747273922,
"learning_rate": 3.0957997942825336e-05,
"loss": 1.0508,
"step": 240
},
{
"epoch": 0.7578616352201258,
"grad_norm": 0.038072239607572556,
"learning_rate": 3.021167106673928e-05,
"loss": 1.0274,
"step": 241
},
{
"epoch": 0.7610062893081762,
"grad_norm": 0.03652197867631912,
"learning_rate": 2.9472846472869298e-05,
"loss": 1.0091,
"step": 242
},
{
"epoch": 0.7641509433962265,
"grad_norm": 0.04008382558822632,
"learning_rate": 2.874160358524931e-05,
"loss": 1.0118,
"step": 243
},
{
"epoch": 0.7672955974842768,
"grad_norm": 0.038193073123693466,
"learning_rate": 2.8018021012875994e-05,
"loss": 1.0492,
"step": 244
},
{
"epoch": 0.7704402515723271,
"grad_norm": 0.04008280113339424,
"learning_rate": 2.7302176541257986e-05,
"loss": 1.0087,
"step": 245
},
{
"epoch": 0.7735849056603774,
"grad_norm": 0.040726155042648315,
"learning_rate": 2.659414712405398e-05,
"loss": 1.0427,
"step": 246
},
{
"epoch": 0.7767295597484277,
"grad_norm": 0.03964506462216377,
"learning_rate": 2.5894008874800325e-05,
"loss": 1.0377,
"step": 247
},
{
"epoch": 0.779874213836478,
"grad_norm": 0.03894224017858505,
"learning_rate": 2.5201837058728505e-05,
"loss": 1.0362,
"step": 248
},
{
"epoch": 0.7830188679245284,
"grad_norm": 0.038798924535512924,
"learning_rate": 2.451770608467432e-05,
"loss": 1.0383,
"step": 249
},
{
"epoch": 0.7861635220125787,
"grad_norm": 0.03763001784682274,
"learning_rate": 2.3841689497078746e-05,
"loss": 1.0488,
"step": 250
},
{
"epoch": 0.789308176100629,
"grad_norm": 0.04090484231710434,
"learning_rate": 2.3173859968081944e-05,
"loss": 1.0297,
"step": 251
},
{
"epoch": 0.7924528301886793,
"grad_norm": 0.039545051753520966,
"learning_rate": 2.251428928971102e-05,
"loss": 1.0396,
"step": 252
},
{
"epoch": 0.7955974842767296,
"grad_norm": 0.037017423659563065,
"learning_rate": 2.1863048366162208e-05,
"loss": 1.0178,
"step": 253
},
{
"epoch": 0.7987421383647799,
"grad_norm": 0.03963112458586693,
"learning_rate": 2.1220207206178688e-05,
"loss": 1.025,
"step": 254
},
{
"epoch": 0.8018867924528302,
"grad_norm": 0.03978583589196205,
"learning_rate": 2.058583491552465e-05,
"loss": 1.0226,
"step": 255
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.03923904895782471,
"learning_rate": 1.995999968955641e-05,
"loss": 1.0291,
"step": 256
},
{
"epoch": 0.8081761006289309,
"grad_norm": 0.03717755898833275,
"learning_rate": 1.9342768805891178e-05,
"loss": 1.0262,
"step": 257
},
{
"epoch": 0.8113207547169812,
"grad_norm": 0.03690655902028084,
"learning_rate": 1.8734208617174988e-05,
"loss": 1.0263,
"step": 258
},
{
"epoch": 0.8144654088050315,
"grad_norm": 0.038003891706466675,
"learning_rate": 1.8134384543949478e-05,
"loss": 1.0279,
"step": 259
},
{
"epoch": 0.8176100628930818,
"grad_norm": 0.037383392453193665,
"learning_rate": 1.754336106761927e-05,
"loss": 1.0184,
"step": 260
},
{
"epoch": 0.8207547169811321,
"grad_norm": 0.038551997393369675,
"learning_rate": 1.696120172352025e-05,
"loss": 1.055,
"step": 261
},
{
"epoch": 0.8238993710691824,
"grad_norm": 0.03848763927817345,
"learning_rate": 1.6387969094089316e-05,
"loss": 1.0276,
"step": 262
},
{
"epoch": 0.8270440251572327,
"grad_norm": 0.03697813302278519,
"learning_rate": 1.5823724802136865e-05,
"loss": 1.0107,
"step": 263
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.039934322237968445,
"learning_rate": 1.526852950422226e-05,
"loss": 1.0184,
"step": 264
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.04363315552473068,
"learning_rate": 1.4722442884133214e-05,
"loss": 0.9912,
"step": 265
},
{
"epoch": 0.8364779874213837,
"grad_norm": 0.04497281834483147,
"learning_rate": 1.4185523646469822e-05,
"loss": 1.0578,
"step": 266
},
{
"epoch": 0.839622641509434,
"grad_norm": 0.03638835996389389,
"learning_rate": 1.3657829510333654e-05,
"loss": 1.0259,
"step": 267
},
{
"epoch": 0.8427672955974843,
"grad_norm": 0.0390971377491951,
"learning_rate": 1.3139417203123027e-05,
"loss": 1.0188,
"step": 268
},
{
"epoch": 0.8459119496855346,
"grad_norm": 0.036897242069244385,
"learning_rate": 1.263034245443473e-05,
"loss": 1.0333,
"step": 269
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.037717305123806,
"learning_rate": 1.2130659990073146e-05,
"loss": 1.0319,
"step": 270
},
{
"epoch": 0.8522012578616353,
"grad_norm": 0.038260139524936676,
"learning_rate": 1.1640423526166988e-05,
"loss": 1.0063,
"step": 271
},
{
"epoch": 0.8553459119496856,
"grad_norm": 0.04040497913956642,
"learning_rate": 1.1159685763395111e-05,
"loss": 1.01,
"step": 272
},
{
"epoch": 0.8584905660377359,
"grad_norm": 0.036462146788835526,
"learning_rate": 1.0688498381320855e-05,
"loss": 1.0137,
"step": 273
},
{
"epoch": 0.8616352201257862,
"grad_norm": 0.03783508017659187,
"learning_rate": 1.0226912032836611e-05,
"loss": 1.01,
"step": 274
},
{
"epoch": 0.8647798742138365,
"grad_norm": 0.036553751677274704,
"learning_rate": 9.774976338718677e-06,
"loss": 1.035,
"step": 275
},
{
"epoch": 0.8679245283018868,
"grad_norm": 0.038083869963884354,
"learning_rate": 9.332739882292752e-06,
"loss": 1.0514,
"step": 276
},
{
"epoch": 0.8710691823899371,
"grad_norm": 0.036774635314941406,
"learning_rate": 8.900250204211514e-06,
"loss": 1.0211,
"step": 277
},
{
"epoch": 0.8742138364779874,
"grad_norm": 0.038534294813871384,
"learning_rate": 8.47755379734373e-06,
"loss": 1.011,
"step": 278
},
{
"epoch": 0.8773584905660378,
"grad_norm": 0.036409780383110046,
"learning_rate": 8.064696101776358e-06,
"loss": 1.0247,
"step": 279
},
{
"epoch": 0.8805031446540881,
"grad_norm": 0.04032037407159805,
"learning_rate": 7.661721499929753e-06,
"loss": 1.0205,
"step": 280
},
{
"epoch": 0.8836477987421384,
"grad_norm": 0.03601597249507904,
"learning_rate": 7.2686733117863784e-06,
"loss": 1.0278,
"step": 281
},
{
"epoch": 0.8867924528301887,
"grad_norm": 0.03768506646156311,
"learning_rate": 6.8855937902340576e-06,
"loss": 1.0256,
"step": 282
},
{
"epoch": 0.889937106918239,
"grad_norm": 0.0377877801656723,
"learning_rate": 6.512524116523633e-06,
"loss": 1.0238,
"step": 283
},
{
"epoch": 0.8930817610062893,
"grad_norm": 0.038199830800294876,
"learning_rate": 6.149504395842087e-06,
"loss": 1.0335,
"step": 284
},
{
"epoch": 0.8962264150943396,
"grad_norm": 0.03672681748867035,
"learning_rate": 5.7965736530010916e-06,
"loss": 1.0089,
"step": 285
},
{
"epoch": 0.89937106918239,
"grad_norm": 0.03878109157085419,
"learning_rate": 5.453769828241872e-06,
"loss": 1.0007,
"step": 286
},
{
"epoch": 0.9025157232704403,
"grad_norm": 0.03794073313474655,
"learning_rate": 5.121129773156663e-06,
"loss": 1.0507,
"step": 287
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.03933648765087128,
"learning_rate": 4.798689246727006e-06,
"loss": 1.0266,
"step": 288
},
{
"epoch": 0.9088050314465409,
"grad_norm": 0.03670027107000351,
"learning_rate": 4.486482911479839e-06,
"loss": 1.0367,
"step": 289
},
{
"epoch": 0.9119496855345912,
"grad_norm": 0.037638451904058456,
"learning_rate": 4.184544329761009e-06,
"loss": 1.0401,
"step": 290
},
{
"epoch": 0.9150943396226415,
"grad_norm": 0.03804009407758713,
"learning_rate": 3.892905960127546e-06,
"loss": 0.9959,
"step": 291
},
{
"epoch": 0.9182389937106918,
"grad_norm": 0.04068181291222572,
"learning_rate": 3.611599153858214e-06,
"loss": 1.0631,
"step": 292
},
{
"epoch": 0.9213836477987422,
"grad_norm": 0.036831971257925034,
"learning_rate": 3.3406541515832003e-06,
"loss": 1.0072,
"step": 293
},
{
"epoch": 0.9245283018867925,
"grad_norm": 0.03732535243034363,
"learning_rate": 3.0801000800333877e-06,
"loss": 1.0091,
"step": 294
},
{
"epoch": 0.9276729559748428,
"grad_norm": 0.03764468804001808,
"learning_rate": 2.8299649489090475e-06,
"loss": 1.03,
"step": 295
},
{
"epoch": 0.9308176100628931,
"grad_norm": 0.03870733082294464,
"learning_rate": 2.590275647868867e-06,
"loss": 1.0281,
"step": 296
},
{
"epoch": 0.9339622641509434,
"grad_norm": 0.03789420798420906,
"learning_rate": 2.3610579436393e-06,
"loss": 1.0479,
"step": 297
},
{
"epoch": 0.9371069182389937,
"grad_norm": 0.039737775921821594,
"learning_rate": 2.1423364772445887e-06,
"loss": 1.0584,
"step": 298
},
{
"epoch": 0.940251572327044,
"grad_norm": 0.03715479001402855,
"learning_rate": 1.9341347613579087e-06,
"loss": 1.0168,
"step": 299
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.03719014674425125,
"learning_rate": 1.7364751777736332e-06,
"loss": 0.9956,
"step": 300
},
{
"epoch": 0.9465408805031447,
"grad_norm": 0.03842631354928017,
"learning_rate": 1.5493789750014031e-06,
"loss": 1.0437,
"step": 301
},
{
"epoch": 0.949685534591195,
"grad_norm": 0.038147032260894775,
"learning_rate": 1.3728662659818204e-06,
"loss": 1.033,
"step": 302
},
{
"epoch": 0.9528301886792453,
"grad_norm": 0.038387030363082886,
"learning_rate": 1.2069560259243328e-06,
"loss": 1.0002,
"step": 303
},
{
"epoch": 0.9559748427672956,
"grad_norm": 0.03752262517809868,
"learning_rate": 1.0516660902673448e-06,
"loss": 1.0258,
"step": 304
},
{
"epoch": 0.9591194968553459,
"grad_norm": 0.03641341254115105,
"learning_rate": 9.070131527609604e-07,
"loss": 1.0259,
"step": 305
},
{
"epoch": 0.9622641509433962,
"grad_norm": 0.03814227133989334,
"learning_rate": 7.730127636723539e-07,
"loss": 0.9939,
"step": 306
},
{
"epoch": 0.9654088050314465,
"grad_norm": 0.03757226839661598,
"learning_rate": 6.496793281141056e-07,
"loss": 0.988,
"step": 307
},
{
"epoch": 0.9685534591194969,
"grad_norm": 0.03574439138174057,
"learning_rate": 5.370261044956971e-07,
"loss": 1.0257,
"step": 308
},
{
"epoch": 0.9716981132075472,
"grad_norm": 0.03881550952792168,
"learning_rate": 4.3506520309813947e-07,
"loss": 1.0399,
"step": 309
},
{
"epoch": 0.9748427672955975,
"grad_norm": 0.03827887400984764,
"learning_rate": 3.4380758477219333e-07,
"loss": 1.0163,
"step": 310
},
{
"epoch": 0.9779874213836478,
"grad_norm": 0.03611140325665474,
"learning_rate": 2.6326305976001055e-07,
"loss": 1.0014,
"step": 311
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.037388019263744354,
"learning_rate": 1.9344028664056713e-07,
"loss": 1.0223,
"step": 312
},
{
"epoch": 0.9842767295597484,
"grad_norm": 0.041428446769714355,
"learning_rate": 1.3434677139885222e-07,
"loss": 1.0327,
"step": 313
},
{
"epoch": 0.9874213836477987,
"grad_norm": 0.036151085048913956,
"learning_rate": 8.598886661895788e-08,
"loss": 1.0306,
"step": 314
},
{
"epoch": 0.9905660377358491,
"grad_norm": 0.03780834376811981,
"learning_rate": 4.837177080119215e-08,
"loss": 1.0196,
"step": 315
},
{
"epoch": 0.9937106918238994,
"grad_norm": 0.039815668016672134,
"learning_rate": 2.1499527803214846e-08,
"loss": 1.045,
"step": 316
},
{
"epoch": 0.9968553459119497,
"grad_norm": 0.03660481423139572,
"learning_rate": 5.375026405352035e-09,
"loss": 1.016,
"step": 317
},
{
"epoch": 1.0,
"grad_norm": 0.03775152564048767,
"learning_rate": 0.0,
"loss": 1.015,
"step": 318
},
{
"epoch": 1.0,
"eval_loss": 1.0362061262130737,
"eval_runtime": 856.4472,
"eval_samples_per_second": 29.006,
"eval_steps_per_second": 3.627,
"step": 318
}
],
"logging_steps": 1,
"max_steps": 318,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.9003005260988416e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}