Text Generation
Transformers
Safetensors
qwen3
conversational
text-generation-inference
Upload folder using huggingface_hub
f78bed5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 417,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007203962179198559,
"grad_norm": 6.173737525939941,
"learning_rate": 0.0,
"loss": 1.7414,
"step": 1
},
{
"epoch": 0.014407924358397118,
"grad_norm": 6.159329891204834,
"learning_rate": 2.3809523809523811e-07,
"loss": 1.7221,
"step": 2
},
{
"epoch": 0.021611886537595677,
"grad_norm": 6.446763038635254,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.7481,
"step": 3
},
{
"epoch": 0.028815848716794237,
"grad_norm": 6.09867000579834,
"learning_rate": 7.142857142857143e-07,
"loss": 1.7203,
"step": 4
},
{
"epoch": 0.03601981089599279,
"grad_norm": 6.325934886932373,
"learning_rate": 9.523809523809525e-07,
"loss": 1.7419,
"step": 5
},
{
"epoch": 0.04322377307519135,
"grad_norm": 6.029388904571533,
"learning_rate": 1.1904761904761906e-06,
"loss": 1.7401,
"step": 6
},
{
"epoch": 0.05042773525438991,
"grad_norm": 5.957275867462158,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.7205,
"step": 7
},
{
"epoch": 0.05763169743358847,
"grad_norm": 5.456174373626709,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.7044,
"step": 8
},
{
"epoch": 0.06483565961278703,
"grad_norm": 5.111164093017578,
"learning_rate": 1.904761904761905e-06,
"loss": 1.6829,
"step": 9
},
{
"epoch": 0.07203962179198559,
"grad_norm": 3.8403587341308594,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.6228,
"step": 10
},
{
"epoch": 0.07924358397118415,
"grad_norm": 3.6962971687316895,
"learning_rate": 2.380952380952381e-06,
"loss": 1.6107,
"step": 11
},
{
"epoch": 0.0864475461503827,
"grad_norm": 3.4146342277526855,
"learning_rate": 2.6190476190476192e-06,
"loss": 1.6153,
"step": 12
},
{
"epoch": 0.09365150832958127,
"grad_norm": 2.1463379859924316,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.5442,
"step": 13
},
{
"epoch": 0.10085547050877983,
"grad_norm": 2.0825576782226562,
"learning_rate": 3.0952380952380957e-06,
"loss": 1.5311,
"step": 14
},
{
"epoch": 0.1080594326879784,
"grad_norm": 1.9198007583618164,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.5017,
"step": 15
},
{
"epoch": 0.11526339486717695,
"grad_norm": 1.7519303560256958,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.4866,
"step": 16
},
{
"epoch": 0.12246735704637551,
"grad_norm": 1.6560941934585571,
"learning_rate": 3.80952380952381e-06,
"loss": 1.4904,
"step": 17
},
{
"epoch": 0.12967131922557407,
"grad_norm": 1.7502397298812866,
"learning_rate": 4.047619047619048e-06,
"loss": 1.447,
"step": 18
},
{
"epoch": 0.13687528140477262,
"grad_norm": 1.8956769704818726,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.4399,
"step": 19
},
{
"epoch": 0.14407924358397117,
"grad_norm": 1.8007680177688599,
"learning_rate": 4.523809523809524e-06,
"loss": 1.4126,
"step": 20
},
{
"epoch": 0.15128320576316975,
"grad_norm": 1.6545991897583008,
"learning_rate": 4.761904761904762e-06,
"loss": 1.4053,
"step": 21
},
{
"epoch": 0.1584871679423683,
"grad_norm": 1.3897682428359985,
"learning_rate": 5e-06,
"loss": 1.3917,
"step": 22
},
{
"epoch": 0.16569113012156686,
"grad_norm": 1.124558687210083,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.3667,
"step": 23
},
{
"epoch": 0.1728950923007654,
"grad_norm": 0.8316662907600403,
"learning_rate": 5.476190476190477e-06,
"loss": 1.3764,
"step": 24
},
{
"epoch": 0.180099054479964,
"grad_norm": 1.0051462650299072,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.3569,
"step": 25
},
{
"epoch": 0.18730301665916255,
"grad_norm": 1.0914835929870605,
"learning_rate": 5.9523809523809525e-06,
"loss": 1.3281,
"step": 26
},
{
"epoch": 0.1945069788383611,
"grad_norm": 1.0524057149887085,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.3194,
"step": 27
},
{
"epoch": 0.20171094101755965,
"grad_norm": 0.7890483736991882,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.2971,
"step": 28
},
{
"epoch": 0.2089149031967582,
"grad_norm": 0.6859455704689026,
"learning_rate": 6.666666666666667e-06,
"loss": 1.3046,
"step": 29
},
{
"epoch": 0.2161188653759568,
"grad_norm": 0.6448878645896912,
"learning_rate": 6.9047619047619055e-06,
"loss": 1.2594,
"step": 30
},
{
"epoch": 0.22332282755515534,
"grad_norm": 0.6465410590171814,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.269,
"step": 31
},
{
"epoch": 0.2305267897343539,
"grad_norm": 0.6410360336303711,
"learning_rate": 7.380952380952382e-06,
"loss": 1.2648,
"step": 32
},
{
"epoch": 0.23773075191355245,
"grad_norm": 0.6101223230361938,
"learning_rate": 7.61904761904762e-06,
"loss": 1.2401,
"step": 33
},
{
"epoch": 0.24493471409275103,
"grad_norm": 0.5768052339553833,
"learning_rate": 7.857142857142858e-06,
"loss": 1.2381,
"step": 34
},
{
"epoch": 0.2521386762719496,
"grad_norm": 0.5501332879066467,
"learning_rate": 8.095238095238097e-06,
"loss": 1.2576,
"step": 35
},
{
"epoch": 0.25934263845114813,
"grad_norm": 0.5579516291618347,
"learning_rate": 8.333333333333334e-06,
"loss": 1.2269,
"step": 36
},
{
"epoch": 0.2665466006303467,
"grad_norm": 0.5129104256629944,
"learning_rate": 8.571428571428571e-06,
"loss": 1.215,
"step": 37
},
{
"epoch": 0.27375056280954524,
"grad_norm": 0.45212557911872864,
"learning_rate": 8.80952380952381e-06,
"loss": 1.2273,
"step": 38
},
{
"epoch": 0.2809545249887438,
"grad_norm": 0.41491127014160156,
"learning_rate": 9.047619047619049e-06,
"loss": 1.2279,
"step": 39
},
{
"epoch": 0.28815848716794235,
"grad_norm": 0.4179735481739044,
"learning_rate": 9.285714285714288e-06,
"loss": 1.2341,
"step": 40
},
{
"epoch": 0.29536244934714095,
"grad_norm": 0.431852251291275,
"learning_rate": 9.523809523809525e-06,
"loss": 1.1875,
"step": 41
},
{
"epoch": 0.3025664115263395,
"grad_norm": 0.4219491183757782,
"learning_rate": 9.761904761904762e-06,
"loss": 1.2045,
"step": 42
},
{
"epoch": 0.30977037370553806,
"grad_norm": 0.37112221121788025,
"learning_rate": 1e-05,
"loss": 1.1965,
"step": 43
},
{
"epoch": 0.3169743358847366,
"grad_norm": 0.34050077199935913,
"learning_rate": 9.999824541392404e-06,
"loss": 1.1774,
"step": 44
},
{
"epoch": 0.32417829806393517,
"grad_norm": 0.32600072026252747,
"learning_rate": 9.999298177883902e-06,
"loss": 1.1969,
"step": 45
},
{
"epoch": 0.3313822602431337,
"grad_norm": 0.33435478806495667,
"learning_rate": 9.9984209464165e-06,
"loss": 1.1712,
"step": 46
},
{
"epoch": 0.3385862224223323,
"grad_norm": 0.32657337188720703,
"learning_rate": 9.997192908557322e-06,
"loss": 1.1872,
"step": 47
},
{
"epoch": 0.3457901846015308,
"grad_norm": 0.31742358207702637,
"learning_rate": 9.995614150494293e-06,
"loss": 1.1829,
"step": 48
},
{
"epoch": 0.3529941467807294,
"grad_norm": 0.27555903792381287,
"learning_rate": 9.99368478303009e-06,
"loss": 1.1683,
"step": 49
},
{
"epoch": 0.360198108959928,
"grad_norm": 0.26494070887565613,
"learning_rate": 9.99140494157436e-06,
"loss": 1.1746,
"step": 50
},
{
"epoch": 0.36740207113912654,
"grad_norm": 0.28536906838417053,
"learning_rate": 9.988774786134235e-06,
"loss": 1.1668,
"step": 51
},
{
"epoch": 0.3746060333183251,
"grad_norm": 0.2787037193775177,
"learning_rate": 9.98579450130307e-06,
"loss": 1.1793,
"step": 52
},
{
"epoch": 0.38180999549752365,
"grad_norm": 0.26992520689964294,
"learning_rate": 9.982464296247523e-06,
"loss": 1.1748,
"step": 53
},
{
"epoch": 0.3890139576767222,
"grad_norm": 0.2353690266609192,
"learning_rate": 9.978784404692847e-06,
"loss": 1.1559,
"step": 54
},
{
"epoch": 0.39621791985592075,
"grad_norm": 0.2522650957107544,
"learning_rate": 9.974755084906503e-06,
"loss": 1.1635,
"step": 55
},
{
"epoch": 0.4034218820351193,
"grad_norm": 0.2653239965438843,
"learning_rate": 9.970376619680024e-06,
"loss": 1.1715,
"step": 56
},
{
"epoch": 0.41062584421431786,
"grad_norm": 0.24647340178489685,
"learning_rate": 9.965649316309178e-06,
"loss": 1.1556,
"step": 57
},
{
"epoch": 0.4178298063935164,
"grad_norm": 0.2625945508480072,
"learning_rate": 9.960573506572391e-06,
"loss": 1.1361,
"step": 58
},
{
"epoch": 0.425033768572715,
"grad_norm": 0.223532035946846,
"learning_rate": 9.955149546707465e-06,
"loss": 1.1367,
"step": 59
},
{
"epoch": 0.4322377307519136,
"grad_norm": 0.2535472810268402,
"learning_rate": 9.94937781738658e-06,
"loss": 1.1551,
"step": 60
},
{
"epoch": 0.4394416929311121,
"grad_norm": 0.23278413712978363,
"learning_rate": 9.94325872368957e-06,
"loss": 1.1482,
"step": 61
},
{
"epoch": 0.4466456551103107,
"grad_norm": 0.2383483499288559,
"learning_rate": 9.936792695075502e-06,
"loss": 1.1389,
"step": 62
},
{
"epoch": 0.45384961728950923,
"grad_norm": 0.25417017936706543,
"learning_rate": 9.929980185352525e-06,
"loss": 1.145,
"step": 63
},
{
"epoch": 0.4610535794687078,
"grad_norm": 0.2187829613685608,
"learning_rate": 9.922821672646028e-06,
"loss": 1.1367,
"step": 64
},
{
"epoch": 0.46825754164790634,
"grad_norm": 0.2265864610671997,
"learning_rate": 9.915317659365078e-06,
"loss": 1.1353,
"step": 65
},
{
"epoch": 0.4754615038271049,
"grad_norm": 0.2197989523410797,
"learning_rate": 9.907468672167165e-06,
"loss": 1.1486,
"step": 66
},
{
"epoch": 0.48266546600630345,
"grad_norm": 0.224925234913826,
"learning_rate": 9.899275261921236e-06,
"loss": 1.1587,
"step": 67
},
{
"epoch": 0.48986942818550205,
"grad_norm": 0.2215765416622162,
"learning_rate": 9.890738003669029e-06,
"loss": 1.1294,
"step": 68
},
{
"epoch": 0.4970733903647006,
"grad_norm": 0.2000226378440857,
"learning_rate": 9.881857496584726e-06,
"loss": 1.1185,
"step": 69
},
{
"epoch": 0.5042773525438992,
"grad_norm": 0.21352718770503998,
"learning_rate": 9.872634363932887e-06,
"loss": 1.1372,
"step": 70
},
{
"epoch": 0.5114813147230977,
"grad_norm": 0.21923653781414032,
"learning_rate": 9.863069253024719e-06,
"loss": 1.1246,
"step": 71
},
{
"epoch": 0.5186852769022963,
"grad_norm": 0.2089078724384308,
"learning_rate": 9.853162835172638e-06,
"loss": 1.1313,
"step": 72
},
{
"epoch": 0.5258892390814949,
"grad_norm": 0.2270357608795166,
"learning_rate": 9.842915805643156e-06,
"loss": 1.1161,
"step": 73
},
{
"epoch": 0.5330932012606934,
"grad_norm": 0.2318277657032013,
"learning_rate": 9.832328883608088e-06,
"loss": 1.1108,
"step": 74
},
{
"epoch": 0.540297163439892,
"grad_norm": 0.19568417966365814,
"learning_rate": 9.821402812094074e-06,
"loss": 1.1373,
"step": 75
},
{
"epoch": 0.5475011256190905,
"grad_norm": 0.2250116616487503,
"learning_rate": 9.81013835793043e-06,
"loss": 1.1459,
"step": 76
},
{
"epoch": 0.5547050877982891,
"grad_norm": 0.2082553207874298,
"learning_rate": 9.798536311695334e-06,
"loss": 1.1298,
"step": 77
},
{
"epoch": 0.5619090499774876,
"grad_norm": 0.22250007092952728,
"learning_rate": 9.786597487660336e-06,
"loss": 1.1365,
"step": 78
},
{
"epoch": 0.5691130121566862,
"grad_norm": 0.19979779422283173,
"learning_rate": 9.774322723733216e-06,
"loss": 1.1233,
"step": 79
},
{
"epoch": 0.5763169743358847,
"grad_norm": 0.20838016271591187,
"learning_rate": 9.761712881399164e-06,
"loss": 1.1542,
"step": 80
},
{
"epoch": 0.5835209365150833,
"grad_norm": 0.20710162818431854,
"learning_rate": 9.748768845660335e-06,
"loss": 1.1318,
"step": 81
},
{
"epoch": 0.5907248986942819,
"grad_norm": 0.21674193441867828,
"learning_rate": 9.735491524973723e-06,
"loss": 1.0994,
"step": 82
},
{
"epoch": 0.5979288608734804,
"grad_norm": 0.20445245504379272,
"learning_rate": 9.721881851187406e-06,
"loss": 1.1342,
"step": 83
},
{
"epoch": 0.605132823052679,
"grad_norm": 0.20889800786972046,
"learning_rate": 9.707940779475151e-06,
"loss": 1.0988,
"step": 84
},
{
"epoch": 0.6123367852318775,
"grad_norm": 0.22214913368225098,
"learning_rate": 9.693669288269371e-06,
"loss": 1.1129,
"step": 85
},
{
"epoch": 0.6195407474110761,
"grad_norm": 0.22072745859622955,
"learning_rate": 9.679068379192455e-06,
"loss": 1.1,
"step": 86
},
{
"epoch": 0.6267447095902746,
"grad_norm": 0.22884686291217804,
"learning_rate": 9.664139076986473e-06,
"loss": 1.1257,
"step": 87
},
{
"epoch": 0.6339486717694732,
"grad_norm": 0.20036116242408752,
"learning_rate": 9.648882429441258e-06,
"loss": 1.1299,
"step": 88
},
{
"epoch": 0.6411526339486717,
"grad_norm": 0.24411164224147797,
"learning_rate": 9.633299507320862e-06,
"loss": 1.1071,
"step": 89
},
{
"epoch": 0.6483565961278703,
"grad_norm": 0.2171693742275238,
"learning_rate": 9.617391404288412e-06,
"loss": 1.1328,
"step": 90
},
{
"epoch": 0.6555605583070689,
"grad_norm": 0.2116158902645111,
"learning_rate": 9.601159236829353e-06,
"loss": 1.1227,
"step": 91
},
{
"epoch": 0.6627645204862674,
"grad_norm": 0.22783496975898743,
"learning_rate": 9.584604144173084e-06,
"loss": 1.0958,
"step": 92
},
{
"epoch": 0.669968482665466,
"grad_norm": 0.23521655797958374,
"learning_rate": 9.567727288213005e-06,
"loss": 1.1481,
"step": 93
},
{
"epoch": 0.6771724448446645,
"grad_norm": 0.22170735895633698,
"learning_rate": 9.550529853424979e-06,
"loss": 1.1099,
"step": 94
},
{
"epoch": 0.6843764070238632,
"grad_norm": 0.23990625143051147,
"learning_rate": 9.53301304678419e-06,
"loss": 1.1115,
"step": 95
},
{
"epoch": 0.6915803692030617,
"grad_norm": 0.22480110824108124,
"learning_rate": 9.515178097680437e-06,
"loss": 1.0993,
"step": 96
},
{
"epoch": 0.6987843313822603,
"grad_norm": 0.2508992552757263,
"learning_rate": 9.497026257831856e-06,
"loss": 1.1251,
"step": 97
},
{
"epoch": 0.7059882935614588,
"grad_norm": 0.25253668427467346,
"learning_rate": 9.478558801197065e-06,
"loss": 1.1121,
"step": 98
},
{
"epoch": 0.7131922557406574,
"grad_norm": 0.23142355680465698,
"learning_rate": 9.459777023885754e-06,
"loss": 1.136,
"step": 99
},
{
"epoch": 0.720396217919856,
"grad_norm": 0.2402750551700592,
"learning_rate": 9.440682244067724e-06,
"loss": 1.1049,
"step": 100
},
{
"epoch": 0.7276001800990545,
"grad_norm": 0.23145467042922974,
"learning_rate": 9.421275801880363e-06,
"loss": 1.096,
"step": 101
},
{
"epoch": 0.7348041422782531,
"grad_norm": 0.24229560792446136,
"learning_rate": 9.401559059334601e-06,
"loss": 1.1077,
"step": 102
},
{
"epoch": 0.7420081044574516,
"grad_norm": 0.2245631366968155,
"learning_rate": 9.381533400219319e-06,
"loss": 1.1078,
"step": 103
},
{
"epoch": 0.7492120666366502,
"grad_norm": 0.22122395038604736,
"learning_rate": 9.361200230004219e-06,
"loss": 1.1255,
"step": 104
},
{
"epoch": 0.7564160288158487,
"grad_norm": 0.2282179296016693,
"learning_rate": 9.340560975741198e-06,
"loss": 1.1343,
"step": 105
},
{
"epoch": 0.7636199909950473,
"grad_norm": 0.23190978169441223,
"learning_rate": 9.319617085964177e-06,
"loss": 1.1082,
"step": 106
},
{
"epoch": 0.7708239531742458,
"grad_norm": 0.24386624991893768,
"learning_rate": 9.298370030587456e-06,
"loss": 1.1096,
"step": 107
},
{
"epoch": 0.7780279153534444,
"grad_norm": 0.22043921053409576,
"learning_rate": 9.276821300802535e-06,
"loss": 1.0869,
"step": 108
},
{
"epoch": 0.785231877532643,
"grad_norm": 0.2251676470041275,
"learning_rate": 9.25497240897346e-06,
"loss": 1.1036,
"step": 109
},
{
"epoch": 0.7924358397118415,
"grad_norm": 0.23501946032047272,
"learning_rate": 9.232824888530689e-06,
"loss": 1.1151,
"step": 110
},
{
"epoch": 0.7996398018910401,
"grad_norm": 0.21661776304244995,
"learning_rate": 9.210380293863462e-06,
"loss": 1.1053,
"step": 111
},
{
"epoch": 0.8068437640702386,
"grad_norm": 0.22219465672969818,
"learning_rate": 9.18764020021071e-06,
"loss": 1.1176,
"step": 112
},
{
"epoch": 0.8140477262494372,
"grad_norm": 0.2121913731098175,
"learning_rate": 9.164606203550498e-06,
"loss": 1.1292,
"step": 113
},
{
"epoch": 0.8212516884286357,
"grad_norm": 0.21977250277996063,
"learning_rate": 9.141279920488021e-06,
"loss": 1.097,
"step": 114
},
{
"epoch": 0.8284556506078343,
"grad_norm": 0.2105371654033661,
"learning_rate": 9.117662988142138e-06,
"loss": 1.1161,
"step": 115
},
{
"epoch": 0.8356596127870328,
"grad_norm": 0.25086918473243713,
"learning_rate": 9.093757064030473e-06,
"loss": 1.1275,
"step": 116
},
{
"epoch": 0.8428635749662314,
"grad_norm": 0.20763848721981049,
"learning_rate": 9.069563825953092e-06,
"loss": 1.0862,
"step": 117
},
{
"epoch": 0.85006753714543,
"grad_norm": 0.24541738629341125,
"learning_rate": 9.045084971874738e-06,
"loss": 1.0928,
"step": 118
},
{
"epoch": 0.8572714993246285,
"grad_norm": 0.22416935861110687,
"learning_rate": 9.020322219805674e-06,
"loss": 1.1145,
"step": 119
},
{
"epoch": 0.8644754615038271,
"grad_norm": 0.2052609622478485,
"learning_rate": 8.9952773076811e-06,
"loss": 1.0615,
"step": 120
},
{
"epoch": 0.8716794236830256,
"grad_norm": 0.23820039629936218,
"learning_rate": 8.969951993239177e-06,
"loss": 1.1167,
"step": 121
},
{
"epoch": 0.8788833858622243,
"grad_norm": 0.24450096487998962,
"learning_rate": 8.944348053897672e-06,
"loss": 1.1331,
"step": 122
},
{
"epoch": 0.8860873480414228,
"grad_norm": 0.24509143829345703,
"learning_rate": 8.9184672866292e-06,
"loss": 1.0708,
"step": 123
},
{
"epoch": 0.8932913102206214,
"grad_norm": 0.23418590426445007,
"learning_rate": 8.892311507835118e-06,
"loss": 1.094,
"step": 124
},
{
"epoch": 0.9004952723998199,
"grad_norm": 0.23996609449386597,
"learning_rate": 8.865882553218036e-06,
"loss": 1.1309,
"step": 125
},
{
"epoch": 0.9076992345790185,
"grad_norm": 0.23122315108776093,
"learning_rate": 8.83918227765299e-06,
"loss": 1.105,
"step": 126
},
{
"epoch": 0.9149031967582171,
"grad_norm": 0.22732949256896973,
"learning_rate": 8.81221255505724e-06,
"loss": 1.092,
"step": 127
},
{
"epoch": 0.9221071589374156,
"grad_norm": 0.26016128063201904,
"learning_rate": 8.784975278258783e-06,
"loss": 1.1058,
"step": 128
},
{
"epoch": 0.9293111211166142,
"grad_norm": 0.2204255759716034,
"learning_rate": 8.757472358863481e-06,
"loss": 1.1294,
"step": 129
},
{
"epoch": 0.9365150832958127,
"grad_norm": 0.2520386278629303,
"learning_rate": 8.729705727120911e-06,
"loss": 1.0975,
"step": 130
},
{
"epoch": 0.9437190454750113,
"grad_norm": 0.24364745616912842,
"learning_rate": 8.701677331788891e-06,
"loss": 1.1135,
"step": 131
},
{
"epoch": 0.9509230076542098,
"grad_norm": 0.26485344767570496,
"learning_rate": 8.673389139996708e-06,
"loss": 1.0937,
"step": 132
},
{
"epoch": 0.9581269698334084,
"grad_norm": 0.23761332035064697,
"learning_rate": 8.644843137107058e-06,
"loss": 1.0834,
"step": 133
},
{
"epoch": 0.9653309320126069,
"grad_norm": 0.22497673332691193,
"learning_rate": 8.616041326576711e-06,
"loss": 1.1093,
"step": 134
},
{
"epoch": 0.9725348941918055,
"grad_norm": 0.23355747759342194,
"learning_rate": 8.586985729815895e-06,
"loss": 1.1207,
"step": 135
},
{
"epoch": 0.9797388563710041,
"grad_norm": 0.2490537166595459,
"learning_rate": 8.557678386046429e-06,
"loss": 1.079,
"step": 136
},
{
"epoch": 0.9869428185502026,
"grad_norm": 0.26007816195487976,
"learning_rate": 8.528121352158604e-06,
"loss": 1.1101,
"step": 137
},
{
"epoch": 0.9941467807294012,
"grad_norm": 0.23331965506076813,
"learning_rate": 8.498316702566828e-06,
"loss": 1.1167,
"step": 138
},
{
"epoch": 1.0,
"grad_norm": 0.23331965506076813,
"learning_rate": 8.468266529064025e-06,
"loss": 1.0964,
"step": 139
},
{
"epoch": 1.0072039621791986,
"grad_norm": 0.33547741174697876,
"learning_rate": 8.437972940674838e-06,
"loss": 1.07,
"step": 140
},
{
"epoch": 1.0144079243583972,
"grad_norm": 0.24112224578857422,
"learning_rate": 8.4074380635076e-06,
"loss": 1.0695,
"step": 141
},
{
"epoch": 1.0216118865375956,
"grad_norm": 0.27535709738731384,
"learning_rate": 8.376664040605122e-06,
"loss": 1.1001,
"step": 142
},
{
"epoch": 1.0288158487167942,
"grad_norm": 0.3114432394504547,
"learning_rate": 8.345653031794292e-06,
"loss": 1.0891,
"step": 143
},
{
"epoch": 1.0360198108959928,
"grad_norm": 0.24609720706939697,
"learning_rate": 8.314407213534477e-06,
"loss": 1.0843,
"step": 144
},
{
"epoch": 1.0432237730751914,
"grad_norm": 0.3140798807144165,
"learning_rate": 8.282928778764783e-06,
"loss": 1.0936,
"step": 145
},
{
"epoch": 1.0504277352543898,
"grad_norm": 0.2414146065711975,
"learning_rate": 8.251219936750145e-06,
"loss": 1.0705,
"step": 146
},
{
"epoch": 1.0576316974335884,
"grad_norm": 0.30057480931282043,
"learning_rate": 8.21928291292627e-06,
"loss": 1.092,
"step": 147
},
{
"epoch": 1.064835659612787,
"grad_norm": 0.25547441840171814,
"learning_rate": 8.18711994874345e-06,
"loss": 1.0512,
"step": 148
},
{
"epoch": 1.0720396217919856,
"grad_norm": 0.26846885681152344,
"learning_rate": 8.154733301509249e-06,
"loss": 1.0865,
"step": 149
},
{
"epoch": 1.0792435839711843,
"grad_norm": 0.30581358075141907,
"learning_rate": 8.12212524423008e-06,
"loss": 1.0674,
"step": 150
},
{
"epoch": 1.0864475461503826,
"grad_norm": 0.2594411373138428,
"learning_rate": 8.089298065451673e-06,
"loss": 1.0779,
"step": 151
},
{
"epoch": 1.0936515083295812,
"grad_norm": 0.27017942070961,
"learning_rate": 8.05625406909846e-06,
"loss": 1.0922,
"step": 152
},
{
"epoch": 1.1008554705087799,
"grad_norm": 0.2681860327720642,
"learning_rate": 8.022995574311876e-06,
"loss": 1.0632,
"step": 153
},
{
"epoch": 1.1080594326879785,
"grad_norm": 0.24879907071590424,
"learning_rate": 7.989524915287595e-06,
"loss": 1.0773,
"step": 154
},
{
"epoch": 1.1152633948671768,
"grad_norm": 0.28538674116134644,
"learning_rate": 7.95584444111171e-06,
"loss": 1.0853,
"step": 155
},
{
"epoch": 1.1224673570463755,
"grad_norm": 0.2812231779098511,
"learning_rate": 7.921956515595861e-06,
"loss": 1.083,
"step": 156
},
{
"epoch": 1.129671319225574,
"grad_norm": 0.26302483677864075,
"learning_rate": 7.887863517111337e-06,
"loss": 1.0749,
"step": 157
},
{
"epoch": 1.1368752814047727,
"grad_norm": 0.26634806394577026,
"learning_rate": 7.85356783842216e-06,
"loss": 1.093,
"step": 158
},
{
"epoch": 1.1440792435839713,
"grad_norm": 0.27896663546562195,
"learning_rate": 7.819071886517134e-06,
"loss": 1.0736,
"step": 159
},
{
"epoch": 1.1512832057631697,
"grad_norm": 0.2650260031223297,
"learning_rate": 7.78437808244094e-06,
"loss": 1.0683,
"step": 160
},
{
"epoch": 1.1584871679423683,
"grad_norm": 0.2808700203895569,
"learning_rate": 7.7494888611242e-06,
"loss": 1.0312,
"step": 161
},
{
"epoch": 1.1656911301215669,
"grad_norm": 0.2390362024307251,
"learning_rate": 7.714406671212589e-06,
"loss": 1.0757,
"step": 162
},
{
"epoch": 1.1728950923007655,
"grad_norm": 0.25637757778167725,
"learning_rate": 7.679133974894984e-06,
"loss": 1.0633,
"step": 163
},
{
"epoch": 1.1800990544799639,
"grad_norm": 0.2605026662349701,
"learning_rate": 7.64367324773066e-06,
"loss": 1.0942,
"step": 164
},
{
"epoch": 1.1873030166591625,
"grad_norm": 0.23692801594734192,
"learning_rate": 7.6080269784755405e-06,
"loss": 1.0863,
"step": 165
},
{
"epoch": 1.194506978838361,
"grad_norm": 0.2812054455280304,
"learning_rate": 7.572197668907533e-06,
"loss": 1.102,
"step": 166
},
{
"epoch": 1.2017109410175597,
"grad_norm": 0.25642886757850647,
"learning_rate": 7.536187833650947e-06,
"loss": 1.076,
"step": 167
},
{
"epoch": 1.2089149031967583,
"grad_norm": 0.2664526402950287,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0877,
"step": 168
},
{
"epoch": 1.2161188653759567,
"grad_norm": 0.25459641218185425,
"learning_rate": 7.463636707741458e-06,
"loss": 1.0798,
"step": 169
},
{
"epoch": 1.2233228275551553,
"grad_norm": 0.2598586082458496,
"learning_rate": 7.42710050897637e-06,
"loss": 1.0774,
"step": 170
},
{
"epoch": 1.230526789734354,
"grad_norm": 0.26702508330345154,
"learning_rate": 7.390393967940962e-06,
"loss": 1.092,
"step": 171
},
{
"epoch": 1.2377307519135525,
"grad_norm": 0.2823182940483093,
"learning_rate": 7.353519660826665e-06,
"loss": 1.0816,
"step": 172
},
{
"epoch": 1.244934714092751,
"grad_norm": 0.23282551765441895,
"learning_rate": 7.31648017559931e-06,
"loss": 1.0691,
"step": 173
},
{
"epoch": 1.2521386762719495,
"grad_norm": 0.2649790346622467,
"learning_rate": 7.279278111817502e-06,
"loss": 1.033,
"step": 174
},
{
"epoch": 1.2593426384511481,
"grad_norm": 0.23375588655471802,
"learning_rate": 7.241916080450163e-06,
"loss": 1.0749,
"step": 175
},
{
"epoch": 1.2665466006303467,
"grad_norm": 0.2711394727230072,
"learning_rate": 7.2043967036932935e-06,
"loss": 1.0416,
"step": 176
},
{
"epoch": 1.2737505628095454,
"grad_norm": 0.24989663064479828,
"learning_rate": 7.166722614785937e-06,
"loss": 1.0743,
"step": 177
},
{
"epoch": 1.2809545249887437,
"grad_norm": 0.23100513219833374,
"learning_rate": 7.128896457825364e-06,
"loss": 1.0769,
"step": 178
},
{
"epoch": 1.2881584871679423,
"grad_norm": 0.26965799927711487,
"learning_rate": 7.090920887581507e-06,
"loss": 1.0739,
"step": 179
},
{
"epoch": 1.295362449347141,
"grad_norm": 0.2137940227985382,
"learning_rate": 7.052798569310641e-06,
"loss": 1.0872,
"step": 180
},
{
"epoch": 1.3025664115263396,
"grad_norm": 0.2575233280658722,
"learning_rate": 7.014532178568314e-06,
"loss": 1.0558,
"step": 181
},
{
"epoch": 1.309770373705538,
"grad_norm": 0.22105364501476288,
"learning_rate": 6.976124401021583e-06,
"loss": 1.0484,
"step": 182
},
{
"epoch": 1.3169743358847366,
"grad_norm": 0.2336052805185318,
"learning_rate": 6.9375779322605154e-06,
"loss": 1.0805,
"step": 183
},
{
"epoch": 1.3241782980639352,
"grad_norm": 0.23742294311523438,
"learning_rate": 6.898895477609007e-06,
"loss": 1.0862,
"step": 184
},
{
"epoch": 1.3313822602431338,
"grad_norm": 0.258781373500824,
"learning_rate": 6.860079751934908e-06,
"loss": 1.0739,
"step": 185
},
{
"epoch": 1.3385862224223324,
"grad_norm": 0.2347659021615982,
"learning_rate": 6.821133479459492e-06,
"loss": 1.0877,
"step": 186
},
{
"epoch": 1.3457901846015308,
"grad_norm": 0.24011020362377167,
"learning_rate": 6.782059393566254e-06,
"loss": 1.0676,
"step": 187
},
{
"epoch": 1.3529941467807294,
"grad_norm": 0.30535200238227844,
"learning_rate": 6.7428602366090764e-06,
"loss": 1.0809,
"step": 188
},
{
"epoch": 1.360198108959928,
"grad_norm": 0.23181499540805817,
"learning_rate": 6.70353875971976e-06,
"loss": 1.0809,
"step": 189
},
{
"epoch": 1.3674020711391266,
"grad_norm": 0.20072412490844727,
"learning_rate": 6.664097722614934e-06,
"loss": 1.0607,
"step": 190
},
{
"epoch": 1.374606033318325,
"grad_norm": 0.22480317950248718,
"learning_rate": 6.624539893402383e-06,
"loss": 1.0745,
"step": 191
},
{
"epoch": 1.3818099954975236,
"grad_norm": 0.22832217812538147,
"learning_rate": 6.58486804838676e-06,
"loss": 1.0856,
"step": 192
},
{
"epoch": 1.3890139576767222,
"grad_norm": 0.24673670530319214,
"learning_rate": 6.545084971874738e-06,
"loss": 1.0905,
"step": 193
},
{
"epoch": 1.3962179198559208,
"grad_norm": 0.24316424131393433,
"learning_rate": 6.505193455979603e-06,
"loss": 1.0795,
"step": 194
},
{
"epoch": 1.4034218820351194,
"grad_norm": 0.23344801366329193,
"learning_rate": 6.465196300425287e-06,
"loss": 1.0879,
"step": 195
},
{
"epoch": 1.4106258442143178,
"grad_norm": 0.2587188482284546,
"learning_rate": 6.425096312349881e-06,
"loss": 1.082,
"step": 196
},
{
"epoch": 1.4178298063935164,
"grad_norm": 0.23247972130775452,
"learning_rate": 6.384896306108612e-06,
"loss": 1.0677,
"step": 197
},
{
"epoch": 1.425033768572715,
"grad_norm": 0.2605839669704437,
"learning_rate": 6.344599103076329e-06,
"loss": 1.0615,
"step": 198
},
{
"epoch": 1.4322377307519136,
"grad_norm": 0.22571925818920135,
"learning_rate": 6.304207531449486e-06,
"loss": 1.0821,
"step": 199
},
{
"epoch": 1.439441692931112,
"grad_norm": 0.27554214000701904,
"learning_rate": 6.2637244260476474e-06,
"loss": 1.0659,
"step": 200
},
{
"epoch": 1.4466456551103106,
"grad_norm": 0.24706511199474335,
"learning_rate": 6.223152628114537e-06,
"loss": 1.0664,
"step": 201
},
{
"epoch": 1.4538496172895092,
"grad_norm": 0.24783776700496674,
"learning_rate": 6.182494985118625e-06,
"loss": 1.0548,
"step": 202
},
{
"epoch": 1.4610535794687078,
"grad_norm": 0.24173712730407715,
"learning_rate": 6.141754350553279e-06,
"loss": 1.0647,
"step": 203
},
{
"epoch": 1.4682575416479065,
"grad_norm": 0.2548038363456726,
"learning_rate": 6.100933583736508e-06,
"loss": 1.0656,
"step": 204
},
{
"epoch": 1.4754615038271048,
"grad_norm": 0.23962704837322235,
"learning_rate": 6.060035549610275e-06,
"loss": 1.0756,
"step": 205
},
{
"epoch": 1.4826654660063034,
"grad_norm": 0.23094172775745392,
"learning_rate": 6.019063118539425e-06,
"loss": 1.063,
"step": 206
},
{
"epoch": 1.489869428185502,
"grad_norm": 0.24982015788555145,
"learning_rate": 5.978019166110242e-06,
"loss": 1.0805,
"step": 207
},
{
"epoch": 1.4970733903647007,
"grad_norm": 0.23258346319198608,
"learning_rate": 5.936906572928625e-06,
"loss": 1.0595,
"step": 208
},
{
"epoch": 1.504277352543899,
"grad_norm": 0.23061244189739227,
"learning_rate": 5.8957282244179125e-06,
"loss": 1.082,
"step": 209
},
{
"epoch": 1.5114813147230977,
"grad_norm": 0.23120225965976715,
"learning_rate": 5.854487010616384e-06,
"loss": 1.0831,
"step": 210
},
{
"epoch": 1.5186852769022963,
"grad_norm": 0.24363379180431366,
"learning_rate": 5.813185825974419e-06,
"loss": 1.1031,
"step": 211
},
{
"epoch": 1.5258892390814949,
"grad_norm": 0.246430441737175,
"learning_rate": 5.771827569151357e-06,
"loss": 1.0902,
"step": 212
},
{
"epoch": 1.5330932012606935,
"grad_norm": 0.24487066268920898,
"learning_rate": 5.730415142812059e-06,
"loss": 1.0521,
"step": 213
},
{
"epoch": 1.540297163439892,
"grad_norm": 0.2253061830997467,
"learning_rate": 5.68895145342319e-06,
"loss": 1.0843,
"step": 214
},
{
"epoch": 1.5475011256190905,
"grad_norm": 0.22573482990264893,
"learning_rate": 5.647439411049235e-06,
"loss": 1.068,
"step": 215
},
{
"epoch": 1.554705087798289,
"grad_norm": 0.24459558725357056,
"learning_rate": 5.605881929148254e-06,
"loss": 1.0707,
"step": 216
},
{
"epoch": 1.5619090499774875,
"grad_norm": 0.2182885855436325,
"learning_rate": 5.5642819243674085e-06,
"loss": 1.0446,
"step": 217
},
{
"epoch": 1.569113012156686,
"grad_norm": 0.21815907955169678,
"learning_rate": 5.522642316338268e-06,
"loss": 1.0404,
"step": 218
},
{
"epoch": 1.5763169743358847,
"grad_norm": 0.22507762908935547,
"learning_rate": 5.480966027471889e-06,
"loss": 1.0527,
"step": 219
},
{
"epoch": 1.5835209365150833,
"grad_norm": 0.2513904869556427,
"learning_rate": 5.439255982753717e-06,
"loss": 1.0503,
"step": 220
},
{
"epoch": 1.590724898694282,
"grad_norm": 0.20894253253936768,
"learning_rate": 5.3975151095383e-06,
"loss": 1.0765,
"step": 221
},
{
"epoch": 1.5979288608734805,
"grad_norm": 0.2522652745246887,
"learning_rate": 5.355746337343835e-06,
"loss": 1.0855,
"step": 222
},
{
"epoch": 1.6051328230526791,
"grad_norm": 0.2368040829896927,
"learning_rate": 5.3139525976465675e-06,
"loss": 1.0654,
"step": 223
},
{
"epoch": 1.6123367852318775,
"grad_norm": 0.2553313672542572,
"learning_rate": 5.272136823675046e-06,
"loss": 1.073,
"step": 224
},
{
"epoch": 1.6195407474110761,
"grad_norm": 0.23547260463237762,
"learning_rate": 5.230301950204261e-06,
"loss": 1.0681,
"step": 225
},
{
"epoch": 1.6267447095902745,
"grad_norm": 0.2587689757347107,
"learning_rate": 5.188450913349674e-06,
"loss": 1.0603,
"step": 226
},
{
"epoch": 1.6339486717694731,
"grad_norm": 0.2566240429878235,
"learning_rate": 5.146586650361143e-06,
"loss": 1.0576,
"step": 227
},
{
"epoch": 1.6411526339486717,
"grad_norm": 0.23041875660419464,
"learning_rate": 5.1047120994167855e-06,
"loss": 1.0694,
"step": 228
},
{
"epoch": 1.6483565961278703,
"grad_norm": 0.24431072175502777,
"learning_rate": 5.062830199416764e-06,
"loss": 1.0616,
"step": 229
},
{
"epoch": 1.655560558307069,
"grad_norm": 0.21524447202682495,
"learning_rate": 5.0209438897770205e-06,
"loss": 1.065,
"step": 230
},
{
"epoch": 1.6627645204862675,
"grad_norm": 0.24770976603031158,
"learning_rate": 4.979056110222982e-06,
"loss": 1.0548,
"step": 231
},
{
"epoch": 1.6699684826654662,
"grad_norm": 0.2506217062473297,
"learning_rate": 4.937169800583237e-06,
"loss": 1.0779,
"step": 232
},
{
"epoch": 1.6771724448446645,
"grad_norm": 0.246966153383255,
"learning_rate": 4.895287900583216e-06,
"loss": 1.0487,
"step": 233
},
{
"epoch": 1.6843764070238632,
"grad_norm": 0.23613645136356354,
"learning_rate": 4.853413349638859e-06,
"loss": 1.0649,
"step": 234
},
{
"epoch": 1.6915803692030615,
"grad_norm": 0.24523551762104034,
"learning_rate": 4.811549086650327e-06,
"loss": 1.0652,
"step": 235
},
{
"epoch": 1.6987843313822601,
"grad_norm": 0.22116148471832275,
"learning_rate": 4.769698049795739e-06,
"loss": 1.0589,
"step": 236
},
{
"epoch": 1.7059882935614588,
"grad_norm": 0.2507432997226715,
"learning_rate": 4.727863176324955e-06,
"loss": 1.0257,
"step": 237
},
{
"epoch": 1.7131922557406574,
"grad_norm": 0.2185191810131073,
"learning_rate": 4.686047402353433e-06,
"loss": 1.0596,
"step": 238
},
{
"epoch": 1.720396217919856,
"grad_norm": 0.2304830700159073,
"learning_rate": 4.644253662656167e-06,
"loss": 1.0531,
"step": 239
},
{
"epoch": 1.7276001800990546,
"grad_norm": 0.25681596994400024,
"learning_rate": 4.602484890461702e-06,
"loss": 1.0719,
"step": 240
},
{
"epoch": 1.7348041422782532,
"grad_norm": 0.23360076546669006,
"learning_rate": 4.560744017246284e-06,
"loss": 1.0593,
"step": 241
},
{
"epoch": 1.7420081044574516,
"grad_norm": 0.23592416942119598,
"learning_rate": 4.519033972528114e-06,
"loss": 1.0583,
"step": 242
},
{
"epoch": 1.7492120666366502,
"grad_norm": 0.23926499485969543,
"learning_rate": 4.477357683661734e-06,
"loss": 1.0778,
"step": 243
},
{
"epoch": 1.7564160288158486,
"grad_norm": 0.23426315188407898,
"learning_rate": 4.4357180756325915e-06,
"loss": 1.0469,
"step": 244
},
{
"epoch": 1.7636199909950472,
"grad_norm": 0.21399535238742828,
"learning_rate": 4.394118070851749e-06,
"loss": 1.036,
"step": 245
},
{
"epoch": 1.7708239531742458,
"grad_norm": 0.25305458903312683,
"learning_rate": 4.352560588950766e-06,
"loss": 1.0811,
"step": 246
},
{
"epoch": 1.7780279153534444,
"grad_norm": 0.23678237199783325,
"learning_rate": 4.31104854657681e-06,
"loss": 1.0563,
"step": 247
},
{
"epoch": 1.785231877532643,
"grad_norm": 0.22029711306095123,
"learning_rate": 4.269584857187942e-06,
"loss": 1.0634,
"step": 248
},
{
"epoch": 1.7924358397118416,
"grad_norm": 0.21898190677165985,
"learning_rate": 4.228172430848645e-06,
"loss": 1.0609,
"step": 249
},
{
"epoch": 1.7996398018910402,
"grad_norm": 0.22045612335205078,
"learning_rate": 4.186814174025582e-06,
"loss": 1.0483,
"step": 250
},
{
"epoch": 1.8068437640702386,
"grad_norm": 0.2106829732656479,
"learning_rate": 4.145512989383618e-06,
"loss": 1.0443,
"step": 251
},
{
"epoch": 1.8140477262494372,
"grad_norm": 0.2186656892299652,
"learning_rate": 4.104271775582089e-06,
"loss": 1.0346,
"step": 252
},
{
"epoch": 1.8212516884286356,
"grad_norm": 0.23272953927516937,
"learning_rate": 4.063093427071376e-06,
"loss": 1.0576,
"step": 253
},
{
"epoch": 1.8284556506078342,
"grad_norm": 0.23906800150871277,
"learning_rate": 4.02198083388976e-06,
"loss": 1.0428,
"step": 254
},
{
"epoch": 1.8356596127870328,
"grad_norm": 0.22326606512069702,
"learning_rate": 3.980936881460576e-06,
"loss": 1.0751,
"step": 255
},
{
"epoch": 1.8428635749662314,
"grad_norm": 0.2465428113937378,
"learning_rate": 3.939964450389728e-06,
"loss": 1.064,
"step": 256
},
{
"epoch": 1.85006753714543,
"grad_norm": 0.25662410259246826,
"learning_rate": 3.899066416263493e-06,
"loss": 1.0593,
"step": 257
},
{
"epoch": 1.8572714993246286,
"grad_norm": 0.22539134323596954,
"learning_rate": 3.8582456494467214e-06,
"loss": 1.0588,
"step": 258
},
{
"epoch": 1.8644754615038273,
"grad_norm": 0.2232930064201355,
"learning_rate": 3.817505014881378e-06,
"loss": 1.0399,
"step": 259
},
{
"epoch": 1.8716794236830256,
"grad_norm": 0.24547068774700165,
"learning_rate": 3.776847371885464e-06,
"loss": 1.0477,
"step": 260
},
{
"epoch": 1.8788833858622243,
"grad_norm": 0.2386842668056488,
"learning_rate": 3.736275573952354e-06,
"loss": 1.0538,
"step": 261
},
{
"epoch": 1.8860873480414226,
"grad_norm": 0.23360906541347504,
"learning_rate": 3.695792468550517e-06,
"loss": 1.0455,
"step": 262
},
{
"epoch": 1.8932913102206212,
"grad_norm": 0.22610723972320557,
"learning_rate": 3.655400896923672e-06,
"loss": 1.0779,
"step": 263
},
{
"epoch": 1.9004952723998199,
"grad_norm": 0.2285996526479721,
"learning_rate": 3.6151036938913887e-06,
"loss": 1.0672,
"step": 264
},
{
"epoch": 1.9076992345790185,
"grad_norm": 0.2220553308725357,
"learning_rate": 3.5749036876501196e-06,
"loss": 1.0876,
"step": 265
},
{
"epoch": 1.914903196758217,
"grad_norm": 0.2081567645072937,
"learning_rate": 3.5348036995747135e-06,
"loss": 1.0844,
"step": 266
},
{
"epoch": 1.9221071589374157,
"grad_norm": 0.21750611066818237,
"learning_rate": 3.4948065440203982e-06,
"loss": 1.0582,
"step": 267
},
{
"epoch": 1.9293111211166143,
"grad_norm": 0.20807136595249176,
"learning_rate": 3.4549150281252635e-06,
"loss": 1.0738,
"step": 268
},
{
"epoch": 1.9365150832958127,
"grad_norm": 0.21424974501132965,
"learning_rate": 3.4151319516132414e-06,
"loss": 1.0293,
"step": 269
},
{
"epoch": 1.9437190454750113,
"grad_norm": 0.2395254671573639,
"learning_rate": 3.375460106597619e-06,
"loss": 1.0624,
"step": 270
},
{
"epoch": 1.9509230076542097,
"grad_norm": 0.23250122368335724,
"learning_rate": 3.3359022773850673e-06,
"loss": 1.0406,
"step": 271
},
{
"epoch": 1.9581269698334083,
"grad_norm": 0.20610974729061127,
"learning_rate": 3.2964612402802422e-06,
"loss": 1.0673,
"step": 272
},
{
"epoch": 1.965330932012607,
"grad_norm": 0.2716231346130371,
"learning_rate": 3.2571397633909252e-06,
"loss": 1.0312,
"step": 273
},
{
"epoch": 1.9725348941918055,
"grad_norm": 0.23458805680274963,
"learning_rate": 3.217940606433747e-06,
"loss": 1.0442,
"step": 274
},
{
"epoch": 1.979738856371004,
"grad_norm": 0.21636377274990082,
"learning_rate": 3.178866520540509e-06,
"loss": 1.0448,
"step": 275
},
{
"epoch": 1.9869428185502027,
"grad_norm": 0.2635723054409027,
"learning_rate": 3.139920248065095e-06,
"loss": 1.0657,
"step": 276
},
{
"epoch": 1.9941467807294013,
"grad_norm": 0.20790298283100128,
"learning_rate": 3.1011045223909954e-06,
"loss": 1.0635,
"step": 277
},
{
"epoch": 2.0,
"grad_norm": 0.22311536967754364,
"learning_rate": 3.0624220677394854e-06,
"loss": 1.0679,
"step": 278
},
{
"epoch": 2.0072039621791986,
"grad_norm": 0.2736155092716217,
"learning_rate": 3.023875598978419e-06,
"loss": 1.0501,
"step": 279
},
{
"epoch": 2.014407924358397,
"grad_norm": 0.22263078391551971,
"learning_rate": 2.9854678214316875e-06,
"loss": 1.0339,
"step": 280
},
{
"epoch": 2.021611886537596,
"grad_norm": 0.21312211453914642,
"learning_rate": 2.9472014306893605e-06,
"loss": 1.0475,
"step": 281
},
{
"epoch": 2.0288158487167944,
"grad_norm": 0.2075333446264267,
"learning_rate": 2.9090791124184934e-06,
"loss": 1.0658,
"step": 282
},
{
"epoch": 2.0360198108959926,
"grad_norm": 0.21481953561306,
"learning_rate": 2.871103542174637e-06,
"loss": 1.0638,
"step": 283
},
{
"epoch": 2.043223773075191,
"grad_norm": 0.21095344424247742,
"learning_rate": 2.8332773852140644e-06,
"loss": 1.0372,
"step": 284
},
{
"epoch": 2.05042773525439,
"grad_norm": 0.23806796967983246,
"learning_rate": 2.795603296306708e-06,
"loss": 1.0547,
"step": 285
},
{
"epoch": 2.0576316974335884,
"grad_norm": 0.22928300499916077,
"learning_rate": 2.7580839195498397e-06,
"loss": 1.044,
"step": 286
},
{
"epoch": 2.064835659612787,
"grad_norm": 0.20792540907859802,
"learning_rate": 2.7207218881825016e-06,
"loss": 1.0486,
"step": 287
},
{
"epoch": 2.0720396217919856,
"grad_norm": 0.21220359206199646,
"learning_rate": 2.683519824400693e-06,
"loss": 1.0735,
"step": 288
},
{
"epoch": 2.0792435839711843,
"grad_norm": 0.21435540914535522,
"learning_rate": 2.646480339173337e-06,
"loss": 1.0422,
"step": 289
},
{
"epoch": 2.086447546150383,
"grad_norm": 0.20643557608127594,
"learning_rate": 2.6096060320590393e-06,
"loss": 1.0268,
"step": 290
},
{
"epoch": 2.0936515083295815,
"grad_norm": 0.2074732780456543,
"learning_rate": 2.5728994910236304e-06,
"loss": 1.0434,
"step": 291
},
{
"epoch": 2.1008554705087796,
"grad_norm": 0.22091282904148102,
"learning_rate": 2.536363292258543e-06,
"loss": 1.048,
"step": 292
},
{
"epoch": 2.1080594326879782,
"grad_norm": 0.2222498208284378,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.0387,
"step": 293
},
{
"epoch": 2.115263394867177,
"grad_norm": 0.2226988971233368,
"learning_rate": 2.4638121663490546e-06,
"loss": 1.0144,
"step": 294
},
{
"epoch": 2.1224673570463755,
"grad_norm": 0.20479106903076172,
"learning_rate": 2.4278023310924676e-06,
"loss": 1.0411,
"step": 295
},
{
"epoch": 2.129671319225574,
"grad_norm": 0.20795980095863342,
"learning_rate": 2.391973021524461e-06,
"loss": 1.0469,
"step": 296
},
{
"epoch": 2.1368752814047727,
"grad_norm": 0.1892288625240326,
"learning_rate": 2.356326752269342e-06,
"loss": 1.0543,
"step": 297
},
{
"epoch": 2.1440792435839713,
"grad_norm": 0.21468913555145264,
"learning_rate": 2.320866025105016e-06,
"loss": 1.0356,
"step": 298
},
{
"epoch": 2.15128320576317,
"grad_norm": 0.19955170154571533,
"learning_rate": 2.285593328787414e-06,
"loss": 1.0236,
"step": 299
},
{
"epoch": 2.1584871679423685,
"grad_norm": 0.2040010690689087,
"learning_rate": 2.250511138875801e-06,
"loss": 1.0398,
"step": 300
},
{
"epoch": 2.1656911301215667,
"grad_norm": 0.2120560258626938,
"learning_rate": 2.2156219175590623e-06,
"loss": 1.05,
"step": 301
},
{
"epoch": 2.1728950923007653,
"grad_norm": 0.21219217777252197,
"learning_rate": 2.1809281134828663e-06,
"loss": 1.0505,
"step": 302
},
{
"epoch": 2.180099054479964,
"grad_norm": 0.22002087533473969,
"learning_rate": 2.146432161577842e-06,
"loss": 1.0316,
"step": 303
},
{
"epoch": 2.1873030166591625,
"grad_norm": 0.2064754068851471,
"learning_rate": 2.112136482888663e-06,
"loss": 1.0318,
"step": 304
},
{
"epoch": 2.194506978838361,
"grad_norm": 0.21059390902519226,
"learning_rate": 2.07804348440414e-06,
"loss": 1.046,
"step": 305
},
{
"epoch": 2.2017109410175597,
"grad_norm": 0.2134746015071869,
"learning_rate": 2.04415555888829e-06,
"loss": 1.0578,
"step": 306
},
{
"epoch": 2.2089149031967583,
"grad_norm": 0.19525548815727234,
"learning_rate": 2.0104750847124075e-06,
"loss": 1.0484,
"step": 307
},
{
"epoch": 2.216118865375957,
"grad_norm": 0.19859297573566437,
"learning_rate": 1.977004425688126e-06,
"loss": 1.0266,
"step": 308
},
{
"epoch": 2.2233228275551555,
"grad_norm": 0.2262914627790451,
"learning_rate": 1.9437459309015426e-06,
"loss": 1.0691,
"step": 309
},
{
"epoch": 2.2305267897343537,
"grad_norm": 0.2108326107263565,
"learning_rate": 1.910701934548329e-06,
"loss": 1.0429,
"step": 310
},
{
"epoch": 2.2377307519135523,
"grad_norm": 0.21613864600658417,
"learning_rate": 1.8778747557699223e-06,
"loss": 1.0604,
"step": 311
},
{
"epoch": 2.244934714092751,
"grad_norm": 0.19557908177375793,
"learning_rate": 1.8452666984907519e-06,
"loss": 1.0558,
"step": 312
},
{
"epoch": 2.2521386762719495,
"grad_norm": 0.19418083131313324,
"learning_rate": 1.8128800512565514e-06,
"loss": 1.0507,
"step": 313
},
{
"epoch": 2.259342638451148,
"grad_norm": 0.1997075378894806,
"learning_rate": 1.7807170870737317e-06,
"loss": 1.0338,
"step": 314
},
{
"epoch": 2.2665466006303467,
"grad_norm": 0.20641541481018066,
"learning_rate": 1.7487800632498547e-06,
"loss": 1.0297,
"step": 315
},
{
"epoch": 2.2737505628095454,
"grad_norm": 0.18877221643924713,
"learning_rate": 1.7170712212352187e-06,
"loss": 1.0564,
"step": 316
},
{
"epoch": 2.280954524988744,
"grad_norm": 0.20154546201229095,
"learning_rate": 1.6855927864655241e-06,
"loss": 1.0268,
"step": 317
},
{
"epoch": 2.2881584871679426,
"grad_norm": 0.2065822035074234,
"learning_rate": 1.6543469682057105e-06,
"loss": 1.06,
"step": 318
},
{
"epoch": 2.2953624493471407,
"grad_norm": 0.19270487129688263,
"learning_rate": 1.6233359593948777e-06,
"loss": 1.0753,
"step": 319
},
{
"epoch": 2.3025664115263393,
"grad_norm": 0.19982437789440155,
"learning_rate": 1.5925619364924016e-06,
"loss": 1.0346,
"step": 320
},
{
"epoch": 2.309770373705538,
"grad_norm": 0.20062725245952606,
"learning_rate": 1.5620270593251635e-06,
"loss": 1.0228,
"step": 321
},
{
"epoch": 2.3169743358847366,
"grad_norm": 0.18592402338981628,
"learning_rate": 1.531733470935976e-06,
"loss": 1.0677,
"step": 322
},
{
"epoch": 2.324178298063935,
"grad_norm": 0.20376469194889069,
"learning_rate": 1.5016832974331725e-06,
"loss": 1.0468,
"step": 323
},
{
"epoch": 2.3313822602431338,
"grad_norm": 0.20351487398147583,
"learning_rate": 1.4718786478413983e-06,
"loss": 1.0707,
"step": 324
},
{
"epoch": 2.3385862224223324,
"grad_norm": 0.21062229573726654,
"learning_rate": 1.4423216139535735e-06,
"loss": 1.0609,
"step": 325
},
{
"epoch": 2.345790184601531,
"grad_norm": 0.20279563963413239,
"learning_rate": 1.4130142701841076e-06,
"loss": 1.0263,
"step": 326
},
{
"epoch": 2.3529941467807296,
"grad_norm": 0.1935521960258484,
"learning_rate": 1.3839586734232907e-06,
"loss": 1.0262,
"step": 327
},
{
"epoch": 2.3601981089599278,
"grad_norm": 0.2124936282634735,
"learning_rate": 1.3551568628929434e-06,
"loss": 1.0125,
"step": 328
},
{
"epoch": 2.3674020711391264,
"grad_norm": 0.19464442133903503,
"learning_rate": 1.3266108600032928e-06,
"loss": 1.0478,
"step": 329
},
{
"epoch": 2.374606033318325,
"grad_norm": 0.19089345633983612,
"learning_rate": 1.2983226682111094e-06,
"loss": 1.0116,
"step": 330
},
{
"epoch": 2.3818099954975236,
"grad_norm": 0.19948884844779968,
"learning_rate": 1.2702942728790897e-06,
"loss": 1.0635,
"step": 331
},
{
"epoch": 2.389013957676722,
"grad_norm": 0.19687066972255707,
"learning_rate": 1.24252764113652e-06,
"loss": 1.0264,
"step": 332
},
{
"epoch": 2.396217919855921,
"grad_norm": 0.2047269642353058,
"learning_rate": 1.2150247217412186e-06,
"loss": 1.0241,
"step": 333
},
{
"epoch": 2.4034218820351194,
"grad_norm": 0.19504639506340027,
"learning_rate": 1.18778744494276e-06,
"loss": 1.0271,
"step": 334
},
{
"epoch": 2.410625844214318,
"grad_norm": 0.18415555357933044,
"learning_rate": 1.160817722347014e-06,
"loss": 1.0322,
"step": 335
},
{
"epoch": 2.4178298063935166,
"grad_norm": 0.18974873423576355,
"learning_rate": 1.1341174467819637e-06,
"loss": 1.0191,
"step": 336
},
{
"epoch": 2.425033768572715,
"grad_norm": 0.18910925090312958,
"learning_rate": 1.1076884921648834e-06,
"loss": 1.0632,
"step": 337
},
{
"epoch": 2.4322377307519134,
"grad_norm": 0.180083766579628,
"learning_rate": 1.0815327133708015e-06,
"loss": 1.0412,
"step": 338
},
{
"epoch": 2.439441692931112,
"grad_norm": 0.18438957631587982,
"learning_rate": 1.0556519461023301e-06,
"loss": 1.0345,
"step": 339
},
{
"epoch": 2.4466456551103106,
"grad_norm": 0.1945696324110031,
"learning_rate": 1.0300480067608232e-06,
"loss": 1.0305,
"step": 340
},
{
"epoch": 2.4538496172895092,
"grad_norm": 0.20218323171138763,
"learning_rate": 1.0047226923189024e-06,
"loss": 1.0532,
"step": 341
},
{
"epoch": 2.461053579468708,
"grad_norm": 0.18660913407802582,
"learning_rate": 9.79677780194327e-07,
"loss": 1.0248,
"step": 342
},
{
"epoch": 2.4682575416479065,
"grad_norm": 0.20080283284187317,
"learning_rate": 9.549150281252633e-07,
"loss": 1.0266,
"step": 343
},
{
"epoch": 2.475461503827105,
"grad_norm": 0.17952297627925873,
"learning_rate": 9.304361740469103e-07,
"loss": 1.0285,
"step": 344
},
{
"epoch": 2.4826654660063037,
"grad_norm": 0.19411954283714294,
"learning_rate": 9.06242935969528e-07,
"loss": 1.0362,
"step": 345
},
{
"epoch": 2.489869428185502,
"grad_norm": 0.19213198125362396,
"learning_rate": 8.823370118578628e-07,
"loss": 1.0304,
"step": 346
},
{
"epoch": 2.4970733903647004,
"grad_norm": 0.1971171349287033,
"learning_rate": 8.587200795119793e-07,
"loss": 1.0621,
"step": 347
},
{
"epoch": 2.504277352543899,
"grad_norm": 0.2063921093940735,
"learning_rate": 8.353937964495029e-07,
"loss": 1.0198,
"step": 348
},
{
"epoch": 2.5114813147230977,
"grad_norm": 0.18766029179096222,
"learning_rate": 8.123597997892918e-07,
"loss": 1.0231,
"step": 349
},
{
"epoch": 2.5186852769022963,
"grad_norm": 0.17498192191123962,
"learning_rate": 7.89619706136539e-07,
"loss": 1.0438,
"step": 350
},
{
"epoch": 2.525889239081495,
"grad_norm": 0.18453362584114075,
"learning_rate": 7.671751114693104e-07,
"loss": 1.0313,
"step": 351
},
{
"epoch": 2.5330932012606935,
"grad_norm": 0.17940948903560638,
"learning_rate": 7.450275910265415e-07,
"loss": 1.0169,
"step": 352
},
{
"epoch": 2.540297163439892,
"grad_norm": 0.1889266073703766,
"learning_rate": 7.23178699197467e-07,
"loss": 1.0317,
"step": 353
},
{
"epoch": 2.5475011256190907,
"grad_norm": 0.17744366824626923,
"learning_rate": 7.01629969412545e-07,
"loss": 1.0466,
"step": 354
},
{
"epoch": 2.554705087798289,
"grad_norm": 0.1801387220621109,
"learning_rate": 6.803829140358237e-07,
"loss": 1.0414,
"step": 355
},
{
"epoch": 2.5619090499774875,
"grad_norm": 0.19586600363254547,
"learning_rate": 6.594390242588044e-07,
"loss": 1.0464,
"step": 356
},
{
"epoch": 2.569113012156686,
"grad_norm": 0.19039765000343323,
"learning_rate": 6.387997699957815e-07,
"loss": 1.0275,
"step": 357
},
{
"epoch": 2.5763169743358847,
"grad_norm": 0.18481585383415222,
"learning_rate": 6.184665997806832e-07,
"loss": 1.0298,
"step": 358
},
{
"epoch": 2.5835209365150833,
"grad_norm": 0.19624339044094086,
"learning_rate": 5.98440940665399e-07,
"loss": 1.0538,
"step": 359
},
{
"epoch": 2.590724898694282,
"grad_norm": 0.18870003521442413,
"learning_rate": 5.787241981196384e-07,
"loss": 1.0454,
"step": 360
},
{
"epoch": 2.5979288608734805,
"grad_norm": 0.1834286004304886,
"learning_rate": 5.593177559322776e-07,
"loss": 0.9966,
"step": 361
},
{
"epoch": 2.605132823052679,
"grad_norm": 0.19677117466926575,
"learning_rate": 5.402229761142464e-07,
"loss": 1.0752,
"step": 362
},
{
"epoch": 2.6123367852318777,
"grad_norm": 0.18339574337005615,
"learning_rate": 5.214411988029355e-07,
"loss": 1.0525,
"step": 363
},
{
"epoch": 2.619540747411076,
"grad_norm": 0.17459526658058167,
"learning_rate": 5.029737421681446e-07,
"loss": 1.0209,
"step": 364
},
{
"epoch": 2.6267447095902745,
"grad_norm": 0.18108704686164856,
"learning_rate": 4.848219023195644e-07,
"loss": 1.053,
"step": 365
},
{
"epoch": 2.633948671769473,
"grad_norm": 0.19006100296974182,
"learning_rate": 4.6698695321581165e-07,
"loss": 1.0343,
"step": 366
},
{
"epoch": 2.6411526339486717,
"grad_norm": 0.19221577048301697,
"learning_rate": 4.494701465750217e-07,
"loss": 1.0217,
"step": 367
},
{
"epoch": 2.6483565961278703,
"grad_norm": 0.1743839979171753,
"learning_rate": 4.322727117869951e-07,
"loss": 1.0542,
"step": 368
},
{
"epoch": 2.655560558307069,
"grad_norm": 0.19367291033267975,
"learning_rate": 4.153958558269189e-07,
"loss": 1.0319,
"step": 369
},
{
"epoch": 2.6627645204862675,
"grad_norm": 0.1787528395652771,
"learning_rate": 3.9884076317064813e-07,
"loss": 1.0489,
"step": 370
},
{
"epoch": 2.669968482665466,
"grad_norm": 0.1872800886631012,
"learning_rate": 3.8260859571158883e-07,
"loss": 1.0282,
"step": 371
},
{
"epoch": 2.6771724448446648,
"grad_norm": 0.188198521733284,
"learning_rate": 3.6670049267913954e-07,
"loss": 1.0347,
"step": 372
},
{
"epoch": 2.684376407023863,
"grad_norm": 0.17867067456245422,
"learning_rate": 3.511175705587433e-07,
"loss": 1.0329,
"step": 373
},
{
"epoch": 2.6915803692030615,
"grad_norm": 0.1834060102701187,
"learning_rate": 3.358609230135268e-07,
"loss": 1.0427,
"step": 374
},
{
"epoch": 2.69878433138226,
"grad_norm": 0.18759381771087646,
"learning_rate": 3.2093162080754634e-07,
"loss": 1.0221,
"step": 375
},
{
"epoch": 2.7059882935614588,
"grad_norm": 0.1826649308204651,
"learning_rate": 3.0633071173062966e-07,
"loss": 1.0362,
"step": 376
},
{
"epoch": 2.7131922557406574,
"grad_norm": 0.1856626272201538,
"learning_rate": 2.920592205248496e-07,
"loss": 1.0376,
"step": 377
},
{
"epoch": 2.720396217919856,
"grad_norm": 0.1898716390132904,
"learning_rate": 2.7811814881259503e-07,
"loss": 1.0424,
"step": 378
},
{
"epoch": 2.7276001800990546,
"grad_norm": 0.18418952822685242,
"learning_rate": 2.6450847502627883e-07,
"loss": 1.049,
"step": 379
},
{
"epoch": 2.734804142278253,
"grad_norm": 0.17716997861862183,
"learning_rate": 2.5123115433966615e-07,
"loss": 1.055,
"step": 380
},
{
"epoch": 2.742008104457452,
"grad_norm": 0.17998386919498444,
"learning_rate": 2.3828711860083676e-07,
"loss": 1.0457,
"step": 381
},
{
"epoch": 2.74921206663665,
"grad_norm": 0.17547675967216492,
"learning_rate": 2.2567727626678527e-07,
"loss": 1.0483,
"step": 382
},
{
"epoch": 2.7564160288158486,
"grad_norm": 0.17643079161643982,
"learning_rate": 2.134025123396638e-07,
"loss": 1.0344,
"step": 383
},
{
"epoch": 2.763619990995047,
"grad_norm": 0.16483494639396667,
"learning_rate": 2.0146368830466668e-07,
"loss": 1.0752,
"step": 384
},
{
"epoch": 2.770823953174246,
"grad_norm": 0.17367129027843475,
"learning_rate": 1.8986164206957037e-07,
"loss": 1.0487,
"step": 385
},
{
"epoch": 2.7780279153534444,
"grad_norm": 0.17891699075698853,
"learning_rate": 1.785971879059273e-07,
"loss": 1.0087,
"step": 386
},
{
"epoch": 2.785231877532643,
"grad_norm": 0.19728310406208038,
"learning_rate": 1.6767111639191202e-07,
"loss": 1.0226,
"step": 387
},
{
"epoch": 2.7924358397118416,
"grad_norm": 0.17195719480514526,
"learning_rate": 1.5708419435684463e-07,
"loss": 1.0392,
"step": 388
},
{
"epoch": 2.7996398018910402,
"grad_norm": 0.18537551164627075,
"learning_rate": 1.4683716482736364e-07,
"loss": 1.038,
"step": 389
},
{
"epoch": 2.806843764070239,
"grad_norm": 0.191681370139122,
"learning_rate": 1.3693074697528231e-07,
"loss": 1.0431,
"step": 390
},
{
"epoch": 2.814047726249437,
"grad_norm": 0.16845721006393433,
"learning_rate": 1.2736563606711384e-07,
"loss": 1.02,
"step": 391
},
{
"epoch": 2.8212516884286356,
"grad_norm": 0.176877960562706,
"learning_rate": 1.1814250341527611e-07,
"loss": 1.0547,
"step": 392
},
{
"epoch": 2.828455650607834,
"grad_norm": 0.1708219051361084,
"learning_rate": 1.0926199633097156e-07,
"loss": 1.0143,
"step": 393
},
{
"epoch": 2.835659612787033,
"grad_norm": 0.17549683153629303,
"learning_rate": 1.007247380787657e-07,
"loss": 1.0222,
"step": 394
},
{
"epoch": 2.8428635749662314,
"grad_norm": 0.18442362546920776,
"learning_rate": 9.253132783283548e-08,
"loss": 1.0487,
"step": 395
},
{
"epoch": 2.85006753714543,
"grad_norm": 0.17564083635807037,
"learning_rate": 8.468234063492287e-08,
"loss": 1.0169,
"step": 396
},
{
"epoch": 2.8572714993246286,
"grad_norm": 0.17153891921043396,
"learning_rate": 7.717832735397335e-08,
"loss": 1.0532,
"step": 397
},
{
"epoch": 2.8644754615038273,
"grad_norm": 0.1801021248102188,
"learning_rate": 7.001981464747565e-08,
"loss": 1.0294,
"step": 398
},
{
"epoch": 2.871679423683026,
"grad_norm": 0.18023864924907684,
"learning_rate": 6.3207304924498e-08,
"loss": 1.0518,
"step": 399
},
{
"epoch": 2.878883385862224,
"grad_norm": 0.17345793545246124,
"learning_rate": 5.674127631043025e-08,
"loss": 1.0301,
"step": 400
},
{
"epoch": 2.8860873480414226,
"grad_norm": 0.18773488700389862,
"learning_rate": 5.062218261342122e-08,
"loss": 1.0307,
"step": 401
},
{
"epoch": 2.8932913102206212,
"grad_norm": 0.1761646866798401,
"learning_rate": 4.485045329253646e-08,
"loss": 1.0221,
"step": 402
},
{
"epoch": 2.90049527239982,
"grad_norm": 0.1819511353969574,
"learning_rate": 3.9426493427611177e-08,
"loss": 1.0565,
"step": 403
},
{
"epoch": 2.9076992345790185,
"grad_norm": 0.17573249340057373,
"learning_rate": 3.435068369082306e-08,
"loss": 1.0432,
"step": 404
},
{
"epoch": 2.914903196758217,
"grad_norm": 0.1772117167711258,
"learning_rate": 2.9623380319976912e-08,
"loss": 1.039,
"step": 405
},
{
"epoch": 2.9221071589374157,
"grad_norm": 0.17637501657009125,
"learning_rate": 2.5244915093499134e-08,
"loss": 1.0398,
"step": 406
},
{
"epoch": 2.9293111211166143,
"grad_norm": 0.16873933374881744,
"learning_rate": 2.1215595307154667e-08,
"loss": 1.0196,
"step": 407
},
{
"epoch": 2.936515083295813,
"grad_norm": 0.1777278184890747,
"learning_rate": 1.753570375247815e-08,
"loss": 1.0373,
"step": 408
},
{
"epoch": 2.943719045475011,
"grad_norm": 0.1736447662115097,
"learning_rate": 1.4205498696930332e-08,
"loss": 1.0593,
"step": 409
},
{
"epoch": 2.9509230076542097,
"grad_norm": 0.17402058839797974,
"learning_rate": 1.1225213865767026e-08,
"loss": 1.0174,
"step": 410
},
{
"epoch": 2.9581269698334083,
"grad_norm": 0.16966642439365387,
"learning_rate": 8.595058425640012e-09,
"loss": 1.0114,
"step": 411
},
{
"epoch": 2.965330932012607,
"grad_norm": 0.17315024137496948,
"learning_rate": 6.315216969912663e-09,
"loss": 1.0558,
"step": 412
},
{
"epoch": 2.9725348941918055,
"grad_norm": 0.1718152016401291,
"learning_rate": 4.385849505708084e-09,
"loss": 1.058,
"step": 413
},
{
"epoch": 2.979738856371004,
"grad_norm": 0.1746918112039566,
"learning_rate": 2.8070914426786555e-09,
"loss": 1.0505,
"step": 414
},
{
"epoch": 2.9869428185502027,
"grad_norm": 0.18175256252288818,
"learning_rate": 1.5790535835003006e-09,
"loss": 1.0555,
"step": 415
},
{
"epoch": 2.9941467807294013,
"grad_norm": 0.177822545170784,
"learning_rate": 7.018221160981498e-10,
"loss": 1.0346,
"step": 416
},
{
"epoch": 3.0,
"grad_norm": 0.18819527328014374,
"learning_rate": 1.7545860759693446e-10,
"loss": 0.9961,
"step": 417
}
],
"logging_steps": 1,
"max_steps": 417,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2226454674800640.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
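
Each entry in log_history above records the optimizer step, training loss, learning rate, and gradient norm, logged every step (logging_steps = 1). A minimal sketch, assuming the file has been downloaded locally as trainer_state.json and that matplotlib is installed, of reading the log and plotting the loss curve and learning-rate schedule:

    # Sketch: parse trainer_state.json and plot loss and learning rate per step.
    # Assumes the file sits in the current directory; matplotlib is an extra dependency.
    import json

    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:
        state = json.load(f)

    log = state["log_history"]
    steps = [entry["step"] for entry in log]
    losses = [entry["loss"] for entry in log]
    lrs = [entry["learning_rate"] for entry in log]

    fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
    ax_loss.plot(steps, losses)
    ax_loss.set_xlabel("step")
    ax_loss.set_ylabel("training loss")
    ax_lr.plot(steps, lrs)
    ax_lr.set_xlabel("step")
    ax_lr.set_ylabel("learning rate")
    fig.tight_layout()
    plt.show()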